/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorData.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkColor_opts_neon.h"
#include <arm_neon.h>

/* Blends eight byte channels (two 32-bit pixels) with constant scales:
 *
 *     result = (src * srcScale + dst * dstScale) >> 8      (per byte)
 *
 * The source bytes are widened to 16 bits before the multiply; the
 * destination bytes go through a widening 8x8 -> 16 bit multiply, so
 * dstScale is narrowed to 8 bits when it is splatted.
 */
static inline uint8x8_t blend_const_scale_u8x8(uint8x8_t srcBytes, uint8x8_t dstBytes,
                                               uint16_t srcScale, uint16_t dstScale) {
    const uint16x8_t srcTerm = vmulq_u16(vmovl_u8(srcBytes), vdupq_n_u16(srcScale));
    const uint16x8_t dstTerm = vmull_u8(dstBytes, vdup_n_u8(dstScale));

    // Sum both contributions and keep the high byte of every 16-bit lane.
    return vshrn_n_u16(vaddq_u16(dstTerm, srcTerm), 8);
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 *
 * Blends `count` pixels from `src` into `dst` in place. The source weight is
 * SkAlpha255To256(alpha) and the destination weight is 256 minus that value.
 * Does nothing when count <= 0.
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    const uint16_t srcScale = SkAlpha255To256(alpha);
    const uint16_t dstScale = 256 - srcScale;

    /* Main loop: two pixels per iteration.
     *
     * Note kept from the original author: prefetching src+32 and dst+32 here
     * is a big win for count values > 64 on an A9 (Pandaboard) but hurts by
     * 10% for count = 4, and also hurts a little (<5%) on an A15. It is
     * therefore left disabled.
     */
    for (; count >= 2; count -= 2, src += 2, dst += 2) {
        const uint8x8_t srcBytes = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t dstBytes = vreinterpret_u8_u32(vld1_u32(dst));

        const uint8x8_t blended =
                blend_const_scale_u8x8(srcBytes, dstBytes, srcScale, dstScale);

        vst1_u32(dst, vreinterpret_u32_u8(blended));
    }

    // Tail: one pixel left. Load it into lane 0 of a zeroed vector, run the
    // same arithmetic, and store only lane 0 back.
    if (count == 1) {
        const uint8x8_t srcBytes =
                vreinterpret_u8_u32(vld1_lane_u32(src, vdup_n_u32(0), 0));
        const uint8x8_t dstBytes =
                vreinterpret_u8_u32(vld1_lane_u32(dst, vdup_n_u32(0), 0));

        const uint8x8_t blended =
                blend_const_scale_u8x8(srcBytes, dstBytes, srcScale, dstScale);

        vst1_lane_u32(dst, vreinterpret_u32_u8(blended), 0);
    }
}
#ifdef SK_CPU_ARM32 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha) { SkASSERT(255 > alpha); if (count <= 0) { return; } unsigned alpha256 = SkAlpha255To256(alpha); // First deal with odd counts if (count & 1) { uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; uint16x8_t vdst_wide, vsrc_wide; unsigned dst_scale; // Load vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); // Calc dst_scale dst_scale = vget_lane_u8(vsrc, 3); dst_scale = SkAlphaMulInv256(dst_scale, alpha256); // Process src vsrc_wide = vmovl_u8(vsrc); vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); // Process dst vdst_wide = vmovl_u8(vdst); vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); // Combine vdst_wide += vsrc_wide; vres = vshrn_n_u16(vdst_wide, 8); vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); dst++; src++; count--; } if (count) { uint8x8_t alpha_mask; static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; alpha_mask = vld1_u8(alpha_mask_setup); do { uint8x8_t vsrc, vdst, vres, vsrc_alphas; uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; __builtin_prefetch(src+32); __builtin_prefetch(dst+32); // Load vsrc = vreinterpret_u8_u32(vld1_u32(src)); vdst = vreinterpret_u8_u32(vld1_u32(dst)); // Prepare src_scale vsrc_scale = vdupq_n_u16(alpha256); // Calc dst_scale vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); vdst_scale = vmovl_u8(vsrc_alphas); // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale). // A 16-bit lane would overflow if we used 0xFFFF here, // so use an approximation with 0xFF00 that is off by 1, // and add back 1 after to get the correct value. // This is valid if alpha256 <= 255. 
vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale); vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8); vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8); // Process src vsrc_wide = vmovl_u8(vsrc); vsrc_wide *= vsrc_scale; // Process dst vdst_wide = vmovl_u8(vdst); vdst_wide *= vdst_scale; // Combine vdst_wide += vsrc_wide; vres = vshrn_n_u16(vdst_wide, 8); vst1_u32(dst, vreinterpret_u32_u8(vres)); src += 2; dst += 2; count -= 2; } while(count); } } /////////////////////////////////////////////////////////////////////////////// #endif // #ifdef SK_CPU_ARM32 /////////////////////////////////////////////////////////////////////////////// const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { nullptr, // S32_Opaque, S32_Blend_BlitRow32_neon, // S32_Blend, nullptr, // Ported to SkOpts #ifdef SK_CPU_ARM32 S32A_Blend_BlitRow32_neon // S32A_Blend #else nullptr #endif };