// NEON matrix-proc fragment.
//
// This text is written to be #include'd by a translation unit that has
// already defined the macros it consumes and that it #undef's again at the
// bottom: MAKENAME, TILEX_PROCF / TILEY_PROCF, TILEX_LOW_BITS /
// TILEY_LOW_BITS, the *_NEON4 / *_NEON8 vector counterparts, and optionally
// CHECK_FOR_DECAL and the PREAMBLE family.  It emits six static matrix procs
// and a table, MAKENAME(_Procs), that lists them.
// NOTE(review): presumably included once per tile mode -- confirm against
// the including .cpp file.

#include <arm_neon.h>

#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)
#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)

#define PACK_FILTER_X_NAME      MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME      MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME     MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME     MAKENAME(_pack_filter_y4)

// When the includer supplies no PREAMBLE, every hook expands to nothing.
#ifndef PREAMBLE
    #define PREAMBLE(state)
    #define PREAMBLE_PARAM_X
    #define PREAMBLE_PARAM_Y
    #define PREAMBLE_ARG_X
    #define PREAMBLE_ARG_Y
#endif

// Scale/translate matrix, no filtering.
//
// Output layout in xy[]: one 32-bit tiled Y value, followed by `count`
// 16-bit tiled X values.
//   s      proc state (inverse matrix, bitmap, fractional steps)
//   xy     destination buffer
//   count  number of X coordinates to produce
//   x, y   device-space position of the first sample
static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
                                uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);

    PREAMBLE(s);

    // we store y, x, x, x, x, x
    const unsigned maxX = s.fBitmap->width() - 1;
    SkFractionalInt fx;
    {
        SkPoint pt;
        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
        // fx briefly carries the Y coordinate so the single Y entry can be
        // emitted before it is re-seeded with X.
        fx = SkScalarToFractionalInt(pt.fY);
        const unsigned maxY = s.fBitmap->height() - 1;
        *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY);
        fx = SkScalarToFractionalInt(pt.fX);
    }

    if (0 == maxX) {
        // all of the following X values must be 0
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    const SkFractionalInt dx = s.fInvSxFractionalInt;

#ifdef CHECK_FOR_DECAL
    // test if we don't need to apply the tile proc
    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
        decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
                                  SkFractionalIntToFixed(dx), count);
        return;
    }
#endif

    if (count >= 8) {
        const SkFractionalInt dx2 = dx + dx;
        const SkFractionalInt dx4 = dx2 + dx2;
        const SkFractionalInt dx8 = dx4 + dx4;

        // now build fx/fx+dx/fx+2dx/fx+3dx
        const SkFractionalInt fx1 = fx + dx;
        const SkFractionalInt fx2 = fx1 + dx;
        const SkFractionalInt fx3 = fx2 + dx;

        int16_t* dst16 = (int16_t*)xy;

        int32x4_t lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
        lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
        lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
        lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
        // hbase holds samples 4..7 of the current group of eight
        int32x4_t hbase = vaddq_s32(lbase,
                                    vdupq_n_s32(SkFractionalIntToFixed(dx4)));

        // loop-invariant per-iteration advance (eight samples)
        const int32x4_t step8 = vdupq_n_s32(SkFractionalIntToFixed(dx8));

        // store & bump
        while (count >= 8) {
            const int16x8_t fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
            vst1q_s16(dst16, fx8);

            // but preserving base & on to the next
            lbase = vaddq_s32(lbase, step8);
            hbase = vaddq_s32(hbase, step8);
            dst16 += 8;
            count -= 8;
            // keep the scalar position in step for the tail loop below
            fx += dx8;
        }
        xy = (uint32_t*)dst16;
    }

    // scalar tail: fewer than eight X values remain
    uint16_t* xx = (uint16_t*)xy;
    for (int i = count; i > 0; --i) {
        *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
        fx += dx;
    }
}

// Affine matrix, no filtering.
//
// Each output 32-bit entry is (tiledY << 16) | tiledX; `count` entries are
// written.  The vector path produces the same layout by interleaving 16-bit
// X and Y lanes with vst2q_s16 (X first, then Y).
static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
                                 uint32_t xy[], int count, int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask |
                             SkMatrix::kAffine_Mask)) == 0);

    PREAMBLE(s);
    SkPoint srcPt;
    s.fInvProc(s.fInvMatrix,
               SkIntToScalar(x) + SK_ScalarHalf,
               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);

    SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX);
    SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY);
    SkFractionalInt dx = s.fInvSxFractionalInt;
    SkFractionalInt dy = s.fInvKyFractionalInt;
    int maxX = s.fBitmap->width() - 1;
    int maxY = s.fBitmap->height() - 1;

    if (count >= 8) {
        const SkFractionalInt dx4 = dx * 4;
        const SkFractionalInt dy4 = dy * 4;
        const SkFractionalInt dx8 = dx * 8;
        const SkFractionalInt dy8 = dy * 8;

        int16_t* dst16 = (int16_t*)xy;

        // now build fx, fx+dx, fx+2dx, fx+3dx
        int32x4_t xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
        xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
        xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
        xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);

        // same for fy
        int32x4_t ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
        ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
        ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
        ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);

        // second half (samples 4..7) of each group of eight
        int32x4_t x2base = vaddq_s32(xbase,
                                     vdupq_n_s32(SkFractionalIntToFixed(dx4)));
        int32x4_t y2base = vaddq_s32(ybase,
                                     vdupq_n_s32(SkFractionalIntToFixed(dy4)));

        // loop-invariant per-iteration advances
        const int32x4_t xstep8 = vdupq_n_s32(SkFractionalIntToFixed(dx8));
        const int32x4_t ystep8 = vdupq_n_s32(SkFractionalIntToFixed(dy8));

        // store & bump
        do {
            int16x8x2_t hi16;

            hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
            hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);

            vst2q_s16(dst16, hi16);

            // moving base and on to the next
            xbase  = vaddq_s32(xbase,  xstep8);
            ybase  = vaddq_s32(ybase,  ystep8);
            x2base = vaddq_s32(x2base, xstep8);
            y2base = vaddq_s32(y2base, ystep8);

            dst16 += 16; // 8x32 aka 16x16
            count -= 8;
            fx += dx8;
            fy += dy8;
        } while (count >= 8);
        xy = (uint32_t*)dst16;
    }

    // scalar tail
    for (int i = count; i > 0; --i) {
        *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
                 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
        fx += dx;
        fy += dy;
    }
}

// Perspective matrix, no filtering.
//
// Source coordinates come from SkPerspIter in batches as interleaved
// x,y SkFixed pairs; each output 32-bit entry is (tiledY << 16) | tiledX.
static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
                                uint32_t* SK_RESTRICT xy,
                                int count, int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);

    PREAMBLE(s);
    // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
    int maxX = s.fBitmap->width() - 1;
    int maxY = s.fBitmap->height() - 1;

    SkPerspIter iter(s.fInvMatrix,
                     SkIntToScalar(x) + SK_ScalarHalf,
                     SkIntToScalar(y) + SK_ScalarHalf, count);

    // iter.next() yields the size of the next batch; 0 ends the walk
    while ((count = iter.next()) != 0) {
        const SkFixed* SK_RESTRICT srcXY = iter.getXY();

        if (count >= 8) {
            // the source is only read, so keep it const through the cast
            const int32_t* mysrc = (const int32_t*)srcXY;
            int16_t* mydst = (int16_t*)xy;
            do {
                int16x8x2_t hi16;
                int32x4x2_t xy1, xy2;

                // de-interleave 8 x,y pairs: val[0] = x lanes, val[1] = y lanes
                xy1 = vld2q_s32(mysrc);
                xy2 = vld2q_s32(mysrc + 8);

                hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
                hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);

                vst2q_s16(mydst, hi16);

                count -= 8;  // 8 iterations
                mysrc += 16; // 16 longs
                mydst += 16; // 16 shorts, aka 8 longs
            } while (count >= 8);
            // get xy and srcXY fixed up
            srcXY = (const SkFixed*)mysrc;
            xy = (uint32_t*)mydst;
        }

        // scalar tail of this batch
        while (--count >= 0) {
            *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
                     TILEX_PROCF(srcXY[0], maxX);
            srcXY += 2;
        }
    }
}

// Packs one filtered Y coordinate into 32 bits as
//   (tile(f) << 18) | (low_bits(f) << 14) | tile(f + one)
// (the shifts below are <<4 then <<14).
// NOTE(review): assumes the tiled values fit in 14 bits and the low bits in
// 4 -- the macros that produce them are not visible here.
static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
                                          SkFixed one PREAMBLE_PARAM_Y) {
    unsigned i = TILEY_PROCF(f, max);
    i = (i << 4) | TILEY_LOW_BITS(f, max);
    return (i << 14) | (TILEY_PROCF((f + one), max));
}

// X counterpart of PACK_FILTER_Y_NAME; same bit layout, X tile macros.
static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
                                          SkFixed one PREAMBLE_PARAM_X) {
    unsigned i = TILEX_PROCF(f, max);
    i = (i << 4) | TILEX_LOW_BITS(f, max);
    return (i << 14) | (TILEX_PROCF((f + one), max));
}

// Four-lane version of PACK_FILTER_X_NAME: packs each lane of f with the
// same three steps as the scalar helper.
static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
                                            SkFixed one PREAMBLE_PARAM_X) {
    int32x4_t ret, res;

    // Prepare constants
    const int32x4_t wide_one = vdupq_n_s32(one);

    // Step 1
    res = TILEX_PROCF_NEON4(f, max);

    // Step 2: shift-left-insert keeps the low 4 bits of ret and places
    // (res << 4) above them
    ret = TILEX_LOW_BITS_NEON4(f, max);
    ret = vsliq_n_s32(ret, res, 4);

    // Step 3
    res = TILEX_PROCF_NEON4(vaddq_s32(f, wide_one), max);
    ret = vorrq_s32(vshlq_n_s32(ret, 14), res);

    return ret;
}

// Four-lane version of PACK_FILTER_Y_NAME.
//
// The trailing parameter hook is PREAMBLE_PARAM_Y, matching both the scalar
// Y helper and the PREAMBLE_ARG_Y that every call site passes.
static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
                                            SkFixed one PREAMBLE_PARAM_Y) {
    int32x4_t ret, res;

    // Prepare constants
    const int32x4_t wide_one = vdupq_n_s32(one);

    // Step 1
    res = TILEY_PROCF_NEON4(f, max);

    // Step 2: shift-left-insert keeps the low 4 bits of ret and places
    // (res << 4) above them
    ret = TILEY_LOW_BITS_NEON4(f, max);
    ret = vsliq_n_s32(ret, res, 4);

    // Step 3
    res = TILEY_PROCF_NEON4(vaddq_s32(f, wide_one), max);
    ret = vorrq_s32(vshlq_n_s32(ret, 14), res);

    return ret;
}

// Scale/translate matrix, with filtering.
//
// Output layout in xy[]: one packed Y entry (see PACK_FILTER_Y_NAME)
// followed by `count` packed X entries, all 32 bits wide.
static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
                              uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);
    SkASSERT(s.fInvKy == 0);

    PREAMBLE(s);

    const unsigned maxX = s.fBitmap->width() - 1;
    const SkFixed one = s.fFilterOneX;
    const SkFractionalInt dx = s.fInvSxFractionalInt;
    SkFractionalInt fx;

    {
        SkPoint pt;
        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
        const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
        const unsigned maxY = s.fBitmap->height() - 1;
        // compute our two Y values up front
        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
        // now initialize fx
        fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
    }

#ifdef CHECK_FOR_DECAL
    // test if we don't need to apply the tile proc
    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
        decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
                                SkFractionalIntToFixed(dx), count);
        return;
    }
#endif

    if (count >= 4) {
        int32x4_t wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);

        // loop-invariant advance of four samples
        const SkFractionalInt dx4 = dx + dx + dx + dx;
        const int32x4_t wide_dx4 = vdupq_n_s32(SkFractionalIntToFixed(dx4));

        while (count >= 4) {
            const int32x4_t res =
                    PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);

            vst1q_u32(xy, vreinterpretq_u32_s32(res));

            wide_fx = vaddq_s32(wide_fx, wide_dx4);
            // keep the scalar position in step for the tail loop below
            fx += dx4;
            xy += 4;
            count -= 4;
        }
    }

    // scalar tail
    while (--count >= 0) {
        *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX,
                                   one PREAMBLE_ARG_X);
        fx += dx;
    }
}

// Affine matrix, with filtering.
//
// Writes `count` pairs of 32-bit entries, each pair ordered packed-Y then
// packed-X.
static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
                               uint32_t xy[], int count, int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask |
                             SkMatrix::kAffine_Mask)) == 0);

    PREAMBLE(s);
    SkPoint srcPt;
    s.fInvProc(s.fInvMatrix,
               SkIntToScalar(x) + SK_ScalarHalf,
               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);

    SkFixed oneX = s.fFilterOneX;
    SkFixed oneY = s.fFilterOneY;
    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
    SkFixed dx = s.fInvSx;
    SkFixed dy = s.fInvKy;
    unsigned maxX = s.fBitmap->width() - 1;
    unsigned maxY = s.fBitmap->height() - 1;

    if (count >= 4) {
        int32x4_t wide_fx = vdupq_n_s32(fx);
        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);

        int32x4_t wide_fy = vdupq_n_s32(fy);
        wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
        wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
        wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);

        // loop-invariant advances of four samples
        const SkFixed dx4 = dx + dx + dx + dx;
        const SkFixed dy4 = dy + dy + dy + dy;
        const int32x4_t wide_dx4 = vdupq_n_s32(dx4);
        const int32x4_t wide_dy4 = vdupq_n_s32(dy4);

        while (count >= 4) {
            int32x4x2_t vxy;

            // do the Y side, then the X side, then interleave them
            vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
            vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);

            // interleave as YXYXYXYX as part of the storing
            vst2q_s32((int32_t*)xy, vxy);

            // prepare next iteration
            wide_fx = vaddq_s32(wide_fx, wide_dx4);
            fx += dx4;
            wide_fy = vaddq_s32(wide_fy, wide_dy4);
            fy += dy4;

            xy += 8; // 4 x's, 4 y's
            count -= 4;
        }
    }

    // scalar tail
    while (--count >= 0) {
        // NB: writing Y/X
        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
        fy += dy;
        *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
        fx += dx;
    }
}

// Perspective matrix, with filtering.
//
// Source coordinates come from SkPerspIter in batches as interleaved
// x,y SkFixed pairs; output is pairs of packed-Y, packed-X entries.
static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
                              uint32_t* SK_RESTRICT xy, int count,
                              int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);

    PREAMBLE(s);
    unsigned maxX = s.fBitmap->width() - 1;
    unsigned maxY = s.fBitmap->height() - 1;
    SkFixed oneX = s.fFilterOneX;
    SkFixed oneY = s.fFilterOneY;

    // loop-invariant half-sample offsets subtracted from every coordinate
    const int32x4_t wide_halfX = vdupq_n_s32(oneX >> 1);
    const int32x4_t wide_halfY = vdupq_n_s32(oneY >> 1);

    SkPerspIter iter(s.fInvMatrix,
                     SkIntToScalar(x) + SK_ScalarHalf,
                     SkIntToScalar(y) + SK_ScalarHalf, count);

    // iter.next() yields the size of the next batch; 0 ends the walk
    while ((count = iter.next()) != 0) {
        const SkFixed* SK_RESTRICT srcXY = iter.getXY();

        while (count >= 4) {
            int32x4_t wide_x, wide_y;
            int32x4x2_t vxy, vresyx;

            // load src: x-y-x-y-x-y-x-y
            vxy = vld2q_s32(srcXY);

            // do the X side, then the Y side, then interleave them
            wide_x = vsubq_s32(vxy.val[0], wide_halfX);
            wide_y = vsubq_s32(vxy.val[1], wide_halfY);

            vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
            vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);

            // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
            vst2q_s32((int32_t*)xy, vresyx);

            // on to the next iteration
            srcXY += 2*4;
            count -= 4;
            xy += 2*4;
        }

        // scalar tail of this batch
        while (--count >= 0) {
            // NB: we read x/y, we write y/x
            *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
                                       oneY PREAMBLE_ARG_Y);
            *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
                                       oneX PREAMBLE_ARG_X);
            srcXY += 2;
        }
    }
}

// Proc table: scale, affine, perspective, each as nofilter/filter.
const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
    SCALE_NOFILTER_NAME,
    SCALE_FILTER_NAME,
    AFFINE_NOFILTER_NAME,
    AFFINE_FILTER_NAME,
    PERSP_NOFILTER_NAME,
    PERSP_FILTER_NAME
};

// Tear down every macro this fragment consumed or defined so that it can be
// included again with a different configuration.
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef TILEX_LOW_BITS_NEON4
#undef TILEY_LOW_BITS_NEON4

#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
    #undef CHECK_FOR_DECAL
#endif

#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME
#undef PERSP_NOFILTER_NAME
#undef PERSP_FILTER_NAME

#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y

#undef TILEX_LOW_BITS
#undef TILEY_LOW_BITS