/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        sp = coeffs
        sp + 4 = length / 2
*/
ENTRY(rsdIntrinsicConvolve3x3_K)
        push        {r4-r8, r10, r11, lr}
        vpush       {q4-q7}

        /* Get the coeffs pointer from the stack and load the
           coefficients in the q0, q1 NEON registers */
        ldr r4, [sp, #32+64]
        vld1.16 {q0, q1}, [r4]

        /* Get count from the stack */
        ldr r4, [sp, #36+64]

        /* Load the frequently used immediate in a register */
        mov r5, #8

1:
        /* Load and post-increase the address by r5=#8 */
        vld1.8 {q13}, [r1], r5
        vld1.8 {q14}, [r2], r5
        vld1.8 {q15}, [r3], r5

        /* Signal memory for data that will be used in the loop after the next */
        PLD (r1, r5)
        PLD (r2, r5)
        PLD (r3, r5)

        vmovl.u8 q2, d26
        vmovl.u8 q3, d27
        vmovl.u8 q4, d28
        vmovl.u8 q5, d29
        vmovl.u8 q6, d30
        vmovl.u8 q7, d31

/*
        The two pixel source array is
        d4,  d5,  d6,  d7
        d8,  d9,  d10, d11
        d12, d13, d14, d15
*/

        vmull.s16 q8, d4, d0[0]
        vmlal.s16 q8, d5, d0[1]
        vmlal.s16 q8, d6, d0[2]
        vmlal.s16 q8, d8, d0[3]
        vmlal.s16 q8, d9, d1[0]
        vmlal.s16 q8, d10, d1[1]
        vmlal.s16 q8, d12, d1[2]
        vmlal.s16 q8, d13, d1[3]
        vmlal.s16 q8, d14, d2[0]

        vmull.s16 q9, d5, d0[0]
        vmlal.s16 q9, d6, d0[1]
        vmlal.s16 q9, d7, d0[2]
        vmlal.s16 q9, d9, d0[3]
        vmlal.s16 q9, d10, d1[0]
        vmlal.s16 q9, d11, d1[1]
        vmlal.s16 q9, d13, d1[2]
        vmlal.s16 q9, d14, d1[3]
        vmlal.s16 q9, d15, d2[0]

        vshrn.i32 d16, q8, #8
        vshrn.i32 d17, q9, #8
        vqmovun.s16 d16, q8

        vst1.8 d16, [r0]!

        /* Are we done yet? */
        subs r4, r4, #1
        bne 1b

        /* We're done, bye! */
        vpop        {q4-q7}
        pop         {r4-r8, r10, r11, lr}
        bx          lr
END(rsdIntrinsicConvolve3x3_K)

/*
        static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
                          const float* gPtr, int iradius, int x1, int x2)

        r0 = out
        r1 = pin
        r2 = stride
        r3 = gptr
        r4 = sp, ct
        r5 = sp+4, x1
        r6 = sp+8, x2
*/
ENTRY(rsdIntrinsicBlurVFU4_K)
        push        {r4-r8, r10, r11, lr}
        vpush       {q4-q7}

        ldr r4, [sp, #32+64]
        ldr r5, [sp, #32+64 + 4]
        ldr r6, [sp, #32+64 + 8]

1:
        veor q10, q10, q10          /* float4 blurredPixel = 0; */
        veor q11, q11, q11          /* float4 blurredPixel = 0; */
        add r7, r1, r5, lsl #2      /* const uchar *pi = ptrIn + x1 * 4; */
        mov r10, r3
        mov r11, r4

2:
        vld1.32 {d2}, [r7]
        vmovl.u8 q1, d2
        vmovl.u16 q3, d2
        vmovl.u16 q4, d3
        vcvt.f32.s32 q3, q3
        vcvt.f32.s32 q4, q4
        vld1.32 {d0[0]}, [r10]!
        add r7, r7, r2
        vmla.f32 q10, q3, d0[0]
        vmla.f32 q11, q4, d0[0]
        subs r11, r11, #1
        bne 2b

        vst1.32 {q10}, [r0]!
        vst1.32 {q11}, [r0]!

        add r5, r5, #2
        cmp r5, r6
        bne 1b

        vpop        {q4-q7}
        pop         {r4-r8, r10, r11, lr}
        bx          lr
END(rsdIntrinsicBlurVFU4_K)

/*
        Horizontal blur pass over the float4 intermediate buffer
        (one uchar4 output pixel per outer iteration):

        r0 = out
        r1 = pin
        r2 = gptr
        r3 = ct
        r4 = sp, x1
        r5 = sp+4, x2
*/
ENTRY(rsdIntrinsicBlurHFU4_K)
        push        {r4-r8, r10, r11, lr}
        vpush       {q4-q7}

        ldr r4, [sp, #32+64]
        ldr r5, [sp, #32+64 + 4]

1:
        add r7, r1, r4, lsl #4      /* const float4 *pi = ptrIn + x1; (16 bytes per float4) */
        mov r10, r2
        mov r11, r3

        vld1.32 {q1}, [r7]!
        vld1.32 {d6[0]}, [r10]!
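        /* The code below is the body of the horizontal pass.  q0 is seeded
           with the first float4 sample scaled by the first weight, and the
           loop at 2: then folds in two samples and two weights per
           iteration (the structure assumes an odd tap count ct).  A rough
           scalar equivalent of one output pixel, kept here only as an
           illustrative sketch using RenderScript-style vector notation
           (pi, gPtr, out, ct are the names from the comments above, not
           definitions in this file):

               // float4 acc = pi[0] * gPtr[0];
               // for (int k = 1; k + 1 < ct; k += 2) {
               //     acc += pi[k]     * gPtr[k];
               //     acc += pi[k + 1] * gPtr[k + 1];
               // }
               // *out++ = convert_uchar4(convert_int4(acc));  // truncating narrow, no clamp
        */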
vmul.f32 q0, q1, d6[0] sub r11, r11, #1 2: vld1.32 {q1}, [r7]! vld1.32 {q2}, [r7]! vld1.32 {d6}, [r10]! vmla.f32 q0, q1, d6[0] vmla.f32 q0, q2, d6[1] subs r11, r11, #2 bne 2b vcvt.s32.f32 q0, q0 vmovn.u32 d0, q0 vmovn.u16 d0, q0 vst1.32 {d0[0]}, [r0]! add r4, r4, #1 cmp r4, r5 bne 1b vpop {q4-q7} pop {r4-r8, r10, r11, lr} bx lr END(rsdIntrinsicBlurHFU4_K) ENTRY(rsdIntrinsicBlurHFU1_K) push {r4-r8, r10, r11, lr} vpush {q4-q7} ldr r4, [sp, #32+64] ldr r5, [sp, #32+64 + 4] 1: add r7, r1, r4, lsl #2 /* const uchar *pi = ptrIn + x1 * 4; */ mov r10, r2 mov r11, r3 veor q0, q0 2: vld1.32 {q1}, [r7] add r7, r7, #4 vld1.32 {d4[0]}, [r10]! vmla.f32 q0, q1, d4[0] subs r11, r11, #1 bne 2b vcvt.s32.f32 q0, q0 vmovn.u32 d0, q0 vmovn.u16 d0, q0 vst1.32 {d0[0]}, [r0]! add r4, r4, #4 cmp r4, r5 bne 1b vpop {q4-q7} pop {r4-r8, r10, r11, lr} bx lr END(rsdIntrinsicBlurHFU1_K) /* Function called with the following arguments: dst, Y, vu, len, YuvCoeff r0 = dst r1 = Y r2 = VU r3 = length (pixels / 8) ---- Args below will be in the stack ---- sp = YuvCoeff This function converts 8 pixels per iteration */ ENTRY(rsdIntrinsicYuv_K) push {r4, r5, lr} @ preserve clobbered int registers vpush {Q4-Q7} @ preserve Vregisters we clobber mov r5, #16 @ Integer 16 in r5; used as an incrementing value ldr r4, [sp, #64+12] @ load the coeffs address in memory in r4 (16*4 + 4*3) vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2 vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6 vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8 mov r4, #8 @ Integer 8 in r4; used as an incrementing value vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in @ the coeffs matrix (Q2) 1: vld1.8 {d10}, [r1]! @ get Y (r1->Y) vld2.8 {d12, d14}, [r2], r4 @ split V from U (r2 -> VU) and increase pointer by 8 (in r4) pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops pld [r2, #64] @ preloading data from address vu(r2) + 64 for subsequent loops vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7) vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit) vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit) vsubl.u8 Q5, d12, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3) vsubl.u8 Q6, d14, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3) vmov.u16 d11, d10 @ Copying V to d11 vmov.u16 d13, d12 @ Copying U to d13 vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3) vzip.u16 d12, d13 @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3) vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9 vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10 vmov Q12, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12 vmov Q13, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13 @ R G B @ Pixel(0-3) Q8, Q9, Q10 @ Pixel(4-7) Q11, Q12, Q13 @ @ Pixel(0-3) vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409 vmlal.s16 Q9, d10, d5[0] @ G : Q9 = Q9(Y-16) + (V-128) * (-208) vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100) vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516 @ Pixel(4-7) vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409 vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208) vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100) vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516 @ Pixel(0-3) vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d20, Q10, #8 @ d20 : B 
shifted right by 8 rounded'n narrowed to 16bit @ Pixel(4-7) vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit) vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit) vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit) subs r3, r3, #1 @ Checking length (r3) vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0) bne 1b @ if not done with length, loop vpop {Q4-Q7} @ Restore Vregisters pop {r4, r5, lr} @ Restore int registers bx lr END(rsdIntrinsicYuv_K) /* Function called with the following arguments: dst, Y, vu, len, YuvCoeff r0 = dst r1 = Y r2 = UV r3 = length (pixels / 8) ---- Args below will be in the stack ---- sp = YuvCoeff This function converts 8 pixels per iteration */ ENTRY(rsdIntrinsicYuvR_K) push {r4, r5, lr} @ preserve clobbered int registers vpush {Q4-Q7} @ preserve Vregisters we clobber mov r5, #16 @ Integer 16 in r5; used as an incrementing value ldr r4, [sp, #64+12] @ load the coeffs address in memory in r4 (16*4 + 4*3) vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2 vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6 vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8 mov r4, #8 @ Integer 8 in r4; used as an incrementing value vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in @ the coeffs matrix (Q2) 1: vld1.8 {d10}, [r1]! @ get Y (r1->Y) vld2.8 {d12, d14}, [r2], r4 @ split V from U (r2 -> VU) and increase pointer by 8 (in r4) pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops pld [r2, #64] @ preloading data from address vu(r2) + 64 for subsequent loops vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7) vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit) vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit) vsubl.u8 Q5, d14, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3) vsubl.u8 Q6, d12, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3) vmov.u16 d11, d10 @ Copying V to d11 vmov.u16 d13, d12 @ Copying U to d13 vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3) vzip.u16 d12, d13 @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3) vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9 vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10 vmov Q12, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12 vmov Q13, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13 @ R G B @ Pixel(0-3) Q8, Q9, Q10 @ Pixel(4-7) Q11, Q12, Q13 @ @ Pixel(0-3) vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409 vmlal.s16 Q9, d10, d5[0] @ G : Q9 = Q9(Y-16) + (V-128) * (-208) vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100) vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516 @ Pixel(4-7) vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409 vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208) vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100) vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516 @ Pixel(0-3) vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n narrowed to 16bit @ Pixel(4-7) vrshrn.i32 d17, Q11, #8 @ d17 : R 
shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit) vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit) vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit) subs r3, r3, #1 @ Checking length (r3) vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0) bne 1b @ if not done with length, loop vpop {Q4-Q7} @ Restore Vregisters pop {r4, r5, lr} @ Restore int registers bx lr END(rsdIntrinsicYuvR_K) /* Function called with the following arguments: dst, Y, v, u, len, YuvCoeff r0 = dst r1 = Y r2 = V, r3 = U ---- Args below will be in the stack ---- sp = length (pixels / 8) sp+4 = YuvCoeff This function converts 8 pixels per iteration */ ENTRY(rsdIntrinsicYuv2_K) push {r4, r5, r6, lr} @ preserve clobbered int registers vpush {Q4-Q7} @ preserve Vregisters we clobber mov r5, #16 @ Integer 16 in r5; used as an incrementing value ldr r4, [sp, #64+16+4] @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4) ldr r6, [sp, #64+16] @ load the length in r6 (16*4 + 4*4) vld1.16 {Q2}, [r4]! @ load the multipliers from the coeffs matrix (r4) in Q2 vld1.8 {d6[]}, [r4], r5 @ load y offset 16 from the coeffs matrix (r4) in d6 vld1.8 {d8[]}, [r4], r5 @ load V and U offset of 128 from the coeffs matrix (r4) in d8 mov r4, #4 @ Integer 8 in r4; used as an incrementing value vdup.8 d3, d5[1] @ d3 = 255 (alpha) from the multipliers line in @ the coeffs matrix (Q2) 1: vld1.8 {d10}, [r1]! @ get Y (r1->Y) vld1.8 {d12}, [r3], r4 @ split V from U (r2 -> VU) and increase pointer by 4 (in r4) vld1.8 {d14}, [r2], r4 @ split V from U (r2 -> VU) and increase pointer by 4 (in r4) pld [r1, #64] @ preloading data from address y(r1) + 64 for subsequent loops pld [r2, #64] @ preloading data from address vu(r2) + 64 for subsequent loops vsubl.u8 Q5, d10, d6 @ Y to 16 bit - 16 (in 16bit) (n to n+7) vmull.s16 Q8, d10, d4[0] @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit) vmull.s16 Q11, d11, d4[0] @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit) vsubl.u8 Q5, d12, d8 @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3) vsubl.u8 Q6, d14, d8 @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3) vmov.u16 d11, d10 @ Copying V to d11 vmov.u16 d13, d12 @ Copying U to d13 vzip.u16 d10, d11 @ Q5 = V (n,n n+1, n+1) V(n+2, n+2, n+3, n+3) vzip.u16 d12, d13 @ Q5 = U (n,n n+1, n+1) U(n+2, n+2, n+3, n+3) vmov Q9, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9 vmov Q10, Q8 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10 vmov Q12, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q12 vmov Q13, Q11 @ Copy Q11(Y: n+5, n+6, n+6, n+7) to Q13 @ R G B @ Pixel(0-3) Q8, Q9, Q10 @ Pixel(4-7) Q11, Q12, Q13 @ @ Pixel(0-3) vmlal.s16 Q8, d10, d4[1] @ R : Q8 = Q8(Y-16) + (V-128) * 409 vmlal.s16 Q9, d10, d5[0] @ G : Q9 = Q9(Y-16) + (V-128) * (-208) vmlal.s16 Q9, d12, d4[2] @ + (U-128) * (-100) vmlal.s16 Q10, d12, d4[3] @ B : Q10 = Q10(Y-16) + (U-128) * 516 @ Pixel(4-7) vmlal.s16 Q11, d11, d4[1] @ R : Q11 = Q11(Y-16) + (V-128) * 409 vmlal.s16 Q12, d11, d5[0] @ G : Q12 = Q12(Y-16) + (V-128) * (-208) vmlal.s16 Q12, d13, d4[2] @ + (U-128) * (-100) vmlal.s16 Q13, d13, d4[3] @ B : Q13 = Q13(Y-16) + (U-128) * 516 @ Pixel(0-3) vrshrn.i32 d16, Q8, #8 @ d16 : R shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d18, Q9, #8 @ d18 : G shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d20, Q10, #8 @ d20 : B shifted right by 8 rounded'n 
narrowed to 16bit @ Pixel(4-7) vrshrn.i32 d17, Q11, #8 @ d17 : R shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d19, Q12, #8 @ d19 : G shifted right by 8 rounded'n narrowed to 16bit vrshrn.i32 d21, Q13, #8 @ d21 : B shifted right by 8 rounded'n narrowed to 16bit vqmovun.s16 d0, Q8 @ r = d0 (saturated, unsigned and narrowed to 8bit) vqmovun.s16 d1, Q9 @ g = d1 (saturated, unsigned and narrowed to 8bit) vqmovun.s16 d2, Q10 @ b = d2 (saturated, unsigned and narrowed to 8bit) subs r6, r6, #1 @ Checking length (r6) vst4.8 {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0) bne 1b @ if not done with length, loop vpop {Q4-Q7} @ Restore Vregisters pop {r4, r5, r6, lr} @ Restore int registers bx lr END(rsdIntrinsicYuv2_K) /* Convolve 5x5 */ /* r0 = dst r1 = y0 base pointer r2 = y1 base pointer r3 = y2 base pointer r4 = y3 base pointer r5 = y4 base pointer r6 = coeffs r7 = length */ ENTRY(rsdIntrinsicConvolve5x5_K) push {r4-r7, lr} vpush {q4-q7} /* load y3 in r4 */ ldr r4, [sp, #20 + 64] /* load y4 in r5 */ ldr r5, [sp, #24 + 64] /* Load the coefficients pointer */ ldr r6, [sp, #28 + 64] /* Create the coefficients vector */ vld1.16 {d0, d1, d2, d3}, [r6]! vld1.16 {d4, d5, d6}, [r6] vmov.u32 q15, #0x7f /* load the count */ ldr r6, [sp, #32 + 64] /* Load the frequently used immediate in a register */ mov r7, #8 1: /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */ vld1.8 {d24, d25, d26}, [r1], r7 @ y0 ( y - 2 ) vld1.8 {d27, d28, d29}, [r2], r7 @ y0 ( y - 1 ) /* Signal memory for data that will be used in the loop after the next */ PLD (r1, r7) PLD (r2, r7) /* Promoting the 8bit channels to 16bit */ vmovl.u8 q9, d24 vmovl.u8 q10, d25 vmovl.u8 q11, d26 vmovl.u8 q12, d27 vmovl.u8 q13, d28 vmovl.u8 q14, d29 /* d18, d19, d20, d21, d22, d23, d24, d25 */ vmull.s16 q4, d18, d0[0] vmlal.s16 q4, d19, d0[1] vmlal.s16 q4, d20, d0[2] vmlal.s16 q4, d21, d0[3] vmlal.s16 q4, d22, d1[0] vmlal.s16 q4, d24, d1[1] vmlal.s16 q4, d25, d1[2] vmlal.s16 q4, d26, d1[3] vmlal.s16 q4, d27, d2[0] vmlal.s16 q4, d28, d2[1] vmull.s16 q5, d19, d0[0] vmlal.s16 q5, d20, d0[1] vmlal.s16 q5, d21, d0[2] vmlal.s16 q5, d22, d0[3] vmlal.s16 q5, d23, d1[0] vmlal.s16 q5, d25, d1[1] vmlal.s16 q5, d26, d1[2] vmlal.s16 q5, d27, d1[3] vmlal.s16 q5, d28, d2[0] vmlal.s16 q5, d29, d2[1] /* Next 2 rows */ /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */ vld1.8 {d24, d25, d26}, [r3], r7 @ y0 ( y ) vld1.8 {d27, d28, d29}, [r4], r7 @ y0 ( y + 1 ) /* Signal memory for data that will be used in the loop after the next */ PLD (r3, r7) PLD (r4, r7) /* Promoting the 8bit channels to 16bit */ vmovl.u8 q9, d24 vmovl.u8 q10, d25 vmovl.u8 q11, d26 vmovl.u8 q12, d27 vmovl.u8 q13, d28 vmovl.u8 q14, d29 /* d18, d19, d20, d21, d22, d23, d24, d25 */ vmlal.s16 q4, d18, d2[2] vmlal.s16 q4, d19, d2[3] vmlal.s16 q4, d20, d3[0] vmlal.s16 q4, d21, d3[1] vmlal.s16 q4, d22, d3[2] vmlal.s16 q4, d24, d3[3] vmlal.s16 q4, d25, d4[0] vmlal.s16 q4, d26, d4[1] vmlal.s16 q4, d27, d4[2] vmlal.s16 q4, d28, d4[3] vmlal.s16 q5, d19, d2[2] vmlal.s16 q5, d20, d2[3] vmlal.s16 q5, d21, d3[0] vmlal.s16 q5, d22, d3[1] vmlal.s16 q5, d23, d3[2] vmlal.s16 q5, d25, d3[3] vmlal.s16 q5, d26, d4[0] vmlal.s16 q5, d27, d4[1] vmlal.s16 q5, d28, d4[2] vmlal.s16 q5, d29, d4[3] /* Last row */ /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */ vld1.8 {d24, d25, d26}, [r5], r7 @ y0 ( y + 2 ) /* Signal memory for data that will be used in the loop after the next */ PLD (r5, r7) /* Promoting the 8bit 
channels to 16bit */ vmovl.u8 q9, d24 vmovl.u8 q10, d25 vmovl.u8 q11, d26 /* d18, d19, d20, d21, d22, d23, d24, d25 */ vmlal.s16 q4, d18, d5[0] vmlal.s16 q4, d19, d5[1] vmlal.s16 q4, d20, d5[2] vmlal.s16 q4, d21, d5[3] vmlal.s16 q4, d22, d6[0] vmlal.s16 q5, d19, d5[0] vmlal.s16 q5, d20, d5[1] vmlal.s16 q5, d21, d5[2] vmlal.s16 q5, d22, d5[3] vmlal.s16 q5, d23, d6[0] vadd.i32 q4, q4, q15 vadd.i32 q5, q5, q15 /* Narrow it to a d-reg 32 -> 16 bit */ vrshrn.i32 d8, q4, #8 vrshrn.i32 d9, q5, #8 /* Pack 16 -> 8 bit, saturate, put two pixels into D reg */ vqmovun.s16 d8, q4 vst1.8 d8, [r0]! @ return the output and increase the address of r0 /* Are we done? */ subs r6, r6, #1 bne 1b /* Yup, bye */ vpop {q4-q7} pop {r4-r7, lr} bx lr END(rsdIntrinsicConvolve5x5_K) /* dst = src + dst * (1.0 - src.a) r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendSrcOver_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! vshll.u8 q12, d0, #8 vshll.u8 q13, d1, #8 vshll.u8 q14, d2, #8 vmovl.u8 q6, d3 vsub.i16 q6, q7, q6 // q6 = 1 - src.a vshll.u8 q15, d3, #8 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! vmovl.u8 q8, d0 vmovl.u8 q9, d1 vmovl.u8 q10, d2 vmovl.u8 q11, d3 vmla.i16 q12, q8, q6 vmla.i16 q13, q9, q6 vmla.i16 q14, q10, q6 vmla.i16 q15, q11, q6 vshrn.i16 d0, q12, #8 vshrn.i16 d1, q13, #8 vshrn.i16 d2, q14, #8 vshrn.i16 d3, q15, #8 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendSrcOver_K) /* dst = dst + src * (1.0 - dst.a) r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendDstOver_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! vmovl.u8 q12, d0 vmovl.u8 q13, d1 vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! 
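        /* The instructions that follow implement the "dst over" arithmetic
           in 8.8 fixed point: dst is widened and pre-scaled by 256
           (vshll #8), q6 becomes 255 - dst.a, src * (255 - dst.a) is
           accumulated on top, and the final >>8 renormalises (256 is used
           in place of 255, as in all of these blend kernels).  Per channel,
           with 16-bit accumulation matching the vmla.i16 below, the scalar
           sketch is roughly:

               // uint16_t t = ((uint16_t)dst_c << 8) + (uint16_t)src_c * (255 - dst_a);
               // out_c = (uint8_t)(t >> 8);   // ~ dst + src * (1 - dst.a)
        */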
vshll.u8 q8, d0, #8 vshll.u8 q9, d1, #8 vshll.u8 q10, d2, #8 vmovl.u8 q6, d3 vsub.i16 q6, q7, q6 // q6 = 1 - dst.a vshll.u8 q11, d3, #8 vmla.i16 q8, q12, q6 vmla.i16 q9, q13, q6 vmla.i16 q10, q14, q6 vmla.i16 q11, q15, q6 vshrn.i16 d0, q8, #8 vshrn.i16 d1, q9, #8 vshrn.i16 d2, q10, #8 vshrn.i16 d3, q11, #8 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendDstOver_K) /* dst = src * dst.a r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendSrcIn_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! vmovl.u8 q12, d0 vmovl.u8 q13, d1 vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! //vmovl.u8 q8, d0 //vmovl.u8 q9, d1 //vmovl.u8 q10, d2 vmovl.u8 q11, d3 vmul.i16 q12, q12, q11 vmul.i16 q13, q13, q11 vmul.i16 q14, q14, q11 vmul.i16 q15, q15, q11 vshrn.i16 d0, q12, #8 vshrn.i16 d1, q13, #8 vshrn.i16 d2, q14, #8 vshrn.i16 d3, q15, #8 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendSrcIn_K) /* dst = dst * src.a r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendDstIn_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! //vmovl.u8 q12, d0 //vmovl.u8 q13, d1 //vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! vmovl.u8 q8, d0 vmovl.u8 q9, d1 vmovl.u8 q10, d2 vmovl.u8 q11, d3 vmul.i16 q8, q8, q15 vmul.i16 q9, q9, q15 vmul.i16 q10, q10, q15 vmul.i16 q11, q11, q15 vshrn.i16 d0, q8, #8 vshrn.i16 d1, q9, #8 vshrn.i16 d2, q10, #8 vshrn.i16 d3, q11, #8 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! 
vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendDstIn_K) /* dst = src * (1.0 - dst.a) r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendSrcOut_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! vmovl.u8 q12, d0 vmovl.u8 q13, d1 vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! //vmovl.u8 q8, d0 //vmovl.u8 q9, d1 //vmovl.u8 q10, d2 vmovl.u8 q11, d3 vsub.i16 q6, q7, q11 // q6 = 1 - dst.a vmul.i16 q12, q12, q6 vmul.i16 q13, q13, q6 vmul.i16 q14, q14, q6 vmul.i16 q15, q15, q6 vshrn.i16 d0, q12, #8 vshrn.i16 d1, q13, #8 vshrn.i16 d2, q14, #8 vshrn.i16 d3, q15, #8 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendSrcOut_K) /* dst = dst * (1.0 - src.a) r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendDstOut_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! //vmovl.u8 q12, d0 //vmovl.u8 q13, d1 //vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! vmovl.u8 q8, d0 vmovl.u8 q9, d1 vmovl.u8 q10, d2 vmovl.u8 q11, d3 vsub.i16 q6, q7, q15 // q6 = 1 - src.a vmul.i16 q12, q8, q6 vmul.i16 q13, q9, q6 vmul.i16 q14, q10, q6 vmul.i16 q15, q11, q6 vshrn.i16 d0, q12, #8 vshrn.i16 d1, q13, #8 vshrn.i16 d2, q14, #8 vshrn.i16 d3, q15, #8 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! 
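        /* SrcOut/DstOut above keep only the part of one layer that lies
           outside the other: every channel of src (or dst) is scaled by
           (255 - other.a) in 16 bit and renormalised with the same >>8
           shortcut used by the rest of these blend kernels. */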
subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendDstOut_K) /* dst.rgb = src.rgb * dst.a + (1.0 - src.a) * dst.rgb dst.a = dst.a r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendSrcAtop_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! vmovl.u8 q12, d0 vmovl.u8 q13, d1 vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! vmovl.u8 q8, d0 vmovl.u8 q9, d1 vmovl.u8 q10, d2 vmovl.u8 q11, d3 vsub.i16 q6, q7, q15 // q6 = 1 - src.a vmul.i16 q8, q8, q6 vmul.i16 q9, q9, q6 vmul.i16 q10, q10, q6 vmla.i16 q8, q12, q11 vmla.i16 q9, q13, q11 vmla.i16 q10, q14, q11 vshrn.i16 d0, q8, #8 vshrn.i16 d1, q9, #8 vshrn.i16 d2, q10, #8 //vshrn.i16 d3, q15, #8 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendSrcAtop_K) /* dst = dst.rgb * src.a + (1.0 - dst.a) * src.rgb dst.a = src.a r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendDstAtop_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! vmovl.u8 q12, d0 vmovl.u8 q13, d1 vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! vmovl.u8 q8, d0 vmovl.u8 q9, d1 vmovl.u8 q10, d2 vmovl.u8 q11, d3 vsub.i16 q6, q7, q11 // q6 = 1 - dst.a vmul.i16 q12, q12, q6 vmul.i16 q13, q13, q6 vmul.i16 q14, q14, q6 vmla.i16 q12, q8, q15 vmla.i16 q13, q9, q15 vmla.i16 q14, q10, q15 vshrn.i16 d0, q12, #8 vshrn.i16 d1, q13, #8 vshrn.i16 d2, q14, #8 //vshrn.i16 d3, q15, #8 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! 
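        /* The two Atop kernels blend both layers per channel:
           SrcAtop: out.rgb = (src.rgb * dst.a + dst.rgb * (255 - src.a)) >> 8
           DstAtop: out.rgb = (dst.rgb * src.a + src.rgb * (255 - dst.a)) >> 8
           The alpha byte written back is the one read from the destination
           (d3 is stored unmodified; its vshrn is commented out above). */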
subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendDstAtop_K) /* dst = dst ^ src r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendXor_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! vmov.u8 d4, d0 vmov.u8 d5, d1 vmov.u8 d6, d2 vmov.u8 d7, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! veor d0, d0, d4 veor d1, d1, d5 veor d2, d2, d6 veor d3, d3, d7 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendXor_K) /* dst = dst * src r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendMultiply_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! vmovl.u8 q12, d0 vmovl.u8 q13, d1 vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! vmovl.u8 q8, d0 vmovl.u8 q9, d1 vmovl.u8 q10, d2 vmovl.u8 q11, d3 vmul.i16 q8, q8, q12 vmul.i16 q9, q9, q13 vmul.i16 q10, q10, q14 vmul.i16 q11, q11, q15 vshrn.i16 d0, q8, #8 vshrn.i16 d1, q9, #8 vshrn.i16 d2, q10, #8 vshrn.i16 d3, q11, #8 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendMultiply_K) /* dst = min(src + dst, 1.0) r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendAdd_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! 
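        /* The "add" kernel: the src pixels just loaded are widened to
           16 bit, dst is loaded and widened the same way, the two are
           added, and vqmovun clamps each channel back to [0, 255].
           Scalar sketch per channel (illustrative only):

               // int t = dst_c + src_c;
               // out_c = t > 255 ? 255 : t;
        */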
vmovl.u8 q12, d0 vmovl.u8 q13, d1 vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! vmovl.u8 q8, d0 vmovl.u8 q9, d1 vmovl.u8 q10, d2 vmovl.u8 q11, d3 vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vadd.i16 q10, q10, q14 vadd.i16 q11, q11, q15 vqmovun.s16 d0, q8 vqmovun.s16 d1, q9 vqmovun.s16 d2, q10 vqmovun.s16 d3, q11 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendAdd_K) /* dst = max(dst - src, 0.0) r0 = dst r1 = src r2 = length */ ENTRY(rsdIntrinsicBlendSub_K) .save {r4, lr} stmfd sp!, {r4, lr} vpush {q4-q7} mov r4, #255 vdup.16 q7, r4 mov r4, r0 1: /* src */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]! vmovl.u8 q12, d0 vmovl.u8 q13, d1 vmovl.u8 q14, d2 vmovl.u8 q15, d3 /* dst */ vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]! vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]! vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]! vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]! vmovl.u8 q8, d0 vmovl.u8 q9, d1 vmovl.u8 q10, d2 vmovl.u8 q11, d3 vsub.i16 q8, q8, q12 vsub.i16 q9, q9, q13 vsub.i16 q10, q10, q14 vsub.i16 q11, q11, q15 vqmovun.s16 d0, q8 vqmovun.s16 d1, q9 vqmovun.s16 d2, q10 vqmovun.s16 d3, q11 vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]! vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]! vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]! vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]! vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]! vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]! vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]! vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]! 
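        /* The "subtract" kernel above mirrors "add": dst_c - src_c is
           computed in 16 bit, and the signed saturating narrow
           (vqmovun.s16) clamps negative results to 0 on the way back
           to 8 bit. */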
subs r2, r2, #1 bne 1b vpop {q4-q7} ldmfd sp!, {r4, lr} bx lr END(rsdIntrinsicBlendSub_K) /* 3D LUT */ /* r0 = dst r1 = src r2 = cube base pointer r3 = cube Y stride r4 = cube Z stride r5 = count xr10 = * constants d0 / q0 = weight 1 p1 d1 = weight 2 p1 d2 / q1 = weight 1 p2 d3 = weight 2 p2 d4 / q2 = src1 d5 = src2 d6 / q3 = baseCoord d7 = baseCoord d8 / q4 = coord1 p1 d9 = d10 / q5 = coord1 p2 d11 = d12 / q6 = d13 = d14 / q7 = d15 = d16 / q8 = x0 y0 z0 d17 = x1 y0 z0 d18 / q9 = x0 y1 z0 d19 = x1 y1 z0 d20 / q10 = x0 y0 z1 d21 = x1 y0 z1 d22 / q11 = x0 y1 z1 d23 = x1 y1 z1 d24 / q12 = alpha mash d25 = current pixel alpha d26 / q13 = 4, y stride d27 = z stride, 0 d28 / q14 = 0x8000 d29 = 0x7fff d30 / q15 = 0, 0, 0, 0xffff d31 = coordMult */ ENTRY(rsdIntrinsic3DLUT_K) push {r4-r8, r10, r11, lr} vpush {q4-q7} /* load Z stride in r4 */ ldr r4, [sp, #32 + 64] /* Load count */ ldr r5, [sp, #36 + 64] vmov.u16 d28, #0x8000 vmov.u16 d29, #0x7fff vmov.u32 d24, #0xff000000 /* load constants using r10 */ ldr r10, [sp, #40 + 64] vld1.32 {d31}, [r10]! vld1.32 {d30}, [r10]! mov r6, #4 vmov d26, r6, r3 mov r6, #0 vmov d27, r4, r6 add r8, r3, r4 1: vld1.8 {d4}, [r1]! vand.u8 d25, d4, d24 vmovl.u8 q2, d4 vmull.u16 q3, d4, d31 vshr.u32 q4, q3, #15 // coord1 p1 vmovn.u32 d1, q3 vand.u16 d1, d29 // weight 2 vsub.u16 d0, d28, d1 // weight 1 vmul.u32 q4, q4, q13 // q4 = x*4, y*ystride, z*zstride, 0 vmull.u16 q3, d5, d31 vshr.u32 q5, q3, #15 // coord1 p2 vmovn.u32 d3, q3 vand.u16 d3, d29 // weight 2 vsub.u16 d2, d28, d3 // weight 1 vmul.u32 q5, q5, q13 // q5 = x*4, y*ystride, z*zstride, 0 vpadd.u32 d8, d8, d9 vpadd.u32 d9, d10, d11 vpadd.u32 d8, d8, d9 vmov r6, r7, d8 // base pointers add r6, r6, r2 add r7, r7, r2 vld1.8 {d16}, [r6] add r11, r6, r3 vld1.8 {d18}, [r11] add r11, r6, r4 vld1.8 {d20}, [r11] add r11, r6, r8 vld1.8 {d22}, [r11] vmovl.u8 q8, d16 vmovl.u8 q9, d18 vmovl.u8 q10, d20 vmovl.u8 q11, d22 vmull.u16 q6, d16, d0[0] vmlal.u16 q6, d17, d1[0] vshrn.u32 d16, q6, #7 vmull.u16 q6, d18, d0[0] vmlal.u16 q6, d19, d1[0] vshrn.u32 d18, q6, #7 vmull.u16 q6, d20, d0[0] vmlal.u16 q6, d21, d1[0] vshrn.u32 d20, q6, #7 vmull.u16 q6, d22, d0[0] vmlal.u16 q6, d23, d1[0] vshrn.u32 d22, q6, #7 vmull.u16 q6, d16, d0[1] vmlal.u16 q6, d18, d1[1] vshrn.u32 d16, q6, #15 vmull.u16 q6, d20, d0[1] vmlal.u16 q6, d22, d1[1] vshrn.u32 d18, q6, #15 vmull.u16 q6, d16, d0[2] vmlal.u16 q6, d18, d1[2] vshrn.u32 d14, q6, #15 vld1.8 {d16}, [r7] add r11, r7, r3 vld1.8 {d18}, [r11] add r11, r7, r4 vld1.8 {d20}, [r11] add r11, r7, r8 vld1.8 {d22}, [r11] vmovl.u8 q8, d16 vmovl.u8 q9, d18 vmovl.u8 q10, d20 vmovl.u8 q11, d22 vmull.u16 q6, d16, d2[0] vmlal.u16 q6, d17, d3[0] vshrn.u32 d16, q6, #7 vmull.u16 q6, d18, d2[0] vmlal.u16 q6, d19, d3[0] vshrn.u32 d18, q6, #7 vmull.u16 q6, d20, d2[0] vmlal.u16 q6, d21, d3[0] vshrn.u32 d20, q6, #7 vmull.u16 q6, d22, d2[0] vmlal.u16 q6, d23, d3[0] vshrn.u32 d22, q6, #7 vmull.u16 q6, d16, d2[1] vmlal.u16 q6, d18, d3[1] vshrn.u32 d16, q6, #15 vmull.u16 q6, d20, d2[1] vmlal.u16 q6, d22, d3[1] vshrn.u32 d18, q6, #15 vmull.u16 q6, d16, d2[2] vmlal.u16 q6, d18, d3[2] vshrn.u32 d15, q6, #15 vrshrn.u16 d14, q7, #8 vbic.u8 d14, d14, d24 // mix in alpha vorr.u8 d14, d14, d25 vst1.32 {d14}, [r0]! /* Are we done? */ subs r5, r5, #1 bne 1b /* Yup, bye */ vpop {q4-q7} pop {r4-r8, r10, r11, lr} bx lr END(rsdIntrinsic3DLUT_K)
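
/*
        Reference sketch of the trilinear interpolation performed by
        rsdIntrinsic3DLUT_K above, for one pixel and one colour channel.
        This is illustrative C only: cube, ystride, zstride and coordMult
        mirror the register map comment above, C() stands for reading that
        channel of one RGBA lattice entry (the real code works with byte
        offsets, entries being 4 bytes apart in x), the lane-to-axis
        mapping (r -> x, g -> y, b -> z) is inferred from the offset
        computation, and the 15-bit fixed-point weights are written as
        floats for clarity.

            // float fx = src.r * coordMult.x, fy = src.g * coordMult.y, fz = src.b * coordMult.z;
            // int   x  = (int)fx,  y  = (int)fy,  z  = (int)fz;   // lattice coordinates
            // float wx = fx - x,   wy = fy - y,   wz = fz - z;     // interpolation weights
            //
            // #define C(i, j, k)   cube[(x + (i)) * 4 + (y + (j)) * ystride + (z + (k)) * zstride]
            // #define LERP(a, b, w) ((a) + ((b) - (a)) * (w))
            // float v00 = LERP(C(0,0,0), C(1,0,0), wx), v10 = LERP(C(0,1,0), C(1,1,0), wx);
            // float v01 = LERP(C(0,0,1), C(1,0,1), wx), v11 = LERP(C(0,1,1), C(1,1,1), wx);
            // out_channel = LERP(LERP(v00, v10, wy), LERP(v01, v11, wy), wz);
            //
            // The alpha byte of the source pixel is copied through unchanged
            // (d25 is OR'd back in after the interpolation).
*/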