      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 
     18 
     19 #include <machine/cpu-features.h>
     20 #include <machine/asm.h>
     21 
/*
        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        sp = coeffs
        sp+4 = length / 2
*/
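
/*
        Reference sketch of the per-pixel math (not part of the build; the
        helper name and argument layout are illustrative assumptions, and the
        intermediate 16-bit narrowing done by vshrn is ignored): each channel
        of an output pixel is the 3x3 dot product of the source pixels with
        the nine 16-bit coefficients, shifted right by 8 and saturated to
        0..255.

        #include <stdint.h>

        static uint8_t sat_u8(int32_t v) {
            return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        }

        // y0, y1, y2 point at the left-most RGBA pixel of the 3x3 window.
        static void convolve3x3_ref(uint8_t out[4],
                                    const uint8_t *y0, const uint8_t *y1,
                                    const uint8_t *y2, const int16_t coeffs[9]) {
            const uint8_t *rows[3] = { y0, y1, y2 };
            for (int c = 0; c < 4; c++) {
                int32_t sum = 0;
                for (int r = 0; r < 3; r++)
                    for (int x = 0; x < 3; x++)
                        sum += coeffs[r * 3 + x] * rows[r][x * 4 + c];
                out[c] = sat_u8(sum >> 8);      // arithmetic shift, then saturate
            }
        }
*/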
     30 
     31 ENTRY(rsdIntrinsicConvolve3x3_K)
     32         push            {r4-r8, r10, r11, lr}
     33         vpush           {q4-q7}
     34 
     35         /* Get the coeffs pointer from the stack and load the
     36            coefficients in the q0, q1 NEON registers */
     37         ldr r4, [sp, #32+64]
     38         vld1.16 {q0, q1}, [r4]
     39 
     40         /* Get count from the stack */
     41         ldr r4, [sp, #36+64]
     42 
     43         /* Load the frequently used immediate in a register */
     44         mov r5, #8
     45 
     46 1:
        /* Load 16 bytes from each row and post-increment the addresses by r5=#8 */
     48         vld1.8 {q13}, [r1], r5
     49         vld1.8 {q14}, [r2], r5
     50         vld1.8 {q15}, [r3], r5
     51 
        /* Prefetch data that will be used in the iteration after the next */
     53         PLD         (r1, r5)
     54         PLD         (r2, r5)
     55         PLD         (r3, r5)
     56 
     57         vmovl.u8 q2, d26
     58         vmovl.u8 q3, d27
     59         vmovl.u8 q4, d28
     60         vmovl.u8 q5, d29
     61         vmovl.u8 q6, d30
     62         vmovl.u8 q7, d31
     63 
     64 /*
        The widened source pixels for the two output pixels are
     66         d4,  d5,  d6,  d7
     67         d8,  d9,  d10, d11
     68         d12, d13, d14, d15
     69 */
     70 
     71         vmull.s16 q8, d4, d0[0]
     72         vmlal.s16 q8, d5, d0[1]
     73         vmlal.s16 q8, d6, d0[2]
     74         vmlal.s16 q8, d8, d0[3]
     75         vmlal.s16 q8, d9, d1[0]
     76         vmlal.s16 q8, d10, d1[1]
     77         vmlal.s16 q8, d12, d1[2]
     78         vmlal.s16 q8, d13, d1[3]
     79         vmlal.s16 q8, d14, d2[0]
     80 
     81         vmull.s16 q9, d5, d0[0]
     82         vmlal.s16 q9, d6, d0[1]
     83         vmlal.s16 q9, d7, d0[2]
     84         vmlal.s16 q9, d9, d0[3]
     85         vmlal.s16 q9, d10, d1[0]
     86         vmlal.s16 q9, d11, d1[1]
     87         vmlal.s16 q9, d13, d1[2]
     88         vmlal.s16 q9, d14, d1[3]
     89         vmlal.s16 q9, d15, d2[0]
     90 
     91         vshrn.i32 d16, q8, #8
     92         vshrn.i32 d17, q9, #8
     93 
     94         vqmovun.s16 d16, q8
     95         vst1.8 d16, [r0]!
     96 
     97         /* Are we done yet? */
     98         subs r4, r4, #1
     99         bne 1b
    100 
    101         /* We're done, bye! */
    102         vpop            {q4-q7}
    103         pop             {r4-r8, r10, r11, lr}
    104         bx              lr
    105 END(rsdIntrinsicConvolve3x3_K)
    106 
    107 
    108 /*
    109 static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
    110                   const float* gPtr, int iradius, int x1, int x2)
    111 
    112     r0 = out
    113     r1 = pin
    114     r2 = stride
    115     r3 = gptr
    116     r4 = sp, ct
    117     r5 = sp+4, x1
    118     r6 = sp+8, x2
    119 */
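
/*
        Reference sketch (illustrative only; the helper name and the float4
        stand-in are assumptions): the vertical pass walks ct rows down one
        column, widening each uchar4 pixel to float and accumulating it
        scaled by the matching coefficient.  The assembly below does two
        adjacent columns per iteration; the sketch shows one, starting at
        pi = ptrIn + x * 4.

        #include <stdint.h>

        typedef struct { float r, g, b, a; } float4_t;   // stand-in for float4

        static float4_t blur_vert_ref(const uint8_t *ptrIn, int iStride,
                                      const float *gPtr, int ct, int x) {
            float4_t acc = { 0, 0, 0, 0 };
            const uint8_t *pi = ptrIn + x * 4;
            for (int i = 0; i < ct; i++) {
                acc.r += pi[0] * gPtr[i];
                acc.g += pi[1] * gPtr[i];
                acc.b += pi[2] * gPtr[i];
                acc.a += pi[3] * gPtr[i];
                pi += iStride;
            }
            return acc;     // stored as a float4, not converted back to bytes
        }
*/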
    120 ENTRY(rsdIntrinsicBlurVFU4_K)
    121         push            {r4-r8, r10, r11, lr}
    122         vpush           {q4-q7}
    123 
    124         ldr r4, [sp, #32+64]
    125         ldr r5, [sp, #32+64 + 4]
    126         ldr r6, [sp, #32+64 + 8]
    127 
    128 1:
        veor q10, q10, q10         /* float4 blurredPixel0 = 0; (first output pixel) */
        veor q11, q11, q11         /* float4 blurredPixel1 = 0; (second output pixel) */
    131         add r7, r1, r5, lsl #2  /* const uchar *pi = ptrIn + x1 * 4; */
    132         mov r10, r3
    133 
    134         mov r11, r4
    135 
    136 2:
    137         vld1.32 {d2}, [r7]
    138         vmovl.u8 q1, d2
    139         vmovl.u16 q3, d2
    140         vmovl.u16 q4, d3
    141         vcvt.f32.s32 q3, q3
    142         vcvt.f32.s32 q4, q4
    143         vld1.32 {d0[0]}, [r10]!
    144         add r7, r7, r2
    145         vmla.f32 q10, q3, d0[0]
    146         vmla.f32 q11, q4, d0[0]
    147         subs r11, r11, #1
    148         bne 2b
    149 
    150         vst1.32 {q10}, [r0]!
    151         vst1.32 {q11}, [r0]!
    152         add r5, r5, #2
    153         cmp r5, r6
    154         bne 1b
    155 
    156 
    157         vpop            {q4-q7}
    158         pop             {r4-r8, r10, r11, lr}
    159         bx              lr
    160 END(rsdIntrinsicBlurVFU4_K)
    161 
/*
Horizontal blur pass: reads the float4 pixels produced by the vertical pass
above and the coefficient table, writes converted uchar4 pixels.

    r0 = out
    r1 = pin
    r2 = gptr
    r3 = ct
    r4 = sp, x1
    r5 = sp+4, x2
*/
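/*
        Reference sketch (helper name is an assumption): the horizontal pass
        accumulates ct float4 pixels from the vertically blurred row,
        weighted by gPtr, then converts the result back to a uchar4.  The
        assembly handles the first tap separately and then two taps per
        loop; the sketch folds that into one loop.

        #include <stdint.h>

        static void blur_horiz_ref(uint8_t out[4], const float *ptrIn,
                                   const float *gPtr, int ct, int x) {
            // ptrIn holds packed float4 pixels: four floats per pixel.
            float acc[4] = { 0, 0, 0, 0 };
            for (int i = 0; i < ct; i++)
                for (int c = 0; c < 4; c++)
                    acc[c] += ptrIn[(x + i) * 4 + c] * gPtr[i];
            for (int c = 0; c < 4; c++)
                out[c] = (uint8_t)(int32_t)acc[c];  // truncating, like vcvt + vmovn
        }
*/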
    173 ENTRY(rsdIntrinsicBlurHFU4_K)
    174         push            {r4-r8, r10, r11, lr}
    175         vpush           {q4-q7}
    176 
    177         ldr r4, [sp, #32+64]
    178         ldr r5, [sp, #32+64 + 4]
    179 
    180 1:
        add r7, r1, r4, lsl #4  /* const float4 *pi = ptrIn + x1;  (x1 * 16 bytes) */
    182         mov r10, r2
    183         mov r11, r3
    184 
    185         vld1.32 {q1}, [r7]!
    186         vld1.32 {d6[0]}, [r10]!
    187         vmul.f32 q0, q1, d6[0]
    188         sub r11, r11, #1
    189 
    190 2:
    191         vld1.32 {q1}, [r7]!
    192         vld1.32 {q2}, [r7]!
    193         vld1.32 {d6}, [r10]!
    194         vmla.f32 q0, q1, d6[0]
    195         vmla.f32 q0, q2, d6[1]
    196         subs r11, r11, #2
    197         bne 2b
    198 
    199         vcvt.s32.f32 q0, q0
    200         vmovn.u32 d0, q0
    201         vmovn.u16 d0, q0
    202 
    203         vst1.32 {d0[0]}, [r0]!
    204         add r4, r4, #1
    205         cmp r4, r5
    206         bne 1b
    207 
    208         vpop            {q4-q7}
    209         pop             {r4-r8, r10, r11, lr}
    210         bx              lr
    211 END(rsdIntrinsicBlurHFU4_K)
    212 
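/*
        Single-channel variant of the horizontal pass above.  Judging from
        the identical stack offsets, the argument layout appears to be the
        same (r0 = out, r1 = pin, r2 = gptr, r3 = ct, sp = x1, sp+4 = x2),
        with four adjacent single-channel outputs produced per iteration.
*/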
    213 ENTRY(rsdIntrinsicBlurHFU1_K)
    214         push            {r4-r8, r10, r11, lr}
    215         vpush           {q4-q7}
    216 
    217         ldr r4, [sp, #32+64]
    218         ldr r5, [sp, #32+64 + 4]
    219 
    220 1:
    221         add r7, r1, r4, lsl #2  /* const uchar *pi = ptrIn + x1 * 4; */
    222         mov r10, r2
    223         mov r11, r3
    224 
    225         veor q0, q0
    226 
    227 2:
    228         vld1.32 {q1}, [r7]
    229         add r7, r7, #4
    230         vld1.32 {d4[0]}, [r10]!
    231         vmla.f32 q0, q1, d4[0]
    232         subs r11, r11, #1
    233         bne 2b
    234 
    235         vcvt.s32.f32 q0, q0
    236         vmovn.u32 d0, q0
    237         vmovn.u16 d0, q0
    238 
    239         vst1.32 {d0[0]}, [r0]!
    240         add r4, r4, #4
    241         cmp r4, r5
    242         bne 1b
    243 
    244         vpop            {q4-q7}
    245         pop             {r4-r8, r10, r11, lr}
    246         bx              lr
    247 END(rsdIntrinsicBlurHFU1_K)
    248 
    249 /*
    250     Function called with the following arguments: dst, Y, vu, len, YuvCoeff
    251         r0 = dst
    252         r1 = Y
    253         r2 = VU
    254         r3 = length (pixels / 8)
    255         ---- Args below will be in the stack ----
    256         sp = YuvCoeff
    257 
    258         This function converts 8 pixels per iteration
    259 */
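
/*
        Reference sketch of the per-pixel conversion implied by the comments
        below (coefficients 298/409/-208/-100/516, offsets 16 and 128,
        rounded >>8, saturated, alpha forced to 255).  The function name is
        illustrative; the real multipliers come from the YuvCoeff table at
        run time.

        #include <stdint.h>

        static uint8_t sat_u8(int32_t v) {
            return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        }

        static void yuv_to_rgba_ref(uint8_t rgba[4],
                                    uint8_t y, uint8_t u, uint8_t v) {
            int32_t c = y - 16, d = u - 128, e = v - 128;
            rgba[0] = sat_u8((298 * c           + 409 * e + 128) >> 8);  // R
            rgba[1] = sat_u8((298 * c - 100 * d - 208 * e + 128) >> 8);  // G
            rgba[2] = sat_u8((298 * c + 516 * d           + 128) >> 8);  // B
            rgba[3] = 255;                                               // A
        }
*/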
    260 ENTRY(rsdIntrinsicYuv_K)
    261         push        {r4, r5, lr}            @ preserve clobbered int registers
    262         vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
    263 
    264         mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
    265 
    266         ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
    267         vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
    268         vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
    269         vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
    270 
    271         mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
    272 
    273         vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
    274                                             @ the coeffs matrix (Q2)
    275 
    276         1:
    277         vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
    278         vld2.8      {d12, d14}, [r2], r4    @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
    279         pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
    280         pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
    281 
    282         vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
    283         vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
    284         vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
    285 
    286         vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
    287         vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
    288         vmov.u16    d11, d10                @ Copying V to d11
    289         vmov.u16    d13, d12                @ Copying U to d13
        vzip.u16    d10, d11                @ Q5 = V(n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
        vzip.u16    d12, d13                @ Q6 = U(n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
    292 
    293 
    294         vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
    295         vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
    298 
    299                                             @                  R    G    B
    300                                             @     Pixel(0-3)  Q8,  Q9, Q10
    301                                             @     Pixel(4-7) Q11, Q12, Q13
    302                                             @
    303 
    304                                             @ Pixel(0-3)
    305         vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
    306         vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
    307         vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
    308         vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
    309 
    310                                             @ Pixel(4-7)
    311         vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
    312         vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
    313         vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
    314         vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
    315 
    316                                             @ Pixel(0-3)
    317         vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
    318         vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
    319         vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
    320 
    321                                             @ Pixel(4-7)
    322         vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
    323         vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
    324         vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
    325 
    326         vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
    327         vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
    328         vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
    329 
    330         subs        r3, r3, #1              @ Checking length (r3)
    331         vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
    332 
    333         bne 1b                              @ if not done with length, loop
    334 
    335         vpop        {Q4-Q7}                 @ Restore Vregisters
    336         pop         {r4, r5, lr}            @ Restore int registers
    337         bx          lr
    338 END(rsdIntrinsicYuv_K)
    339 
    340 /*
    341     Function called with the following arguments: dst, Y, vu, len, YuvCoeff
    342         r0 = dst
    343         r1 = Y
    344         r2 = UV
    345         r3 = length (pixels / 8)
    346         ---- Args below will be in the stack ----
    347         sp = YuvCoeff
    348 
    349         This function converts 8 pixels per iteration
    350 */
    351 ENTRY(rsdIntrinsicYuvR_K)
    352         push        {r4, r5, lr}            @ preserve clobbered int registers
    353         vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
    354 
    355         mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
    356 
    357         ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
    358         vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
    359         vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
    360         vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
    361 
    362         mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
    363 
    364         vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
    365                                             @ the coeffs matrix (Q2)
    366 
    367         1:
    368         vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
        vld2.8      {d12, d14}, [r2], r4    @ split U from V (r2 -> UV) and increase pointer by 8 (in r4)
    370         pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
    371         pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
    372 
    373         vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
    374         vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
    375         vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
    376 
    377         vsubl.u8    Q5, d14, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
    378         vsubl.u8    Q6, d12, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
    379         vmov.u16    d11, d10                @ Copying V to d11
    380         vmov.u16    d13, d12                @ Copying U to d13
        vzip.u16    d10, d11                @ Q5 = V(n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
        vzip.u16    d12, d13                @ Q6 = U(n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
    383 
    384 
    385         vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
    386         vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
    389 
    390                                             @                  R    G    B
    391                                             @     Pixel(0-3)  Q8,  Q9, Q10
    392                                             @     Pixel(4-7) Q11, Q12, Q13
    393                                             @
    394 
    395                                             @ Pixel(0-3)
    396         vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
    397         vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
    398         vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
    399         vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
    400 
    401                                             @ Pixel(4-7)
    402         vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
    403         vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
    404         vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
    405         vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
    406 
    407                                             @ Pixel(0-3)
    408         vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
    409         vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
    410         vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
    411 
    412                                             @ Pixel(4-7)
    413         vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
    414         vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
    415         vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
    416 
    417         vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
    418         vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
    419         vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
    420 
    421         subs        r3, r3, #1              @ Checking length (r3)
    422         vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
    423 
    424         bne 1b                              @ if not done with length, loop
    425 
    426         vpop        {Q4-Q7}                 @ Restore Vregisters
    427         pop         {r4, r5, lr}            @ Restore int registers
    428         bx          lr
    429 END(rsdIntrinsicYuvR_K)
    430 
    431 /*
    432     Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
    433         r0 = dst
    434         r1 = Y
    435         r2 = V,
    436         r3 = U
    437         ---- Args below will be in the stack ----
    438         sp = length (pixels / 8)
    439         sp+4 = YuvCoeff
    440 
    441         This function converts 8 pixels per iteration
    442 */
    443 ENTRY(rsdIntrinsicYuv2_K)
    444         push        {r4, r5, r6, lr}        @ preserve clobbered int registers
    445         vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
    446 
    447         mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
    448 
    449         ldr         r4, [sp, #64+16+4]      @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4)
    450         ldr         r6, [sp, #64+16]        @ load the length in r6 (16*4 + 4*4)
    451         vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
    452         vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
    453         vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
    454 
        mov         r4, #4                  @ Integer 4 in r4; used as an incrementing value
    456 
    457         vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
    458                                             @ the coeffs matrix (Q2)
    459 
    460         1:
    461         vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
        vld1.8      {d12}, [r3], r4         @ load 8 chroma bytes from r3 (U plane) and advance the pointer by 4 (in r4)
        vld1.8      {d14}, [r2], r4         @ load 8 chroma bytes from r2 (V plane) and advance the pointer by 4 (in r4)
    464         pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
    465         pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
    466 
    467         vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
    468         vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
    469         vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
    470 
    471         vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
    472         vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
    473         vmov.u16    d11, d10                @ Copying V to d11
    474         vmov.u16    d13, d12                @ Copying U to d13
        vzip.u16    d10, d11                @ Q5 = V(n, n, n+1, n+1) V(n+2, n+2, n+3, n+3)
        vzip.u16    d12, d13                @ Q6 = U(n, n, n+1, n+1) U(n+2, n+2, n+3, n+3)
    477 
    478 
    479         vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
    480         vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
        vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
        vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
    483 
    484                                             @                  R    G    B
    485                                             @     Pixel(0-3)  Q8,  Q9, Q10
    486                                             @     Pixel(4-7) Q11, Q12, Q13
    487                                             @
    488 
    489                                             @ Pixel(0-3)
    490         vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
    491         vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
    492         vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
    493         vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
    494 
    495                                             @ Pixel(4-7)
    496         vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
    497         vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
    498         vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
    499         vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
    500 
    501                                             @ Pixel(0-3)
    502         vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
    503         vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
    504         vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
    505 
    506                                             @ Pixel(4-7)
    507         vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
    508         vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
    509         vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
    510 
    511         vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
    512         vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
    513         vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
    514 
    515         subs        r6, r6, #1              @ Checking length (r6)
    516         vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
    517 
    518         bne 1b                              @ if not done with length, loop
    519 
    520         vpop        {Q4-Q7}                 @ Restore Vregisters
    521         pop         {r4, r5, r6, lr}        @ Restore int registers
    522         bx          lr
    523 END(rsdIntrinsicYuv2_K)
    524 
    525 /* Convolve 5x5 */
    526 
    527 /*
    528         r0 = dst
    529         r1 = y0 base pointer
    530         r2 = y1 base pointer
    531         r3 = y2 base pointer
    532         r4 = y3 base pointer
    533         r5 = y4 base pointer
    534         r6 = coeffs
    535         r7 = length
    536 */
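
/*
        Reference sketch (helper name and coefficient layout are
        illustrative; the intermediate 16-bit narrowing is ignored): each
        output channel is the 25-tap dot product of the 5x5 neighbourhood
        with the coefficient table, biased, rounded-shifted right by 8 and
        saturated.  The +0x7f mirrors the vadd below and the +0x80 mirrors
        the rounding done by vrshrn #8.

        #include <stdint.h>

        static uint8_t sat_u8(int32_t v) {
            return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        }

        // rows[0..4] point at the left-most RGBA pixel of each source line.
        static void convolve5x5_ref(uint8_t out[4], const uint8_t *rows[5],
                                    const int16_t coeffs[25]) {
            for (int c = 0; c < 4; c++) {
                int32_t sum = 0;
                for (int r = 0; r < 5; r++)
                    for (int x = 0; x < 5; x++)
                        sum += coeffs[r * 5 + x] * rows[r][x * 4 + c];
                out[c] = sat_u8((sum + 0x7f + 0x80) >> 8);
            }
        }
*/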
    537 ENTRY(rsdIntrinsicConvolve5x5_K)
    538         push        {r4-r7, lr}
    539         vpush       {q4-q7}
    540 
    541         /* load y3 in r4 */
    542         ldr     r4, [sp, #20 + 64]
    543 
    544         /* load y4 in r5 */
    545         ldr     r5, [sp, #24 + 64]
    546 
    547         /* Load the coefficients pointer */
    548         ldr     r6, [sp, #28 + 64]
    549 
    550         /* Create the coefficients vector */
    551         vld1.16     {d0, d1, d2, d3}, [r6]!
    552         vld1.16     {d4, d5, d6}, [r6]
    553 
    554         vmov.u32  q15, #0x7f
    555 
    556         /* load the count */
    557         ldr     r6, [sp, #32 + 64]
    558 
    559         /* Load the frequently used immediate in a register */
    560         mov     r7, #8
    561 
    562 1:
        /* Load the source rows into D registers and post-increment the addresses by r7=#8 */
    564         vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
    565         vld1.8  {d27, d28, d29}, [r2], r7      @  y0 ( y - 1 )
    566 
        /* Prefetch data that will be used in the iteration after the next */
    568         PLD         (r1, r7)
    569         PLD         (r2, r7)
    570 
    571         /* Promoting the 8bit channels to 16bit */
    572         vmovl.u8 q9,  d24
    573         vmovl.u8 q10, d25
    574         vmovl.u8 q11, d26
    575         vmovl.u8 q12, d27
    576         vmovl.u8 q13, d28
    577         vmovl.u8 q14, d29
    578 
    579 /*
    580         d18,  d19,  d20, d21, d22, d23,
    581         d24,  d25
    582 */
    583         vmull.s16 q4, d18, d0[0]
    584         vmlal.s16 q4, d19, d0[1]
    585         vmlal.s16 q4, d20, d0[2]
    586         vmlal.s16 q4, d21, d0[3]
    587         vmlal.s16 q4, d22, d1[0]
    588 
    589         vmlal.s16 q4, d24, d1[1]
    590         vmlal.s16 q4, d25, d1[2]
    591         vmlal.s16 q4, d26, d1[3]
    592         vmlal.s16 q4, d27, d2[0]
    593         vmlal.s16 q4, d28, d2[1]
    594 
    595         vmull.s16 q5, d19, d0[0]
    596         vmlal.s16 q5, d20, d0[1]
    597         vmlal.s16 q5, d21, d0[2]
    598         vmlal.s16 q5, d22, d0[3]
    599         vmlal.s16 q5, d23, d1[0]
    600 
    601         vmlal.s16 q5, d25, d1[1]
    602         vmlal.s16 q5, d26, d1[2]
    603         vmlal.s16 q5, d27, d1[3]
    604         vmlal.s16 q5, d28, d2[0]
    605         vmlal.s16 q5, d29, d2[1]
    606 
    607 
    608         /* Next 2 rows */
        /* Load the source rows into D registers and post-increment the addresses by r7=#8 */
    610         vld1.8  {d24, d25, d26}, [r3], r7      @  y0 ( y )
    611         vld1.8  {d27, d28, d29}, [r4], r7      @  y0 ( y + 1 )
    612 
        /* Prefetch data that will be used in the iteration after the next */
    614         PLD         (r3, r7)
    615         PLD         (r4, r7)
    616 
    617         /* Promoting the 8bit channels to 16bit */
    618         vmovl.u8 q9,  d24
    619         vmovl.u8 q10, d25
    620         vmovl.u8 q11, d26
    621         vmovl.u8 q12, d27
    622         vmovl.u8 q13, d28
    623         vmovl.u8 q14, d29
    624 
    625 /*
    626         d18,  d19,  d20, d21, d22, d23,
    627         d24,  d25
    628 */
    629         vmlal.s16 q4, d18, d2[2]
    630         vmlal.s16 q4, d19, d2[3]
    631         vmlal.s16 q4, d20, d3[0]
    632         vmlal.s16 q4, d21, d3[1]
    633         vmlal.s16 q4, d22, d3[2]
    634 
    635         vmlal.s16 q4, d24, d3[3]
    636         vmlal.s16 q4, d25, d4[0]
    637         vmlal.s16 q4, d26, d4[1]
    638         vmlal.s16 q4, d27, d4[2]
    639         vmlal.s16 q4, d28, d4[3]
    640 
    641         vmlal.s16 q5, d19, d2[2]
    642         vmlal.s16 q5, d20, d2[3]
    643         vmlal.s16 q5, d21, d3[0]
    644         vmlal.s16 q5, d22, d3[1]
    645         vmlal.s16 q5, d23, d3[2]
    646 
    647         vmlal.s16 q5, d25, d3[3]
    648         vmlal.s16 q5, d26, d4[0]
    649         vmlal.s16 q5, d27, d4[1]
    650         vmlal.s16 q5, d28, d4[2]
    651         vmlal.s16 q5, d29, d4[3]
    652 
    653         /* Last row */
        /* Load the source row into D registers and post-increment the address by r7=#8 */
    655         vld1.8  {d24, d25, d26}, [r5], r7      @  y0 ( y + 2 )
    656 
        /* Prefetch data that will be used in the iteration after the next */
    658         PLD         (r5, r7)
    659 
    660         /* Promoting the 8bit channels to 16bit */
    661         vmovl.u8 q9,  d24
    662         vmovl.u8 q10, d25
    663         vmovl.u8 q11, d26
    664 
    665 /*
    666         d18,  d19,  d20, d21, d22, d23,
    667         d24,  d25
    668 */
    669 
    670         vmlal.s16 q4, d18, d5[0]
    671         vmlal.s16 q4, d19, d5[1]
    672         vmlal.s16 q4, d20, d5[2]
    673         vmlal.s16 q4, d21, d5[3]
    674         vmlal.s16 q4, d22, d6[0]
    675 
    676         vmlal.s16 q5, d19, d5[0]
    677         vmlal.s16 q5, d20, d5[1]
    678         vmlal.s16 q5, d21, d5[2]
    679         vmlal.s16 q5, d22, d5[3]
    680         vmlal.s16 q5, d23, d6[0]
    681 
    682 
    683 
    684         vadd.i32 q4, q4, q15
    685         vadd.i32 q5, q5, q15
    686 
    687 /*      Narrow it to a d-reg 32 -> 16 bit */
    688         vrshrn.i32 d8, q4, #8
    689         vrshrn.i32 d9, q5, #8
    690 
    691 
    692 /*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
    693         vqmovun.s16 d8, q4
    694 
    695         vst1.8 d8, [r0]!           @ return the output and increase the address of r0
    696 
    697         /* Are we done? */
    698         subs r6, r6, #1
    699         bne 1b
    700 
    701         /* Yup, bye */
    702         vpop        {q4-q7}
    703         pop         {r4-r7, lr}
    704         bx          lr
    705 
    706 END(rsdIntrinsicConvolve5x5_K)
    707 
    708 
    709 
    710 
    711 /*
    712         dst = src + dst * (1.0 - src.a)
    713 
    714         r0 = dst
    715         r1 = src
    716         r2 = length
    717 */
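
/*
        Reference sketch of the 8.8 fixed-point scheme shared by the blend
        kernels below (function name is illustrative): src is promoted to
        8.8 fixed point (<<8), dst is scaled by (255 - src.a), and the sum
        is narrowed back with >>8, approximating dst = src + dst*(1 - src.a).
        The NEON code keeps the sums in 16-bit lanes; the sketch uses 32 bits
        for clarity.

        #include <stdint.h>

        static void blend_src_over_ref(uint8_t dst[4], const uint8_t src[4]) {
            uint32_t one_minus_sa = 255 - src[3];
            for (int c = 0; c < 4; c++) {
                uint32_t v = ((uint32_t)src[c] << 8) + dst[c] * one_minus_sa;
                dst[c] = (uint8_t)(v >> 8);     // vshrn truncates, no rounding
            }
        }
*/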
    718 ENTRY(rsdIntrinsicBlendSrcOver_K)
    719         .save           {r4, lr}
    720         stmfd           sp!, {r4, lr}
    721         vpush           {q4-q7}
    722 
    723         mov r4, #255
    724         vdup.16 q7, r4
    725 
    726         mov r4, r0
    727 1:
    728 
    729         /* src */
    730         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    731         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    732         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    733         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    734         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
    735         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
    736         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
    737         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
    738         vshll.u8 q12, d0, #8
    739         vshll.u8 q13, d1, #8
    740         vshll.u8 q14, d2, #8
    741         vmovl.u8 q6, d3
    742         vsub.i16 q6, q7, q6        // q6 = 1 - src.a
    743         vshll.u8 q15, d3, #8
    744 
    745         /* dst */
    746         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    747         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    748         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    749         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    750         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
    751         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
    752         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
    753         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
    754         vmovl.u8 q8, d0
    755         vmovl.u8 q9, d1
    756         vmovl.u8 q10, d2
    757         vmovl.u8 q11, d3
    758 
    759         vmla.i16 q12, q8, q6
    760         vmla.i16 q13, q9, q6
    761         vmla.i16 q14, q10, q6
    762         vmla.i16 q15, q11, q6
    763 
    764         vshrn.i16 d0, q12, #8
    765         vshrn.i16 d1, q13, #8
    766         vshrn.i16 d2, q14, #8
    767         vshrn.i16 d3, q15, #8
    768         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
    769         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
    770         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
    771         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
    772         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
    773         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
    774         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
    775         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
    776 
    777         subs r2, r2, #1
    778         bne 1b
    779 
    780         vpop            {q4-q7}
    781         ldmfd           sp!, {r4, lr}
    782         bx              lr
    783 END(rsdIntrinsicBlendSrcOver_K)
    784 
    785 /*
    786         dst = dst + src * (1.0 - dst.a)
    787 
    788         r0 = dst
    789         r1 = src
    790         r2 = length
    791 */
    792 ENTRY(rsdIntrinsicBlendDstOver_K)
    793         .save           {r4, lr}
    794         stmfd           sp!, {r4, lr}
    795         vpush           {q4-q7}
    796 
    797         mov r4, #255
    798         vdup.16 q7, r4
    799 
    800         mov r4, r0
    801 1:
    802 
    803         /* src */
    804         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    805         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    806         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    807         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    808         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
    809         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
    810         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
    811         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
    812         vmovl.u8 q12, d0
    813         vmovl.u8 q13, d1
    814         vmovl.u8 q14, d2
    815         vmovl.u8 q15, d3
    816 
    817         /* dst */
    818         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    819         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    820         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    821         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    822         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
    823         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
    824         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
    825         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
    826         vshll.u8 q8, d0, #8
    827         vshll.u8 q9, d1, #8
    828         vshll.u8 q10, d2, #8
    829         vmovl.u8 q6, d3
    830         vsub.i16 q6, q7, q6        // q6 = 1 - dst.a
    831         vshll.u8 q11, d3, #8
    832 
    833 
    834         vmla.i16 q8, q12, q6
    835         vmla.i16 q9, q13, q6
    836         vmla.i16 q10, q14, q6
    837         vmla.i16 q11, q15, q6
    838 
    839         vshrn.i16 d0, q8, #8
    840         vshrn.i16 d1, q9, #8
    841         vshrn.i16 d2, q10, #8
    842         vshrn.i16 d3, q11, #8
    843         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
    844         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
    845         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
    846         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
    847         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
    848         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
    849         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
    850         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
    851 
    852         subs r2, r2, #1
    853         bne 1b
    854 
    855         vpop            {q4-q7}
    856         ldmfd           sp!, {r4, lr}
    857         bx              lr
    858 END(rsdIntrinsicBlendDstOver_K)
    859 
    860 /*
    861         dst = src * dst.a
    862 
    863         r0 = dst
    864         r1 = src
    865         r2 = length
    866 */
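
/*
        Reference sketch (illustrative): every source channel is scaled by
        the destination alpha in 8-bit fixed point, using >>8 as the kernel
        below does (a cheap stand-in for an exact /255).

        #include <stdint.h>

        static void blend_src_in_ref(uint8_t dst[4], const uint8_t src[4]) {
            uint16_t da = dst[3];
            for (int c = 0; c < 4; c++)
                dst[c] = (uint8_t)(((uint16_t)src[c] * da) >> 8);
        }
*/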
    867 ENTRY(rsdIntrinsicBlendSrcIn_K)
    868         .save           {r4, lr}
    869         stmfd           sp!, {r4, lr}
    870         vpush           {q4-q7}
    871 
    872         mov r4, r0
    873 1:
    874 
    875         /* src */
    876         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    877         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    878         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    879         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    880         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
    881         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
    882         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
    883         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
    884         vmovl.u8 q12, d0
    885         vmovl.u8 q13, d1
    886         vmovl.u8 q14, d2
    887         vmovl.u8 q15, d3
    888 
    889         /* dst */
    890         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    891         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    892         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    893         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    894         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
    895         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
    896         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
    897         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
    898         //vmovl.u8 q8, d0
    899         //vmovl.u8 q9, d1
    900         //vmovl.u8 q10, d2
    901         vmovl.u8 q11, d3
    902 
    903         vmul.i16 q12, q12, q11
    904         vmul.i16 q13, q13, q11
    905         vmul.i16 q14, q14, q11
    906         vmul.i16 q15, q15, q11
    907 
    908         vshrn.i16 d0, q12, #8
    909         vshrn.i16 d1, q13, #8
    910         vshrn.i16 d2, q14, #8
    911         vshrn.i16 d3, q15, #8
    912         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
    913         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
    914         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
    915         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
    916         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
    917         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
    918         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
    919         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
    920 
    921         subs r2, r2, #1
    922         bne 1b
    923 
    924         vpop            {q4-q7}
    925         ldmfd           sp!, {r4, lr}
    926         bx              lr
    927 END(rsdIntrinsicBlendSrcIn_K)
    928 
    929 /*
    930         dst = dst * src.a
    931 
    932         r0 = dst
    933         r1 = src
    934         r2 = length
    935 */
    936 ENTRY(rsdIntrinsicBlendDstIn_K)
    937         .save           {r4, lr}
    938         stmfd           sp!, {r4, lr}
    939         vpush           {q4-q7}
    940 
    941         mov r4, r0
    942 1:
    943 
    944         /* src */
    945         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    946         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    947         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    948         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    949         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
    950         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
    951         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
    952         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
    953         //vmovl.u8 q12, d0
    954         //vmovl.u8 q13, d1
    955         //vmovl.u8 q14, d2
    956         vmovl.u8 q15, d3
    957 
    958         /* dst */
    959         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    960         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    961         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    962         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    963         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
    964         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
    965         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
    966         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
    967         vmovl.u8 q8, d0
    968         vmovl.u8 q9, d1
    969         vmovl.u8 q10, d2
    970         vmovl.u8 q11, d3
    971 
    972         vmul.i16 q8, q8, q15
    973         vmul.i16 q9, q9, q15
    974         vmul.i16 q10, q10, q15
    975         vmul.i16 q11, q11, q15
    976 
    977         vshrn.i16 d0, q8, #8
    978         vshrn.i16 d1, q9, #8
    979         vshrn.i16 d2, q10, #8
    980         vshrn.i16 d3, q11, #8
    981         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
    982         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
    983         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
    984         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
    985         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
    986         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
    987         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
    988         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
    989 
    990         subs r2, r2, #1
    991         bne 1b
    992 
    993         vpop            {q4-q7}
    994         ldmfd           sp!, {r4, lr}
    995         bx              lr
    996 END(rsdIntrinsicBlendDstIn_K)
    997 
    998 
    999 
   1000 /*
   1001         dst = src * (1.0 - dst.a)
   1002 
   1003         r0 = dst
   1004         r1 = src
   1005         r2 = length
   1006 */
   1007 ENTRY(rsdIntrinsicBlendSrcOut_K)
   1008         .save           {r4, lr}
   1009         stmfd           sp!, {r4, lr}
   1010         vpush           {q4-q7}
   1011 
   1012         mov r4, #255
   1013         vdup.16 q7, r4
   1014 
   1015         mov r4, r0
   1016 1:
   1017 
   1018         /* src */
   1019         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1020         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1021         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1022         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1023         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1024         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1025         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1026         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1027         vmovl.u8 q12, d0
   1028         vmovl.u8 q13, d1
   1029         vmovl.u8 q14, d2
   1030         vmovl.u8 q15, d3
   1031 
   1032         /* dst */
   1033         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1034         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1035         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1036         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1037         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1038         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1039         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1040         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1041         //vmovl.u8 q8, d0
   1042         //vmovl.u8 q9, d1
   1043         //vmovl.u8 q10, d2
   1044         vmovl.u8 q11, d3
   1045 
   1046 
   1047         vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
   1048         vmul.i16 q12, q12, q6
   1049         vmul.i16 q13, q13, q6
   1050         vmul.i16 q14, q14, q6
   1051         vmul.i16 q15, q15, q6
   1052 
   1053         vshrn.i16 d0, q12, #8
   1054         vshrn.i16 d1, q13, #8
   1055         vshrn.i16 d2, q14, #8
   1056         vshrn.i16 d3, q15, #8
   1057         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1058         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1059         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1060         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1061         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1062         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1063         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1064         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1065 
   1066         subs r2, r2, #1
   1067         bne 1b
   1068 
   1069         vpop            {q4-q7}
   1070         ldmfd           sp!, {r4, lr}
   1071         bx              lr
   1072 END(rsdIntrinsicBlendSrcOut_K)
   1073 
   1074 
   1075 /*
   1076         dst = dst * (1.0 - src.a)
   1077 
   1078         r0 = dst
   1079         r1 = src
   1080         r2 = length
   1081 */
   1082 ENTRY(rsdIntrinsicBlendDstOut_K)
   1083         .save           {r4, lr}
   1084         stmfd           sp!, {r4, lr}
   1085         vpush           {q4-q7}
   1086 
   1087         mov r4, #255
   1088         vdup.16 q7, r4
   1089 
   1090         mov r4, r0
   1091 1:
   1092 
   1093         /* src */
   1094         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1095         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1096         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1097         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1098         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1099         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1100         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1101         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1102         //vmovl.u8 q12, d0
   1103         //vmovl.u8 q13, d1
   1104         //vmovl.u8 q14, d2
   1105         vmovl.u8 q15, d3
   1106 
   1107         /* dst */
   1108         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1109         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1110         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1111         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1112         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1113         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1114         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1115         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1116         vmovl.u8 q8, d0
   1117         vmovl.u8 q9, d1
   1118         vmovl.u8 q10, d2
   1119         vmovl.u8 q11, d3
   1120 
   1121 
   1122         vsub.i16 q6, q7, q15        // q6 = 1 - src.a
   1123         vmul.i16 q12, q8, q6
   1124         vmul.i16 q13, q9, q6
   1125         vmul.i16 q14, q10, q6
   1126         vmul.i16 q15, q11, q6
   1127 
   1128         vshrn.i16 d0, q12, #8
   1129         vshrn.i16 d1, q13, #8
   1130         vshrn.i16 d2, q14, #8
   1131         vshrn.i16 d3, q15, #8
   1132         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1133         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1134         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1135         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1136         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1137         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1138         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1139         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1140 
   1141         subs r2, r2, #1
   1142         bne 1b
   1143 
   1144         vpop            {q4-q7}
   1145         ldmfd           sp!, {r4, lr}
   1146         bx              lr
   1147 END(rsdIntrinsicBlendDstOut_K)
   1148 
   1149 
   1150 /*
   1151         dst.rgb = src.rgb * dst.a + (1.0 - src.a) * dst.rgb
   1152         dst.a = dst.a
   1153 
   1154         r0 = dst
   1155         r1 = src
   1156         r2 = length
   1157 */
   1158 ENTRY(rsdIntrinsicBlendSrcAtop_K)
   1159         .save           {r4, lr}
   1160         stmfd           sp!, {r4, lr}
   1161         vpush           {q4-q7}
   1162 
   1163         mov r4, #255
   1164         vdup.16 q7, r4
   1165 
   1166         mov r4, r0
   1167 1:
   1168 
   1169         /* src */
   1170         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1171         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1172         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1173         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1174         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1175         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1176         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1177         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1178         vmovl.u8 q12, d0
   1179         vmovl.u8 q13, d1
   1180         vmovl.u8 q14, d2
   1181         vmovl.u8 q15, d3
   1182 
   1183         /* dst */
   1184         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1185         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1186         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1187         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1188         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1189         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1190         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1191         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1192         vmovl.u8 q8, d0
   1193         vmovl.u8 q9, d1
   1194         vmovl.u8 q10, d2
   1195         vmovl.u8 q11, d3
   1196 
   1197 
   1198         vsub.i16 q6, q7, q15        // q6 = 1 - src.a
   1199         vmul.i16 q8, q8, q6
   1200         vmul.i16 q9, q9, q6
   1201         vmul.i16 q10, q10, q6
   1202 
   1203         vmla.i16 q8, q12, q11
   1204         vmla.i16 q9, q13, q11
   1205         vmla.i16 q10, q14, q11
   1206 
   1207 
   1208         vshrn.i16 d0, q8, #8
   1209         vshrn.i16 d1, q9, #8
   1210         vshrn.i16 d2, q10, #8
   1211         //vshrn.i16 d3, q15, #8
   1212         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1213         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1214         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1215         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1216         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1217         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1218         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1219         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1220 
   1221         subs r2, r2, #1
   1222         bne 1b
   1223 
   1224         vpop            {q4-q7}
   1225         ldmfd           sp!, {r4, lr}
   1226         bx              lr
   1227 END(rsdIntrinsicBlendSrcAtop_K)
   1228 
   1229 /*
   1230         dst = dst.rgb * src.a + (1.0 - dst.a) * src.rgb
   1231         dst.a = src.a
   1232 
   1233         r0 = dst
   1234         r1 = src
   1235         r2 = length
   1236 */
   1237 ENTRY(rsdIntrinsicBlendDstAtop_K)
   1238         .save           {r4, lr}
   1239         stmfd           sp!, {r4, lr}
   1240         vpush           {q4-q7}
   1241 
   1242         mov r4, #255
   1243         vdup.16 q7, r4
   1244 
   1245         mov r4, r0
   1246 1:
   1247 
   1248         /* src */
   1249         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1250         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1251         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1252         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1253         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1254         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1255         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1256         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1257         vmovl.u8 q12, d0
   1258         vmovl.u8 q13, d1
   1259         vmovl.u8 q14, d2
   1260         vmovl.u8 q15, d3
   1261 
   1262         /* dst */
   1263         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1264         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1265         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1266         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1267         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1268         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1269         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1270         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1271         vmovl.u8 q8, d0
   1272         vmovl.u8 q9, d1
   1273         vmovl.u8 q10, d2
   1274         vmovl.u8 q11, d3
   1275 
   1276 
   1277         vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
   1278         vmul.i16 q12, q12, q6
   1279         vmul.i16 q13, q13, q6
   1280         vmul.i16 q14, q14, q6
   1281 
   1282         vmla.i16 q12, q8, q15
   1283         vmla.i16 q13, q9, q15
   1284         vmla.i16 q14, q10, q15
   1285 
   1286 
   1287         vshrn.i16 d0, q12, #8
   1288         vshrn.i16 d1, q13, #8
   1289         vshrn.i16 d2, q14, #8
   1290         //vshrn.i16 d3, q15, #8
   1291         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1292         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1293         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1294         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1295         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1296         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1297         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1298         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1299 
   1300         subs r2, r2, #1
   1301         bne 1b
   1302 
   1303         vpop            {q4-q7}
   1304         ldmfd           sp!, {r4, lr}
   1305         bx              lr
   1306 END(rsdIntrinsicBlendDstAtop_K)
   1307 
   1308 /*
   1309         dst = dst ^ src
   1310 
   1311         r0 = dst
   1312         r1 = src
   1313         r2 = length
   1314 */
   1315 ENTRY(rsdIntrinsicBlendXor_K)
   1316         .save           {r4, lr}
   1317         stmfd           sp!, {r4, lr}
   1318         vpush           {q4-q7}
   1319 
   1320         mov r4, #255
   1321         vdup.16 q7, r4
   1322 
   1323         mov r4, r0
   1324 1:
   1325 
   1326         /* src */
   1327         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1328         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1329         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1330         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1331         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1332         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1333         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1334         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1335         vmov.u8 d4, d0
   1336         vmov.u8 d5, d1
   1337         vmov.u8 d6, d2
   1338         vmov.u8 d7, d3
   1339 
   1340         /* dst */
   1341         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1342         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1343         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1344         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1345         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1346         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1347         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1348         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1349 
   1350         veor d0, d0, d4
   1351         veor d1, d1, d5
   1352         veor d2, d2, d6
   1353         veor d3, d3, d7
   1354 
   1355         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1356         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1357         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1358         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1359         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1360         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1361         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1362         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1363 
   1364         subs r2, r2, #1
   1365         bne 1b
   1366 
   1367         vpop            {q4-q7}
   1368         ldmfd           sp!, {r4, lr}
   1369         bx              lr
   1370 END(rsdIntrinsicBlendXor_K)
   1371 
   1372 /*
   1373         dst = dst * src
   1374 
   1375         r0 = dst
   1376         r1 = src
   1377         r2 = length
   1378 */
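/*
        A scalar C sketch of the per-pixel operation, for reference.  Names are
        illustrative; as in the kernel below, the product is renormalized with
        a >>8 shift, a cheap approximation of dividing by 255.

        #include <stdint.h>

        typedef struct { uint8_t r, g, b, a; } rgba8;

        static void blend_multiply_ref(rgba8 *dst, const rgba8 *src, int n)
        {
            for (int i = 0; i < n; i++) {
                dst[i].r = (uint8_t)((dst[i].r * src[i].r) >> 8);
                dst[i].g = (uint8_t)((dst[i].g * src[i].g) >> 8);
                dst[i].b = (uint8_t)((dst[i].b * src[i].b) >> 8);
                dst[i].a = (uint8_t)((dst[i].a * src[i].a) >> 8);
            }
        }
*/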
   1379 ENTRY(rsdIntrinsicBlendMultiply_K)
   1380         .save           {r4, lr}
   1381         stmfd           sp!, {r4, lr}
   1382         vpush           {q4-q7}
   1383 
   1384         mov r4, #255
   1385         vdup.16 q7, r4
   1386 
   1387         mov r4, r0
   1388 1:
   1389 
   1390         /* src */
   1391         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1392         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1393         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1394         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1395         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1396         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1397         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1398         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1399         vmovl.u8 q12, d0
   1400         vmovl.u8 q13, d1
   1401         vmovl.u8 q14, d2
   1402         vmovl.u8 q15, d3
   1403 
   1404         /* dst */
   1405         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1406         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1407         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1408         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1409         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1410         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1411         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1412         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1413         vmovl.u8 q8, d0
   1414         vmovl.u8 q9, d1
   1415         vmovl.u8 q10, d2
   1416         vmovl.u8 q11, d3
   1417 
   1418 
   1419         vmul.i16 q8, q8, q12
   1420         vmul.i16 q9, q9, q13
   1421         vmul.i16 q10, q10, q14
   1422         vmul.i16 q11, q11, q15
   1423 
   1424         vshrn.i16 d0, q8, #8
   1425         vshrn.i16 d1, q9, #8
   1426         vshrn.i16 d2, q10, #8
   1427         vshrn.i16 d3, q11, #8
   1428         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1429         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1430         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1431         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1432         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1433         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1434         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1435         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1436 
   1437         subs r2, r2, #1
   1438         bne 1b
   1439 
   1440         vpop            {q4-q7}
   1441         ldmfd           sp!, {r4, lr}
   1442         bx              lr
   1443 END(rsdIntrinsicBlendMultiply_K)
   1444 
   1445 /*
   1446         dst = min(src + dst, 1.0)
   1447 
   1448         r0 = dst
   1449         r1 = src
   1450         r2 = length
   1451 */
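/*
        A scalar C sketch of the per-pixel operation, for reference.  Names are
        illustrative; the saturating store mirrors the vqmovun.s16 narrowing in
        the kernel below.

        #include <stdint.h>

        typedef struct { uint8_t r, g, b, a; } rgba8;

        static uint8_t sat_add_u8(uint8_t a, uint8_t b)
        {
            int s = a + b;
            return (uint8_t)(s > 255 ? 255 : s);
        }

        static void blend_add_ref(rgba8 *dst, const rgba8 *src, int n)
        {
            for (int i = 0; i < n; i++) {
                dst[i].r = sat_add_u8(dst[i].r, src[i].r);
                dst[i].g = sat_add_u8(dst[i].g, src[i].g);
                dst[i].b = sat_add_u8(dst[i].b, src[i].b);
                dst[i].a = sat_add_u8(dst[i].a, src[i].a);
            }
        }
*/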
   1452 ENTRY(rsdIntrinsicBlendAdd_K)
   1453         .save           {r4, lr}
   1454         stmfd           sp!, {r4, lr}
   1455         vpush           {q4-q7}
   1456 
   1457         mov r4, #255
   1458         vdup.16 q7, r4
   1459 
   1460         mov r4, r0
   1461 1:
   1462 
   1463         /* src */
   1464         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1465         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1466         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1467         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1468         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1469         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1470         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1471         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1472         vmovl.u8 q12, d0
   1473         vmovl.u8 q13, d1
   1474         vmovl.u8 q14, d2
   1475         vmovl.u8 q15, d3
   1476 
   1477         /* dst */
   1478         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1479         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1480         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1481         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1482         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1483         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1484         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1485         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1486         vmovl.u8 q8, d0
   1487         vmovl.u8 q9, d1
   1488         vmovl.u8 q10, d2
   1489         vmovl.u8 q11, d3
   1490 
   1491 
   1492         vadd.i16 q8, q8, q12
   1493         vadd.i16 q9, q9, q13
   1494         vadd.i16 q10, q10, q14
   1495         vadd.i16 q11, q11, q15
   1496 
   1497         vqmovun.s16 d0, q8
   1498         vqmovun.s16 d1, q9
   1499         vqmovun.s16 d2, q10
   1500         vqmovun.s16 d3, q11
   1501         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1502         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1503         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1504         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1505         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1506         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1507         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1508         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1509 
   1510         subs r2, r2, #1
   1511         bne 1b
   1512 
   1513         vpop            {q4-q7}
   1514         ldmfd           sp!, {r4, lr}
   1515         bx              lr
   1516 END(rsdIntrinsicBlendAdd_K)
   1517 
   1518 
   1519 /*
   1520         dst = max(dst - src, 0.0)
   1521 
   1522         r0 = dst
   1523         r1 = src
   1524         r2 = length
   1525 */
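/*
        A scalar C sketch of the per-pixel operation, for reference.  Names are
        illustrative; the clamp at zero mirrors the 16-bit subtract followed by
        vqmovun.s16 in the kernel below.

        #include <stdint.h>

        typedef struct { uint8_t r, g, b, a; } rgba8;

        static uint8_t sat_sub_u8(uint8_t a, uint8_t b)
        {
            int d = a - b;
            return (uint8_t)(d < 0 ? 0 : d);
        }

        static void blend_sub_ref(rgba8 *dst, const rgba8 *src, int n)
        {
            for (int i = 0; i < n; i++) {
                dst[i].r = sat_sub_u8(dst[i].r, src[i].r);
                dst[i].g = sat_sub_u8(dst[i].g, src[i].g);
                dst[i].b = sat_sub_u8(dst[i].b, src[i].b);
                dst[i].a = sat_sub_u8(dst[i].a, src[i].a);
            }
        }
*/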
   1526 ENTRY(rsdIntrinsicBlendSub_K)
   1527         .save           {r4, lr}
   1528         stmfd           sp!, {r4, lr}
   1529         vpush           {q4-q7}
   1530 
   1531         mov r4, #255
   1532         vdup.16 q7, r4
   1533 
   1534         mov r4, r0
   1535 1:
   1536 
   1537         /* src */
   1538         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1539         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1540         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1541         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1542         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1543         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1544         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1545         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1546         vmovl.u8 q12, d0
   1547         vmovl.u8 q13, d1
   1548         vmovl.u8 q14, d2
   1549         vmovl.u8 q15, d3
   1550 
   1551         /* dst */
   1552         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1553         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1554         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1555         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1556         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1557         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1558         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1559         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1560         vmovl.u8 q8, d0
   1561         vmovl.u8 q9, d1
   1562         vmovl.u8 q10, d2
   1563         vmovl.u8 q11, d3
   1564 
   1565 
   1566         vsub.i16 q8, q8, q12
   1567         vsub.i16 q9, q9, q13
   1568         vsub.i16 q10, q10, q14
   1569         vsub.i16 q11, q11, q15
   1570 
   1571         vqmovun.s16 d0, q8
   1572         vqmovun.s16 d1, q9
   1573         vqmovun.s16 d2, q10
   1574         vqmovun.s16 d3, q11
   1575         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1576         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1577         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1578         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1579         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1580         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1581         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1582         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1583 
   1584         subs r2, r2, #1
   1585         bne 1b
   1586 
   1587         vpop            {q4-q7}
   1588         ldmfd           sp!, {r4, lr}
   1589         bx              lr
   1590 END(rsdIntrinsicBlendSub_K)
   1591 
   1592 
   1593 /* 3D LUT */
   1594 
   1595 /*
   1596         r0 = dst
   1597         r1 = src
   1598         r2 = cube base pointer
   1599         r3 = cube Y stride
   1600         r4 = cube Z stride
   1601         r5 = count
    1602         r10 = constants pointer
   1603 
   1604         d0  / q0  = weight 1 p1
   1605         d1        = weight 2 p1
   1606 
   1607         d2  / q1  = weight 1 p2
   1608         d3        = weight 2 p2
   1609 
   1610         d4  / q2  = src1
   1611         d5        = src2
   1612 
   1613         d6  / q3  = baseCoord
   1614         d7        = baseCoord
   1615 
   1616         d8  / q4  = coord1 p1
   1617         d9        =
   1618 
   1619         d10 / q5  = coord1 p2
   1620         d11       =
   1621 
   1622         d12 / q6  =
   1623         d13       =
   1624 
   1625         d14 / q7  =
   1626         d15       =
   1627 
   1628 
   1629         d16 / q8  = x0 y0 z0
   1630         d17       = x1 y0 z0
   1631         d18 / q9  = x0 y1 z0
   1632         d19       = x1 y1 z0
   1633         d20 / q10 = x0 y0 z1
   1634         d21       = x1 y0 z1
   1635         d22 / q11 = x0 y1 z1
   1636         d23       = x1 y1 z1
   1637 
    1638         d24 / q12 = alpha mask
   1639         d25       = current pixel alpha
   1640         d26 / q13 = 4, y stride
   1641         d27       = z stride, 0
   1642         d28 / q14 = 0x8000
   1643         d29       = 0x7fff
   1644         d30 / q15 = 0, 0, 0, 0xffff
   1645 
   1646 
   1647         d31 = coordMult
   1648 */
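/*
        For reference, a floating-point C sketch of the trilinear lookup that
        the kernel below performs in fixed point (coordMult and >>15 split each
        channel into a lattice coordinate and a 15-bit weight).  The names, the
        row-major cube layout and the 0..255 -> lattice mapping are assumptions
        for illustration; the real kernel takes byte strides and a constants
        block from the caller.

        #include <stdint.h>

        typedef struct { uint8_t r, g, b, a; } rgba8;

        static float lerpf(float a, float b, float w) { return a + (b - a) * w; }

        static float tri(float c000, float c100, float c010, float c110,
                         float c001, float c101, float c011, float c111,
                         float wx, float wy, float wz)
        {
            float x00 = lerpf(c000, c100, wx), x10 = lerpf(c010, c110, wx);
            float x01 = lerpf(c001, c101, wx), x11 = lerpf(c011, c111, wx);
            return lerpf(lerpf(x00, x10, wy), lerpf(x01, x11, wy), wz);
        }

        static rgba8 lut3d_ref(const rgba8 *cube, int sx, int sy, int sz, rgba8 in)
        {
            float fx = in.r * (sx - 1) / 255.0f;    // map each channel into lattice space
            float fy = in.g * (sy - 1) / 255.0f;
            float fz = in.b * (sz - 1) / 255.0f;
            int x = (int)fx, y = (int)fy, z = (int)fz;
            if (x > sx - 2) x = sx - 2;             // keep x+1..z+1 inside the cube
            if (y > sy - 2) y = sy - 2;
            if (z > sz - 2) z = sz - 2;
            float wx = fx - x, wy = fy - y, wz = fz - z;

            #define AT(X, Y, Z) cube[((Z) * sy + (Y)) * sx + (X)]
            rgba8 c000 = AT(x, y, z),         c100 = AT(x + 1, y, z);
            rgba8 c010 = AT(x, y + 1, z),     c110 = AT(x + 1, y + 1, z);
            rgba8 c001 = AT(x, y, z + 1),     c101 = AT(x + 1, y, z + 1);
            rgba8 c011 = AT(x, y + 1, z + 1), c111 = AT(x + 1, y + 1, z + 1);
            #undef AT

            rgba8 out;
            out.r = (uint8_t)(tri(c000.r, c100.r, c010.r, c110.r,
                                  c001.r, c101.r, c011.r, c111.r, wx, wy, wz) + 0.5f);
            out.g = (uint8_t)(tri(c000.g, c100.g, c010.g, c110.g,
                                  c001.g, c101.g, c011.g, c111.g, wx, wy, wz) + 0.5f);
            out.b = (uint8_t)(tri(c000.b, c100.b, c010.b, c110.b,
                                  c001.b, c101.b, c011.b, c111.b, wx, wy, wz) + 0.5f);
            out.a = in.a;                           // source alpha passes through
            return out;
        }
*/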
   1649 
   1650 ENTRY(rsdIntrinsic3DLUT_K)
   1651         push        {r4-r8, r10, r11, lr}
   1652         vpush       {q4-q7}
   1653 
   1654         /* load Z stride in r4 */
   1655         ldr     r4, [sp, #32 + 64]
   1656 
   1657         /* Load count */
   1658         ldr     r5, [sp, #36 + 64]
   1659 
   1660         vmov.u16 d28, #0x8000
   1661         vmov.u16 d29, #0x7fff
   1662         vmov.u32 d24, #0xff000000
   1663 
   1664         /* load constants using r10 */
   1665         ldr     r10, [sp, #40 + 64]
   1666         vld1.32 {d31}, [r10]!
   1667         vld1.32 {d30}, [r10]!
   1668 
   1669         mov r6, #4
   1670         vmov d26, r6, r3
   1671         mov r6, #0
   1672         vmov d27, r4, r6
   1673 
   1674         add r8, r3, r4
   1675 
   1676 
   1677 
   1678 1:
   1679         vld1.8 {d4}, [r1]!
   1680         vand.u8 d25, d4, d24
   1681         vmovl.u8 q2, d4
   1682 
   1683 
   1684         vmull.u16 q3, d4, d31
   1685         vshr.u32 q4, q3, #15       // coord1 p1
   1686         vmovn.u32 d1, q3
   1687         vand.u16 d1, d29           // weight 2
   1688         vsub.u16 d0, d28, d1       // weight 1
   1689         vmul.u32 q4, q4, q13           // q4 = x*4, y*ystride, z*zstride, 0
   1690 
   1691         vmull.u16 q3, d5, d31
   1692         vshr.u32 q5, q3, #15       // coord1 p2
   1693         vmovn.u32 d3, q3
   1694         vand.u16 d3, d29           // weight 2
   1695         vsub.u16 d2, d28, d3       // weight 1
   1696         vmul.u32 q5, q5, q13       // q5 = x*4, y*ystride, z*zstride, 0
   1697 
   1698         vpadd.u32 d8, d8, d9
   1699         vpadd.u32 d9, d10, d11
   1700         vpadd.u32 d8, d8, d9
   1701         vmov r6, r7, d8            // base pointers
   1702 
   1703         add  r6, r6, r2
   1704         add  r7, r7, r2
   1705 
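                 /* fetch the four (x0,x1) texel pairs for pixel 1: (y0,z0), (y1,z0), (y0,z1), (y1,z1) */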
   1706         vld1.8 {d16}, [r6]
   1707         add r11, r6, r3
   1708         vld1.8 {d18}, [r11]
   1709         add r11, r6, r4
   1710         vld1.8 {d20}, [r11]
   1711         add r11, r6, r8
   1712         vld1.8 {d22}, [r11]
   1713 
   1714         vmovl.u8 q8, d16
   1715         vmovl.u8 q9, d18
   1716         vmovl.u8 q10, d20
   1717         vmovl.u8 q11, d22
   1718 
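                 /* interpolate along x with weights d0[0]/d1[0] */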
   1719         vmull.u16 q6, d16, d0[0]
   1720         vmlal.u16 q6, d17, d1[0]
   1721         vshrn.u32 d16, q6, #7
   1722         vmull.u16 q6, d18, d0[0]
   1723         vmlal.u16 q6, d19, d1[0]
   1724         vshrn.u32 d18, q6, #7
   1725         vmull.u16 q6, d20, d0[0]
   1726         vmlal.u16 q6, d21, d1[0]
   1727         vshrn.u32 d20, q6, #7
   1728         vmull.u16 q6, d22, d0[0]
   1729         vmlal.u16 q6, d23, d1[0]
   1730         vshrn.u32 d22, q6, #7
   1731 
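                 /* interpolate along y with weights d0[1]/d1[1] */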
   1732         vmull.u16 q6, d16, d0[1]
   1733         vmlal.u16 q6, d18, d1[1]
   1734         vshrn.u32 d16, q6, #15
   1735         vmull.u16 q6, d20, d0[1]
   1736         vmlal.u16 q6, d22, d1[1]
   1737         vshrn.u32 d18, q6, #15
   1738 
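                 /* interpolate along z with weights d0[2]/d1[2] */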
   1739         vmull.u16 q6, d16, d0[2]
   1740         vmlal.u16 q6, d18, d1[2]
   1741         vshrn.u32 d14, q6, #15
   1742 
   1743 
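                 /* repeat for pixel 2 using its weights in d2/d3 */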
   1744         vld1.8 {d16}, [r7]
   1745         add r11, r7, r3
   1746         vld1.8 {d18}, [r11]
   1747         add r11, r7, r4
   1748         vld1.8 {d20}, [r11]
   1749         add r11, r7, r8
   1750         vld1.8 {d22}, [r11]
   1751         vmovl.u8 q8, d16
   1752         vmovl.u8 q9, d18
   1753         vmovl.u8 q10, d20
   1754         vmovl.u8 q11, d22
   1755 
   1756         vmull.u16 q6, d16, d2[0]
   1757         vmlal.u16 q6, d17, d3[0]
   1758         vshrn.u32 d16, q6, #7
   1759         vmull.u16 q6, d18, d2[0]
   1760         vmlal.u16 q6, d19, d3[0]
   1761         vshrn.u32 d18, q6, #7
   1762         vmull.u16 q6, d20, d2[0]
   1763         vmlal.u16 q6, d21, d3[0]
   1764         vshrn.u32 d20, q6, #7
   1765         vmull.u16 q6, d22, d2[0]
   1766         vmlal.u16 q6, d23, d3[0]
   1767         vshrn.u32 d22, q6, #7
   1768 
   1769         vmull.u16 q6, d16, d2[1]
   1770         vmlal.u16 q6, d18, d3[1]
   1771         vshrn.u32 d16, q6, #15
   1772         vmull.u16 q6, d20, d2[1]
   1773         vmlal.u16 q6, d22, d3[1]
   1774         vshrn.u32 d18, q6, #15
   1775 
   1776         vmull.u16 q6, d16, d2[2]
   1777         vmlal.u16 q6, d18, d3[2]
   1778         vshrn.u32 d15, q6, #15
   1779 
   1780         vrshrn.u16 d14, q7, #8
   1781 
   1782         vbic.u8 d14, d14, d24  // mix in alpha
   1783         vorr.u8 d14, d14, d25
   1784         vst1.32 {d14}, [r0]!
   1785 
   1786 
   1787         /* Are we done? */
   1788         subs r5, r5, #1
   1789         bne 1b
   1790 
   1791         /* Yup, bye */
   1792         vpop            {q4-q7}
   1793         pop         {r4-r8, r10, r11, lr}
   1794         bx          lr
   1795 
   1796 END(rsdIntrinsic3DLUT_K)
   1797 
   1798 
   1799