      1 /*
      2  * Copyright (C) 2012 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 
     18 
     19 #include <machine/cpu-features.h>
     20 #include <machine/asm.h>
     21 
     22 /*
     23         r0 = dst
     24         r1 = y0 base pointer
     25         r2 = y1 base pointer
     26         r3 = y2 base pointer
     27         sp = coeffs
      28         sp+4 = length / 2
     29 */
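/*
   For reference, roughly equivalent C for one loop iteration (two output
   pixels).  This is only a sketch of the fixed-point scheme used below
   (signed 16-bit coefficients, 32-bit accumulation, >> 8, saturate to u8);
   the function and variable names are illustrative, not from the original.

       static void convolve3x3_two_pixels(unsigned char *dst,
                                          const unsigned char *y0,
                                          const unsigned char *y1,
                                          const unsigned char *y2,
                                          const short coeff[9])
       {
           const unsigned char *rows[3] = { y0, y1, y2 };
           for (int p = 0; p < 2; p++) {           // two output pixels
               for (int c = 0; c < 4; c++) {       // RGBA channels
                   int acc = 0;
                   for (int r = 0; r < 3; r++)     // 3x3 window, 4 bytes/pixel
                       for (int k = 0; k < 3; k++)
                           acc += rows[r][(p + k) * 4 + c] * coeff[r * 3 + k];
                   acc >>= 8;                      // drop the 8.8 fraction
                   if (acc < 0)   acc = 0;         // vqmovun-style saturation
                   if (acc > 255) acc = 255;
                   dst[p * 4 + c] = (unsigned char)acc;
               }
           }
       }
*/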
     30 
     31 ENTRY(rsdIntrinsicConvolve3x3_K)
     32         push            {r4-r8, r10, r11, lr}
     33         vpush           {q4-q7}
     34 
     35         /* Get the coeffs pointer from the stack and load the
     36            coefficients in the q0, q1 NEON registers */
     37         ldr r4, [sp, #32+64]
     38         vld1.16 {q0, q1}, [r4]
     39 
     40         /* Get count from the stack */
     41         ldr r4, [sp, #36+64]
     42 
     43         /* Load the frequently used immediate in a register */
     44         mov r5, #8
     45 
     46 1:
      47         /* Load and post-increment the address by r5=#8 */
     48         vld1.8 {q13}, [r1], r5
     49         vld1.8 {q14}, [r2], r5
     50         vld1.8 {q15}, [r3], r5
     51 
      52         /* Prefetch the data that will be needed in the loop iteration after the next */
     53         PLD         (r1, r5)
     54         PLD         (r2, r5)
     55         PLD         (r3, r5)
     56 
     57         vmovl.u8 q2, d26
     58         vmovl.u8 q3, d27
     59         vmovl.u8 q4, d28
     60         vmovl.u8 q5, d29
     61         vmovl.u8 q6, d30
     62         vmovl.u8 q7, d31
     63 
     64 /*
      65         The two-pixel source window is now laid out as:
     66         d4,  d5,  d6,  d7
     67         d8,  d9,  d10, d11
     68         d12, d13, d14, d15
     69 */
     70 
     71         vmull.s16 q8, d4, d0[0]
     72         vmlal.s16 q8, d5, d0[1]
     73         vmlal.s16 q8, d6, d0[2]
     74         vmlal.s16 q8, d8, d0[3]
     75         vmlal.s16 q8, d9, d1[0]
     76         vmlal.s16 q8, d10, d1[1]
     77         vmlal.s16 q8, d12, d1[2]
     78         vmlal.s16 q8, d13, d1[3]
     79         vmlal.s16 q8, d14, d2[0]
     80 
     81         vmull.s16 q9, d5, d0[0]
     82         vmlal.s16 q9, d6, d0[1]
     83         vmlal.s16 q9, d7, d0[2]
     84         vmlal.s16 q9, d9, d0[3]
     85         vmlal.s16 q9, d10, d1[0]
     86         vmlal.s16 q9, d11, d1[1]
     87         vmlal.s16 q9, d13, d1[2]
     88         vmlal.s16 q9, d14, d1[3]
     89         vmlal.s16 q9, d15, d2[0]
     90 
     91         vshrn.i32 d16, q8, #8
     92         vshrn.i32 d17, q9, #8
     93 
     94         vqmovun.s16 d16, q8
     95         vst1.8 d16, [r0]!
     96 
     97         /* Are we done yet? */
     98         subs r4, r4, #1
     99         bne 1b
    100 
    101         /* We're done, bye! */
    102         vpop            {q4-q7}
    103         pop             {r4-r8, r10, r11, lr}
    104         bx              lr
    105 END(rsdIntrinsicConvolve3x3_K)
    106 
    107 /*
    108         r0 = dst
    109         r1 = src
    110         r2 = matrix
    111         r3 = length
    112 */
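/*
   For reference, a rough C model of the per-pixel math (the code below does
   four pixels per iteration).  m[] is the 16-entry signed 16-bit matrix in
   the exact order it is loaded into q2/q3; names are illustrative.

       static void color_matrix_4x4(unsigned char out[4],
                                    const unsigned char in[4],
                                    const short m[16])
       {
           for (int i = 0; i < 4; i++) {
               int acc = in[0] * m[i]        // R * d4[i]
                       + in[1] * m[4 + i]    // G * d5[i]
                       + in[2] * m[8 + i]    // B * d6[i]
                       + in[3] * m[12 + i];  // A * d7[i]
               acc >>= 8;                    // 8.8 fixed point
               out[i] = acc < 0 ? 0 : acc > 255 ? 255 : (unsigned char)acc;
           }
       }
*/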
    113 ENTRY(rsdIntrinsicColorMatrix4x4_K)
    114         stmfd           sp!, {r4, lr}
    115         vpush           {q4-q7}
    116 
    117         vld1.16 {q2}, [r2]!
    118         vld1.16 {q3}, [r2]!
    119 
    120 1:
    121         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    122         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    123         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    124         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    125 
    126         vmovl.u8 q12, d0  /* R */
    127         vmovl.u8 q13, d1  /* G */
    128         vmovl.u8 q14, d2  /* B */
    129         vmovl.u8 q15, d3  /* A */
    130 
    131         vmull.s16 q8,  d24, d4[0]
    132         vmull.s16 q9,  d24, d4[1]
    133         vmull.s16 q10, d24, d4[2]
    134         vmull.s16 q11, d24, d4[3]
    135 
    136         vmlal.s16 q8,  d26, d5[0]
    137         vmlal.s16 q9,  d26, d5[1]
    138         vmlal.s16 q10, d26, d5[2]
    139         vmlal.s16 q11, d26, d5[3]
    140 
    141         vmlal.s16 q8,  d28, d6[0]
    142         vmlal.s16 q9,  d28, d6[1]
    143         vmlal.s16 q10, d28, d6[2]
    144         vmlal.s16 q11, d28, d6[3]
    145 
    146         vmlal.s16 q8,  d30, d7[0]
    147         vmlal.s16 q9,  d30, d7[1]
    148         vmlal.s16 q10, d30, d7[2]
    149         vmlal.s16 q11, d30, d7[3]
    150 
    151         vshrn.i32 d24, q8, #8
    152         vshrn.i32 d26, q9, #8
    153         vshrn.i32 d28, q10, #8
    154         vshrn.i32 d30, q11, #8
    155 
    156         vqmovun.s16 d0, q12
    157         vqmovun.s16 d1, q13
    158         vqmovun.s16 d2, q14
    159         vqmovun.s16 d3, q15
    160 
    161         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    162         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    163         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    164         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    165 
    166         subs r3, r3, #1
    167         bne 1b
    168 
    169         vpop            {q4-q7}
    170         ldmfd           sp!, {r4, lr}
    171         bx              lr
    172 END(rsdIntrinsicColorMatrix4x4_K)
    173 
    174 /*
    175         r0 = dst
    176         r1 = src
    177         r2 = matrix
    178         r3 = length
    179 */
    180 ENTRY(rsdIntrinsicColorMatrix3x3_K)
    181         stmfd           sp!, {r4, lr}
    182         vpush           {q4-q7}
    183 
    184         vld1.16 {q2}, [r2]!
    185         vld1.16 {q3}, [r2]!
    186 
    187 1:
    188         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    189         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    190         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    191         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    192 
    193         vmovl.u8 q12, d0
    194         vmovl.u8 q13, d1
    195         vmovl.u8 q14, d2
    196 
    197         vmull.s16 q8,  d24, d4[0]
    198         vmull.s16 q9,  d24, d4[1]
    199         vmull.s16 q10, d24, d4[2]
    200 
    201         vmlal.s16 q8,  d26, d5[0]
    202         vmlal.s16 q9,  d26, d5[1]
    203         vmlal.s16 q10, d26, d5[2]
    204 
    205         vmlal.s16 q8,  d28, d6[0]
    206         vmlal.s16 q9,  d28, d6[1]
    207         vmlal.s16 q10, d28, d6[2]
    208 
    209         vshrn.i32 d24, q8, #8
    210         vshrn.i32 d26, q9, #8
    211         vshrn.i32 d28, q10, #8
    212 
    213         vqmovun.s16 d0, q12
    214         vqmovun.s16 d1, q13
    215         vqmovun.s16 d2, q14
    216 
    217         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    218         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    219         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    220         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    221 
    222         subs r3, r3, #1
    223         bne 1b
    224 
    225         vpop            {q4-q7}
    226         ldmfd           sp!, {r4, lr}
    227         bx              lr
    228 END(rsdIntrinsicColorMatrix3x3_K)
    229 
    230 /*
    231         r0 = dst
    232         r1 = src
    233         r2 = matrix
    234         r3 = length
    235 */
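/*
   For reference, a rough C model of the dot-product variant: one weighted
   sum of R, G and B is computed per pixel and broadcast to all three colour
   channels, while alpha is passed through unchanged.  m[] is indexed as in
   the 4x4 sketch above; names are illustrative.

       static void color_matrix_dot(unsigned char out[4],
                                    const unsigned char in[4],
                                    const short m[16])
       {
           int acc = (in[0] * m[0] + in[1] * m[4] + in[2] * m[8]) >> 8;
           unsigned char v = acc < 0 ? 0 : acc > 255 ? 255 : (unsigned char)acc;
           out[0] = out[1] = out[2] = v;
           out[3] = in[3];                  // alpha from the source pixel
       }
*/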
    236 ENTRY(rsdIntrinsicColorMatrixDot_K)
    237         stmfd           sp!, {r4, lr}
    238         vpush           {q4-q7}
    239 
    240         vld1.16 {q2}, [r2]!
    241         vld1.16 {q3}, [r2]!
    242 
    243 1:
    244         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    245         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    246         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    247         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    248 
    249         vmovl.u8 q12, d0
    250         vmovl.u8 q13, d1
    251         vmovl.u8 q14, d2
    252 
    253         vmull.s16 q8,  d24, d4[0]
    254         vmlal.s16 q8,  d26, d5[0]
    255         vmlal.s16 q8,  d28, d6[0]
    256         vshrn.i32 d24, q8, #8
    257         vqmovun.s16 d0, q12
    258         vmov.u8 d1, d0
    259         vmov.u8 d2, d0
    260 
    261         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    262         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    263         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    264         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    265 
    266         subs r3, r3, #1
    267         bne 1b
    268 
    269         vpop            {q4-q7}
    270         ldmfd           sp!, {r4, lr}
    271         bx              lr
    272 END(rsdIntrinsicColorMatrixDot_K)
    273 
    274 
    275 /*
    276 static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
    277                   const float* gPtr, int iradius, int x1, int x2)
    278 
    279     r0 = out
    280     r1 = pin
    281     r2 = stride
    282     r3 = gptr
    283     r4 = sp, ct
    284     r5 = sp+4, x1
    285     r6 = sp+8, x2
    286 */
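/*
   For reference, a rough C model of the vertical pass (the code below does
   two pixels per iteration, keeping two float4 accumulators in q10/q11).
   Names are illustrative.

       static void blur_vert_f_u4(float *out,            // 4 floats per pixel
                                  const unsigned char *ptrIn, int iStride,
                                  const float *gPtr, int ct, int x1, int x2)
       {
           for (; x1 < x2; x1 += 2) {
               float acc[8] = { 0 };                     // two RGBA pixels
               const unsigned char *pi = ptrIn + x1 * 4;
               for (int r = 0; r < ct; r++) {
                   float g = gPtr[r];
                   for (int i = 0; i < 8; i++)
                       acc[i] += pi[i] * g;
                   pi += iStride;
               }
               for (int i = 0; i < 8; i++)
                   *out++ = acc[i];
           }
       }
*/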
    287 ENTRY(rsdIntrinsicBlurVFU4_K)
    288         push            {r4-r8, r10, r11, lr}
    289         vpush           {q4-q7}
    290 
    291         ldr r4, [sp, #32+64]
    292         ldr r5, [sp, #32+64 + 4]
    293         ldr r6, [sp, #32+64 + 8]
    294 
    295 1:
     296         veor q10, q10, q10         /* float4 blurredPixel0 = 0; */
     297         veor q11, q11, q11         /* float4 blurredPixel1 = 0; */
    298         add r7, r1, r5, lsl #2  /* const uchar *pi = ptrIn + x1 * 4; */
    299         mov r10, r3
    300 
    301         mov r11, r4
    302 
    303 2:
    304         vld1.32 {d2}, [r7]
    305         vmovl.u8 q1, d2
    306         vmovl.u16 q3, d2
    307         vmovl.u16 q4, d3
    308         vcvt.f32.s32 q3, q3
    309         vcvt.f32.s32 q4, q4
    310         vld1.32 {d0[0]}, [r10]!
    311         add r7, r7, r2
    312         vmla.f32 q10, q3, d0[0]
    313         vmla.f32 q11, q4, d0[0]
    314         subs r11, r11, #1
    315         bne 2b
    316 
    317         vst1.32 {q10}, [r0]!
    318         vst1.32 {q11}, [r0]!
    319         add r5, r5, #2
    320         cmp r5, r6
    321         bne 1b
    322 
    323 
    324         vpop            {q4-q7}
    325         pop             {r4-r8, r10, r11, lr}
    326         bx              lr
    327 END(rsdIntrinsicBlurVFU4_K)
    328 
    329 /*
     330 Horizontal blur pass: it reads the float4 intermediate produced by the
     331 vertical pass above, applies the gaussian weights in gPtr and writes uchar4.
    332 
    333     r0 = out
    334     r1 = pin
    335     r2 = gptr
    336     r3 = ct
    337     r4 = sp, x1
    338     r5 = sp+4, x2
    339 */
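/*
   For reference, a rough C model of the horizontal pass.  The code below
   consumes one weight before the inner loop and two per inner iteration, so
   ct is expected to be odd; the final conversion uses vcvt/vmovn (truncate
   and keep the low byte), i.e. it assumes the result already fits in 0..255.
   Names are illustrative.

       static void blur_horz_f_u4(unsigned char *out,
                                  const float *ptrIn,     // 4 floats per pixel
                                  const float *gPtr, int ct, int x1, int x2)
       {
           for (; x1 < x2; x1++) {
               const float *pi = ptrIn + x1 * 4;
               float acc[4] = { 0 };
               for (int r = 0; r < ct; r++)
                   for (int c = 0; c < 4; c++)
                       acc[c] += pi[r * 4 + c] * gPtr[r];
               for (int c = 0; c < 4; c++)
                   out[c] = (unsigned char)(int)acc[c];  // no saturation here
               out += 4;
           }
       }
*/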
    340 ENTRY(rsdIntrinsicBlurHFU4_K)
    341         push            {r4-r8, r10, r11, lr}
    342         vpush           {q4-q7}
    343 
    344         ldr r4, [sp, #32+64]
    345         ldr r5, [sp, #32+64 + 4]
    346 
    347 1:
     348         add r7, r1, r4, lsl #4  /* const float4 *pi = ptrIn + x1;  (x1 * 16 bytes) */
    349         mov r10, r2
    350         mov r11, r3
    351 
    352         vld1.32 {q1}, [r7]!
    353         vld1.32 {d6[0]}, [r10]!
    354         vmul.f32 q0, q1, d6[0]
    355         sub r11, r11, #1
    356 
    357 2:
    358         vld1.32 {q1}, [r7]!
    359         vld1.32 {q2}, [r7]!
    360         vld1.32 {d6}, [r10]!
    361         vmla.f32 q0, q1, d6[0]
    362         vmla.f32 q0, q2, d6[1]
    363         subs r11, r11, #2
    364         bne 2b
    365 
    366         vcvt.s32.f32 q0, q0
    367         vmovn.u32 d0, q0
    368         vmovn.u16 d0, q0
    369 
    370         vst1.32 {d0[0]}, [r0]!
    371         add r4, r4, #1
    372         cmp r4, r5
    373         bne 1b
    374 
    375         vpop            {q4-q7}
    376         pop             {r4-r8, r10, r11, lr}
    377         bx              lr
    378 END(rsdIntrinsicBlurHFU4_K)
    379 
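/*
   rsdIntrinsicBlurHFU1_K: single-channel variant of the horizontal pass
   above.  It slides a window of four consecutive floats per tap, so each
   outer iteration produces four single-channel output bytes.  The argument
   layout below is inferred from the stack offsets used in the code (it
   matches the U4 variant):

       r0 = out
       r1 = pin
       r2 = gptr
       r3 = ct
       sp = x1
       sp+4 = x2
*/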
    380 ENTRY(rsdIntrinsicBlurHFU1_K)
    381         push            {r4-r8, r10, r11, lr}
    382         vpush           {q4-q7}
    383 
    384         ldr r4, [sp, #32+64]
    385         ldr r5, [sp, #32+64 + 4]
    386 
    387 1:
     388         add r7, r1, r4, lsl #2  /* const float *pi = ptrIn + x1;  (x1 * 4 bytes) */
    389         mov r10, r2
    390         mov r11, r3
    391 
    392         veor q0, q0
    393 
    394 2:
    395         vld1.32 {q1}, [r7]
    396         add r7, r7, #4
    397         vld1.32 {d4[0]}, [r10]!
    398         vmla.f32 q0, q1, d4[0]
    399         subs r11, r11, #1
    400         bne 2b
    401 
    402         vcvt.s32.f32 q0, q0
    403         vmovn.u32 d0, q0
    404         vmovn.u16 d0, q0
    405 
    406         vst1.32 {d0[0]}, [r0]!
    407         add r4, r4, #4
    408         cmp r4, r5
    409         bne 1b
    410 
    411         vpop            {q4-q7}
    412         pop             {r4-r8, r10, r11, lr}
    413         bx              lr
    414 END(rsdIntrinsicBlurHFU1_K)
    415 
    416 /*
    417     Function called with the following arguments: dst, Y, vu, len, YuvCoeff
    418         r0 = dst
    419         r1 = Y
    420         r2 = VU
    421         r3 = length (pixels / 8)
    422         ---- Args below will be in the stack ----
    423         sp = YuvCoeff
    424 
    425         This function converts 8 pixels per iteration
    426 */
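/*
   For reference, a rough C model of the per-pixel conversion using the
   coefficient values quoted in the comments below (298, 409, -100, -208,
   516, offsets 16 and 128).  The actual values come from the YuvCoeff table
   passed on the stack; names here are illustrative.

       static unsigned char clamp_u8(int v)
       {
           return v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v;
       }

       static void yuv_to_rgba(unsigned char out[4], int y, int u, int v)
       {
           int y298 = (y - 16) * 298;
           out[0] = clamp_u8((y298 + 409 * (v - 128) + 128) >> 8);  // R
           out[1] = clamp_u8((y298 - 208 * (v - 128)
                                   - 100 * (u - 128) + 128) >> 8);  // G
           out[2] = clamp_u8((y298 + 516 * (u - 128) + 128) >> 8);  // B
           out[3] = 255;                                            // A
       }
*/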
    427 ENTRY(rsdIntrinsicYuv_K)
    428         push        {r4, r5, lr}            @ preserve clobbered int registers
    429         vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
    430 
    431         mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
    432 
    433         ldr         r4, [sp, #64+12]        @ load the coeffs address in memory in r4 (16*4 + 4*3)
    434         vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
    435         vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
    436         vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
    437 
    438         mov         r4, #8                  @ Integer 8 in r4; used as an incrementing value
    439 
    440         vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
    441                                             @ the coeffs matrix (Q2)
    442 
    443         1:
    444         vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
    445         vld2.8      {d12, d14}, [r2], r4    @ split V from U (r2 -> VU) and increase pointer by 8 (in r4)
    446         pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
    447         pld         [r2, #64]               @ preloading data from address vu(r2) + 64 for subsequent loops
    448 
    449         vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
    450         vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
    451         vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
    452 
    453         vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
    454         vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
    455         vmov.u16    d11, d10                @ Copying V to d11
    456         vmov.u16    d13, d12                @ Copying U to d13
     457         vzip.u16    d10, d11                @ Q5 = V (n, n, n+1, n+1, n+2, n+2, n+3, n+3)
     458         vzip.u16    d12, d13                @ Q6 = U (n, n, n+1, n+1, n+2, n+2, n+3, n+3)
    459 
    460 
    461         vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
    462         vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
     463         vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
     464         vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
    465 
    466                                             @                  R    G    B
    467                                             @     Pixel(0-3)  Q8,  Q9, Q10
    468                                             @     Pixel(4-7) Q11, Q12, Q13
    469                                             @
    470 
    471                                             @ Pixel(0-3)
    472         vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
    473         vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
    474         vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
    475         vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
    476 
    477                                             @ Pixel(4-7)
    478         vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
    479         vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
    480         vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
    481         vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
    482 
    483                                             @ Pixel(0-3)
    484         vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
    485         vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
    486         vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
    487 
    488                                             @ Pixel(4-7)
    489         vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
    490         vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
    491         vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
    492 
    493         vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
    494         vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
    495         vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
    496 
    497         subs        r3, r3, #1              @ Checking length (r3)
    498         vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
    499 
    500         bne 1b                              @ if not done with length, loop
    501 
    502         vpop        {Q4-Q7}                 @ Restore Vregisters
    503         pop         {r4, r5, lr}            @ Restore int registers
    504         bx          lr
    505 END(rsdIntrinsicYuv_K)
    506 
    507 /*
    508     Function called with the following arguments: dst, Y, v, u, len, YuvCoeff
    509         r0 = dst
    510         r1 = Y
    511         r2 = V,
    512         r3 = U
    513         ---- Args below will be in the stack ----
    514         sp = length (pixels / 8)
    515         sp+4 = YuvCoeff
    516 
    517         This function converts 8 pixels per iteration
    518 */
    519 ENTRY(rsdIntrinsicYuv2_K)
    520         push        {r4, r5, r6, lr}        @ preserve clobbered int registers
    521         vpush       {Q4-Q7}                 @ preserve Vregisters we clobber
    522 
    523         mov  r5, #16                        @ Integer 16 in r5; used as an incrementing value
    524 
    525         ldr         r4, [sp, #64+16+4]      @ load the coeffs address in memory in r4 (16*4 + 4*4 + 4)
    526         ldr         r6, [sp, #64+16]        @ load the length in r6 (16*4 + 4*4)
    527         vld1.16     {Q2}, [r4]!             @ load the multipliers from the coeffs matrix (r4) in Q2
    528         vld1.8      {d6[]}, [r4], r5        @ load y offset 16 from the coeffs matrix (r4) in d6
    529         vld1.8      {d8[]}, [r4], r5        @ load V and U offset of 128 from the coeffs matrix (r4) in d8
    530 
     531         mov         r4, #4                  @ Integer 4 in r4; used as an incrementing value
    532 
    533         vdup.8      d3, d5[1]               @ d3 = 255 (alpha) from the multipliers line in
    534                                             @ the coeffs matrix (Q2)
    535 
    536         1:
    537         vld1.8      {d10}, [r1]!            @ get Y (r1->Y)
     538         vld1.8      {d12}, [r3], r4         @ load 8 chroma bytes from r3 and increase the pointer by 4 (in r4)
     539         vld1.8      {d14}, [r2], r4         @ load 8 chroma bytes from r2 and increase the pointer by 4 (in r4)
    540         pld         [r1, #64]               @ preloading data from address y(r1) + 64 for subsequent loops
     541         pld         [r2, #64]               @ preloading data from address r2 + 64 for subsequent loops
    542 
    543         vsubl.u8    Q5, d10, d6             @ Y to 16 bit - 16 (in 16bit) (n to n+7)
    544         vmull.s16   Q8, d10, d4[0]          @ Y(n,n+1,n+2,n+3) * 298 = Q8 (to 32bit)
    545         vmull.s16   Q11, d11, d4[0]         @ Y(n+4,n+5,n+6,n+7) * 298 = Q11 (to 32bit)
    546 
    547         vsubl.u8    Q5, d12, d8             @ V to 16 bit - 128 = Q5 // V(n, n+1, n+2,n+3)
    548         vsubl.u8    Q6, d14, d8             @ U to 16 bit - 128 = Q6 // U(n, n+1, n+2,n+3)
    549         vmov.u16    d11, d10                @ Copying V to d11
    550         vmov.u16    d13, d12                @ Copying U to d13
     551         vzip.u16    d10, d11                @ Q5 = V (n, n, n+1, n+1, n+2, n+2, n+3, n+3)
     552         vzip.u16    d12, d13                @ Q6 = U (n, n, n+1, n+1, n+2, n+2, n+3, n+3)
    553 
    554 
    555         vmov        Q9, Q8                  @ Copy Q8(Y: n, n+1, n+2, n+3) to Q9
    556         vmov        Q10, Q8                 @ Copy Q8(Y: n, n+1, n+2, n+3) to Q10
     557         vmov        Q12, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q12
     558         vmov        Q13, Q11                @ Copy Q11(Y: n+4, n+5, n+6, n+7) to Q13
    559 
    560                                             @                  R    G    B
    561                                             @     Pixel(0-3)  Q8,  Q9, Q10
    562                                             @     Pixel(4-7) Q11, Q12, Q13
    563                                             @
    564 
    565                                             @ Pixel(0-3)
    566         vmlal.s16   Q8,  d10, d4[1]         @ R : Q8  = Q8(Y-16)  + (V-128) * 409
    567         vmlal.s16   Q9,  d10, d5[0]         @ G : Q9  = Q9(Y-16)  + (V-128) * (-208)
    568         vmlal.s16   Q9,  d12, d4[2]         @                     + (U-128) * (-100)
    569         vmlal.s16   Q10, d12, d4[3]         @ B : Q10 = Q10(Y-16) + (U-128) * 516
    570 
    571                                             @ Pixel(4-7)
    572         vmlal.s16   Q11, d11, d4[1]         @ R : Q11 = Q11(Y-16) + (V-128) * 409
    573         vmlal.s16   Q12, d11, d5[0]         @ G : Q12 = Q12(Y-16) + (V-128) * (-208)
    574         vmlal.s16   Q12, d13, d4[2]         @                     + (U-128) * (-100)
    575         vmlal.s16   Q13, d13, d4[3]         @ B : Q13 = Q13(Y-16) + (U-128) * 516
    576 
    577                                             @ Pixel(0-3)
    578         vrshrn.i32  d16, Q8, #8             @ d16 : R shifted right by 8 rounded'n narrowed to 16bit
    579         vrshrn.i32  d18, Q9, #8             @ d18 : G shifted right by 8 rounded'n narrowed to 16bit
    580         vrshrn.i32  d20, Q10, #8            @ d20 : B shifted right by 8 rounded'n narrowed to 16bit
    581 
    582                                             @ Pixel(4-7)
    583         vrshrn.i32  d17, Q11, #8            @ d17 : R shifted right by 8 rounded'n narrowed to 16bit
    584         vrshrn.i32  d19, Q12, #8            @ d19 : G shifted right by 8 rounded'n narrowed to 16bit
    585         vrshrn.i32  d21, Q13, #8            @ d21 : B shifted right by 8 rounded'n narrowed to 16bit
    586 
    587         vqmovun.s16 d0, Q8                  @ r = d0 (saturated, unsigned and narrowed to 8bit)
    588         vqmovun.s16 d1, Q9                  @ g = d1 (saturated, unsigned and narrowed to 8bit)
    589         vqmovun.s16 d2, Q10                 @ b = d2 (saturated, unsigned and narrowed to 8bit)
    590 
    591         subs        r6, r6, #1              @ Checking length (r6)
    592         vst4.8      {d0, d1, d2, d3}, [r0]! @ Writing out 8 RGBA values to dst (r0)
    593 
    594         bne 1b                              @ if not done with length, loop
    595 
    596         vpop        {Q4-Q7}                 @ Restore Vregisters
    597         pop         {r4, r5, r6, lr}        @ Restore int registers
    598         bx          lr
    599 END(rsdIntrinsicYuv2_K)
    600 
    601 /* Convolve 5x5 */
    602 
    603 /*
    604         r0 = dst
    605         r1 = y0 base pointer
    606         r2 = y1 base pointer
    607         r3 = y2 base pointer
    608         r4 = y3 base pointer
    609         r5 = y4 base pointer
    610         r6 = coeffs
    611         r7 = length
    612 */
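/*
   For reference, a rough C model of one output pixel (the code below does
   two per iteration).  coeff[] is the 25-entry signed 16-bit kernel in load
   order; the 0x7f bias (q15) plus the rounding shift reproduce the rounding
   used by the assembly.  Names are illustrative.

       static void convolve5x5_one_pixel(unsigned char out[4],
                                         const unsigned char *rows[5],
                                         int px, const short coeff[25])
       {
           for (int c = 0; c < 4; c++) {
               int acc = 0;
               for (int r = 0; r < 5; r++)
                   for (int k = 0; k < 5; k++)
                       acc += rows[r][(px + k) * 4 + c] * coeff[r * 5 + k];
               acc += 0x7f;                 // bias added via q15
               acc = (acc + 0x80) >> 8;     // vrshrn: rounding shift by 8
               out[c] = acc < 0 ? 0 : acc > 255 ? 255 : (unsigned char)acc;
           }
       }
*/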
    613 ENTRY(rsdIntrinsicConvolve5x5_K)
    614         push        {r4-r7, lr}
    615         vpush       {q4-q7}
    616 
    617         /* load y3 in r4 */
    618         ldr     r4, [sp, #20 + 64]
    619 
    620         /* load y4 in r5 */
    621         ldr     r5, [sp, #24 + 64]
    622 
    623         /* Load the coefficients pointer */
    624         ldr     r6, [sp, #28 + 64]
    625 
    626         /* Create the coefficients vector */
    627         vld1.16     {d0, d1, d2, d3}, [r6]!
    628         vld1.16     {d4, d5, d6}, [r6]
    629 
    630         vmov.u32  q15, #0x7f
    631 
    632         /* load the count */
    633         ldr     r6, [sp, #32 + 64]
    634 
    635         /* Load the frequently used immediate in a register */
    636         mov     r7, #8
    637 
    638 1:
     639         /* Load from the y base pointers into d-regs and post-increment the addresses by r7=#8 */
    640         vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
    641         vld1.8  {d27, d28, d29}, [r2], r7      @  y0 ( y - 1 )
    642 
     643         /* Prefetch the data that will be needed in the loop iteration after the next */
    644         PLD         (r1, r7)
    645         PLD         (r2, r7)
    646 
    647         /* Promoting the 8bit channels to 16bit */
    648         vmovl.u8 q9,  d24
    649         vmovl.u8 q10, d25
    650         vmovl.u8 q11, d26
    651         vmovl.u8 q12, d27
    652         vmovl.u8 q13, d28
    653         vmovl.u8 q14, d29
    654 
    655 /*
    656         d18,  d19,  d20, d21, d22, d23,
    657         d24,  d25
    658 */
    659         vmull.s16 q4, d18, d0[0]
    660         vmlal.s16 q4, d19, d0[1]
    661         vmlal.s16 q4, d20, d0[2]
    662         vmlal.s16 q4, d21, d0[3]
    663         vmlal.s16 q4, d22, d1[0]
    664 
    665         vmlal.s16 q4, d24, d1[1]
    666         vmlal.s16 q4, d25, d1[2]
    667         vmlal.s16 q4, d26, d1[3]
    668         vmlal.s16 q4, d27, d2[0]
    669         vmlal.s16 q4, d28, d2[1]
    670 
    671         vmull.s16 q5, d19, d0[0]
    672         vmlal.s16 q5, d20, d0[1]
    673         vmlal.s16 q5, d21, d0[2]
    674         vmlal.s16 q5, d22, d0[3]
    675         vmlal.s16 q5, d23, d1[0]
    676 
    677         vmlal.s16 q5, d25, d1[1]
    678         vmlal.s16 q5, d26, d1[2]
    679         vmlal.s16 q5, d27, d1[3]
    680         vmlal.s16 q5, d28, d2[0]
    681         vmlal.s16 q5, d29, d2[1]
    682 
    683 
    684         /* Next 2 rows */
     685         /* Load from the y base pointers into d-regs and post-increment the addresses by r7=#8 */
    686         vld1.8  {d24, d25, d26}, [r3], r7      @  y0 ( y )
    687         vld1.8  {d27, d28, d29}, [r4], r7      @  y0 ( y + 1 )
    688 
     689         /* Prefetch the data that will be needed in the loop iteration after the next */
    690         PLD         (r3, r7)
    691         PLD         (r4, r7)
    692 
    693         /* Promoting the 8bit channels to 16bit */
    694         vmovl.u8 q9,  d24
    695         vmovl.u8 q10, d25
    696         vmovl.u8 q11, d26
    697         vmovl.u8 q12, d27
    698         vmovl.u8 q13, d28
    699         vmovl.u8 q14, d29
    700 
    701 /*
    702         d18,  d19,  d20, d21, d22, d23,
    703         d24,  d25
    704 */
    705         vmlal.s16 q4, d18, d2[2]
    706         vmlal.s16 q4, d19, d2[3]
    707         vmlal.s16 q4, d20, d3[0]
    708         vmlal.s16 q4, d21, d3[1]
    709         vmlal.s16 q4, d22, d3[2]
    710 
    711         vmlal.s16 q4, d24, d3[3]
    712         vmlal.s16 q4, d25, d4[0]
    713         vmlal.s16 q4, d26, d4[1]
    714         vmlal.s16 q4, d27, d4[2]
    715         vmlal.s16 q4, d28, d4[3]
    716 
    717         vmlal.s16 q5, d19, d2[2]
    718         vmlal.s16 q5, d20, d2[3]
    719         vmlal.s16 q5, d21, d3[0]
    720         vmlal.s16 q5, d22, d3[1]
    721         vmlal.s16 q5, d23, d3[2]
    722 
    723         vmlal.s16 q5, d25, d3[3]
    724         vmlal.s16 q5, d26, d4[0]
    725         vmlal.s16 q5, d27, d4[1]
    726         vmlal.s16 q5, d28, d4[2]
    727         vmlal.s16 q5, d29, d4[3]
    728 
    729         /* Last row */
     730         /* Load from the y base pointer into d-regs and post-increment the address by r7=#8 */
    731         vld1.8  {d24, d25, d26}, [r5], r7      @  y0 ( y + 2 )
    732 
     733         /* Prefetch the data that will be needed in the loop iteration after the next */
    734         PLD         (r5, r7)
    735 
    736         /* Promoting the 8bit channels to 16bit */
    737         vmovl.u8 q9,  d24
    738         vmovl.u8 q10, d25
    739         vmovl.u8 q11, d26
    740 
    741 /*
    742         d18,  d19,  d20, d21, d22, d23,
    743         d24,  d25
    744 */
    745 
    746         vmlal.s16 q4, d18, d5[0]
    747         vmlal.s16 q4, d19, d5[1]
    748         vmlal.s16 q4, d20, d5[2]
    749         vmlal.s16 q4, d21, d5[3]
    750         vmlal.s16 q4, d22, d6[0]
    751 
    752         vmlal.s16 q5, d19, d5[0]
    753         vmlal.s16 q5, d20, d5[1]
    754         vmlal.s16 q5, d21, d5[2]
    755         vmlal.s16 q5, d22, d5[3]
    756         vmlal.s16 q5, d23, d6[0]
    757 
    758 
    759 
    760         vadd.i32 q4, q4, q15
    761         vadd.i32 q5, q5, q15
    762 
    763 /*      Narrow it to a d-reg 32 -> 16 bit */
    764         vrshrn.i32 d8, q4, #8
    765         vrshrn.i32 d9, q5, #8
    766 
    767 
    768 /*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
    769         vqmovun.s16 d8, q4
    770 
     771         vst1.8 d8, [r0]!           @ store the two output pixels and advance r0
    772 
    773         /* Are we done? */
    774         subs r6, r6, #1
    775         bne 1b
    776 
    777         /* Yup, bye */
    778         vpop        {q4-q7}
    779         pop         {r4-r7, lr}
    780         bx          lr
    781 
    782 END(rsdIntrinsicConvolve5x5_K)
    783 
    784 
    785 
    786 
    787 /*
    788         dst = src + dst * (1.0 - src.a)
    789 
    790         r0 = dst
    791         r1 = src
    792         r2 = length
    793 */
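/*
   For reference, a rough C model of the per-channel math (8.8 fixed point,
   as in the code below): out = (src * 256 + dst * (255 - src.a)) >> 8.  The
   16-bit accumulation relies on premultiplied-alpha inputs staying in range,
   and the final vshrn truncates rather than saturates.  Names illustrative.

       static void blend_src_over(unsigned char dst[4], const unsigned char src[4])
       {
           int na = 255 - src[3];
           for (int c = 0; c < 4; c++)
               dst[c] = (unsigned char)(((src[c] << 8) + dst[c] * na) >> 8);
       }
*/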
    794 ENTRY(rsdIntrinsicBlendSrcOver_K)
    795         .save           {r4, lr}
    796         stmfd           sp!, {r4, lr}
    797         vpush           {q4-q7}
    798 
    799         mov r4, #255
    800         vdup.16 q7, r4
    801 
    802         mov r4, r0
    803 1:
    804 
    805         /* src */
    806         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    807         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    808         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    809         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    810         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
    811         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
    812         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
    813         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
    814         vshll.u8 q12, d0, #8
    815         vshll.u8 q13, d1, #8
    816         vshll.u8 q14, d2, #8
    817         vmovl.u8 q6, d3
    818         vsub.i16 q6, q7, q6        // q6 = 1 - src.a
    819         vshll.u8 q15, d3, #8
    820 
    821         /* dst */
    822         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    823         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    824         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    825         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    826         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
    827         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
    828         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
    829         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
    830         vmovl.u8 q8, d0
    831         vmovl.u8 q9, d1
    832         vmovl.u8 q10, d2
    833         vmovl.u8 q11, d3
    834 
    835         vmla.i16 q12, q8, q6
    836         vmla.i16 q13, q9, q6
    837         vmla.i16 q14, q10, q6
    838         vmla.i16 q15, q11, q6
    839 
    840         vshrn.i16 d0, q12, #8
    841         vshrn.i16 d1, q13, #8
    842         vshrn.i16 d2, q14, #8
    843         vshrn.i16 d3, q15, #8
    844         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
    845         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
    846         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
    847         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
    848         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
    849         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
    850         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
    851         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
    852 
    853         subs r2, r2, #1
    854         bne 1b
    855 
    856         vpop            {q4-q7}
    857         ldmfd           sp!, {r4, lr}
    858         bx              lr
    859 END(rsdIntrinsicBlendSrcOver_K)
    860 
    861 /*
    862         dst = dst + src * (1.0 - dst.a)
    863 
    864         r0 = dst
    865         r1 = src
    866         r2 = length
    867 */
    868 ENTRY(rsdIntrinsicBlendDstOver_K)
    869         .save           {r4, lr}
    870         stmfd           sp!, {r4, lr}
    871         vpush           {q4-q7}
    872 
    873         mov r4, #255
    874         vdup.16 q7, r4
    875 
    876         mov r4, r0
    877 1:
    878 
    879         /* src */
    880         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    881         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    882         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    883         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    884         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
    885         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
    886         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
    887         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
    888         vmovl.u8 q12, d0
    889         vmovl.u8 q13, d1
    890         vmovl.u8 q14, d2
    891         vmovl.u8 q15, d3
    892 
    893         /* dst */
    894         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    895         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    896         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    897         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    898         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
    899         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
    900         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
    901         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
    902         vshll.u8 q8, d0, #8
    903         vshll.u8 q9, d1, #8
    904         vshll.u8 q10, d2, #8
    905         vmovl.u8 q6, d3
    906         vsub.i16 q6, q7, q6        // q6 = 1 - dst.a
    907         vshll.u8 q11, d3, #8
    908 
    909 
    910         vmla.i16 q8, q12, q6
    911         vmla.i16 q9, q13, q6
    912         vmla.i16 q10, q14, q6
    913         vmla.i16 q11, q15, q6
    914 
    915         vshrn.i16 d0, q8, #8
    916         vshrn.i16 d1, q9, #8
    917         vshrn.i16 d2, q10, #8
    918         vshrn.i16 d3, q11, #8
    919         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
    920         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
    921         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
    922         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
    923         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
    924         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
    925         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
    926         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
    927 
    928         subs r2, r2, #1
    929         bne 1b
    930 
    931         vpop            {q4-q7}
    932         ldmfd           sp!, {r4, lr}
    933         bx              lr
    934 END(rsdIntrinsicBlendDstOver_K)
    935 
    936 /*
    937         dst = src * dst.a
    938 
    939         r0 = dst
    940         r1 = src
    941         r2 = length
    942 */
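/*
   For reference, a rough C model: every source channel (alpha included) is
   scaled by the destination alpha and renormalised with a >> 8.  The other
   "in"/"out" blends below follow the same multiply-and-shift shape with the
   operands swapped or complemented.  Names are illustrative.

       static void blend_src_in(unsigned char dst[4], const unsigned char src[4])
       {
           int da = dst[3];
           for (int c = 0; c < 4; c++)
               dst[c] = (unsigned char)((src[c] * da) >> 8);
       }
*/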
    943 ENTRY(rsdIntrinsicBlendSrcIn_K)
    944         .save           {r4, lr}
    945         stmfd           sp!, {r4, lr}
    946         vpush           {q4-q7}
    947 
    948         mov r4, r0
    949 1:
    950 
    951         /* src */
    952         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
    953         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
    954         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
    955         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
    956         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
    957         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
    958         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
    959         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
    960         vmovl.u8 q12, d0
    961         vmovl.u8 q13, d1
    962         vmovl.u8 q14, d2
    963         vmovl.u8 q15, d3
    964 
    965         /* dst */
    966         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
    967         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
    968         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
    969         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
    970         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
    971         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
    972         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
    973         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
    974         //vmovl.u8 q8, d0
    975         //vmovl.u8 q9, d1
    976         //vmovl.u8 q10, d2
    977         vmovl.u8 q11, d3
    978 
    979         vmul.i16 q12, q12, q11
    980         vmul.i16 q13, q13, q11
    981         vmul.i16 q14, q14, q11
    982         vmul.i16 q15, q15, q11
    983 
    984         vshrn.i16 d0, q12, #8
    985         vshrn.i16 d1, q13, #8
    986         vshrn.i16 d2, q14, #8
    987         vshrn.i16 d3, q15, #8
    988         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
    989         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
    990         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
    991         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
    992         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
    993         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
    994         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
    995         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
    996 
    997         subs r2, r2, #1
    998         bne 1b
    999 
   1000         vpop            {q4-q7}
   1001         ldmfd           sp!, {r4, lr}
   1002         bx              lr
   1003 END(rsdIntrinsicBlendSrcIn_K)
   1004 
   1005 /*
   1006         dst = dst * src.a
   1007 
   1008         r0 = dst
   1009         r1 = src
   1010         r2 = length
   1011 */
   1012 ENTRY(rsdIntrinsicBlendDstIn_K)
   1013         .save           {r4, lr}
   1014         stmfd           sp!, {r4, lr}
   1015         vpush           {q4-q7}
   1016 
   1017         mov r4, r0
   1018 1:
   1019 
   1020         /* src */
   1021         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1022         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1023         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1024         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1025         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1026         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1027         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1028         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1029         //vmovl.u8 q12, d0
   1030         //vmovl.u8 q13, d1
   1031         //vmovl.u8 q14, d2
   1032         vmovl.u8 q15, d3
   1033 
   1034         /* dst */
   1035         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1036         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1037         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1038         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1039         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1040         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1041         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1042         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1043         vmovl.u8 q8, d0
   1044         vmovl.u8 q9, d1
   1045         vmovl.u8 q10, d2
   1046         vmovl.u8 q11, d3
   1047 
   1048         vmul.i16 q8, q8, q15
   1049         vmul.i16 q9, q9, q15
   1050         vmul.i16 q10, q10, q15
   1051         vmul.i16 q11, q11, q15
   1052 
   1053         vshrn.i16 d0, q8, #8
   1054         vshrn.i16 d1, q9, #8
   1055         vshrn.i16 d2, q10, #8
   1056         vshrn.i16 d3, q11, #8
   1057         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1058         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1059         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1060         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1061         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1062         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1063         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1064         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1065 
   1066         subs r2, r2, #1
   1067         bne 1b
   1068 
   1069         vpop            {q4-q7}
   1070         ldmfd           sp!, {r4, lr}
   1071         bx              lr
   1072 END(rsdIntrinsicBlendDstIn_K)
   1073 
   1074 
   1075 
   1076 /*
   1077         dst = src * (1.0 - dst.a)
   1078 
   1079         r0 = dst
   1080         r1 = src
   1081         r2 = length
   1082 */
   1083 ENTRY(rsdIntrinsicBlendSrcOut_K)
   1084         .save           {r4, lr}
   1085         stmfd           sp!, {r4, lr}
   1086         vpush           {q4-q7}
   1087 
   1088         mov r4, #255
   1089         vdup.16 q7, r4
   1090 
   1091         mov r4, r0
   1092 1:
   1093 
   1094         /* src */
   1095         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1096         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1097         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1098         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1099         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1100         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1101         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1102         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1103         vmovl.u8 q12, d0
   1104         vmovl.u8 q13, d1
   1105         vmovl.u8 q14, d2
   1106         vmovl.u8 q15, d3
   1107 
   1108         /* dst */
   1109         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1110         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1111         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1112         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1113         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1114         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1115         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1116         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1117         //vmovl.u8 q8, d0
   1118         //vmovl.u8 q9, d1
   1119         //vmovl.u8 q10, d2
   1120         vmovl.u8 q11, d3
   1121 
   1122 
   1123         vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
   1124         vmul.i16 q12, q12, q6
   1125         vmul.i16 q13, q13, q6
   1126         vmul.i16 q14, q14, q6
   1127         vmul.i16 q15, q15, q6
   1128 
   1129         vshrn.i16 d0, q12, #8
   1130         vshrn.i16 d1, q13, #8
   1131         vshrn.i16 d2, q14, #8
   1132         vshrn.i16 d3, q15, #8
   1133         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1134         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1135         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1136         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1137         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1138         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1139         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1140         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1141 
   1142         subs r2, r2, #1
   1143         bne 1b
   1144 
   1145         vpop            {q4-q7}
   1146         ldmfd           sp!, {r4, lr}
   1147         bx              lr
   1148 END(rsdIntrinsicBlendSrcOut_K)
   1149 
   1150 
   1151 /*
   1152         dst = dst * (1.0 - src.a)
   1153 
   1154         r0 = dst
   1155         r1 = src
   1156         r2 = length
   1157 */
   1158 ENTRY(rsdIntrinsicBlendDstOut_K)
   1159         .save           {r4, lr}
   1160         stmfd           sp!, {r4, lr}
   1161         vpush           {q4-q7}
   1162 
   1163         mov r4, #255
   1164         vdup.16 q7, r4
   1165 
   1166         mov r4, r0
   1167 1:
   1168 
   1169         /* src */
   1170         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1171         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1172         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1173         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1174         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1175         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1176         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1177         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1178         //vmovl.u8 q12, d0
   1179         //vmovl.u8 q13, d1
   1180         //vmovl.u8 q14, d2
   1181         vmovl.u8 q15, d3
   1182 
   1183         /* dst */
   1184         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1185         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1186         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1187         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1188         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1189         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1190         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1191         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1192         vmovl.u8 q8, d0
   1193         vmovl.u8 q9, d1
   1194         vmovl.u8 q10, d2
   1195         vmovl.u8 q11, d3
   1196 
   1197 
   1198         vsub.i16 q6, q7, q15        // q6 = 1 - src.a
   1199         vmul.i16 q12, q8, q6
   1200         vmul.i16 q13, q9, q6
   1201         vmul.i16 q14, q10, q6
   1202         vmul.i16 q15, q11, q6
   1203 
   1204         vshrn.i16 d0, q12, #8
   1205         vshrn.i16 d1, q13, #8
   1206         vshrn.i16 d2, q14, #8
   1207         vshrn.i16 d3, q15, #8
   1208         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1209         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1210         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1211         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1212         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1213         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1214         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1215         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1216 
   1217         subs r2, r2, #1
   1218         bne 1b
   1219 
   1220         vpop            {q4-q7}
   1221         ldmfd           sp!, {r4, lr}
   1222         bx              lr
   1223 END(rsdIntrinsicBlendDstOut_K)
   1224 
   1225 
   1226 /*
   1227         dst.rgb = src.rgb * dst.a + (1.0 - src.a) * dst.rgb
   1228         dst.a = dst.a
   1229 
   1230         r0 = dst
   1231         r1 = src
   1232         r2 = length
   1233 */
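/*
   For reference, a rough C model (8.8 fixed point, as in the code below):
   rgb' = (src.rgb * dst.a + dst.rgb * (255 - src.a)) >> 8, alpha unchanged.
   Names are illustrative.

       static void blend_src_atop(unsigned char dst[4], const unsigned char src[4])
       {
           int da = dst[3], nsa = 255 - src[3];
           for (int c = 0; c < 3; c++)
               dst[c] = (unsigned char)((src[c] * da + dst[c] * nsa) >> 8);
           // dst[3] is left as the destination alpha
       }
*/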
   1234 ENTRY(rsdIntrinsicBlendSrcAtop_K)
   1235         .save           {r4, lr}
   1236         stmfd           sp!, {r4, lr}
   1237         vpush           {q4-q7}
   1238 
   1239         mov r4, #255
   1240         vdup.16 q7, r4
   1241 
   1242         mov r4, r0
   1243 1:
   1244 
   1245         /* src */
   1246         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1247         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1248         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1249         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1250         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1251         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1252         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1253         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1254         vmovl.u8 q12, d0
   1255         vmovl.u8 q13, d1
   1256         vmovl.u8 q14, d2
   1257         vmovl.u8 q15, d3
   1258 
   1259         /* dst */
   1260         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1261         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1262         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1263         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1264         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1265         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1266         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1267         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1268         vmovl.u8 q8, d0
   1269         vmovl.u8 q9, d1
   1270         vmovl.u8 q10, d2
   1271         vmovl.u8 q11, d3
   1272 
   1273 
   1274         vsub.i16 q6, q7, q15        // q6 = 1 - src.a
   1275         vmul.i16 q8, q8, q6
   1276         vmul.i16 q9, q9, q6
   1277         vmul.i16 q10, q10, q6
   1278 
   1279         vmla.i16 q8, q12, q11
   1280         vmla.i16 q9, q13, q11
   1281         vmla.i16 q10, q14, q11
   1282 
   1283 
   1284         vshrn.i16 d0, q8, #8
   1285         vshrn.i16 d1, q9, #8
   1286         vshrn.i16 d2, q10, #8
   1287         //vshrn.i16 d3, q15, #8
   1288         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1289         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1290         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1291         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1292         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1293         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1294         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1295         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1296 
   1297         subs r2, r2, #1
   1298         bne 1b
   1299 
   1300         vpop            {q4-q7}
   1301         ldmfd           sp!, {r4, lr}
   1302         bx              lr
   1303 END(rsdIntrinsicBlendSrcAtop_K)
   1304 
   1305 /*
   1306         dst = dst.rgb * src.a + (1.0 - dst.a) * src.rgb
   1307         dst.a = src.a
   1308 
   1309         r0 = dst
   1310         r1 = src
   1311         r2 = length
   1312 */
   1313 ENTRY(rsdIntrinsicBlendDstAtop_K)
   1314         .save           {r4, lr}
   1315         stmfd           sp!, {r4, lr}
   1316         vpush           {q4-q7}
   1317 
   1318         mov r4, #255
   1319         vdup.16 q7, r4
   1320 
   1321         mov r4, r0
   1322 1:
   1323 
   1324         /* src */
   1325         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1326         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1327         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1328         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1329         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1330         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1331         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1332         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1333         vmovl.u8 q12, d0
   1334         vmovl.u8 q13, d1
   1335         vmovl.u8 q14, d2
   1336         vmovl.u8 q15, d3
   1337 
   1338         /* dst */
   1339         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1340         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1341         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1342         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1343         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1344         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1345         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1346         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1347         vmovl.u8 q8, d0
   1348         vmovl.u8 q9, d1
   1349         vmovl.u8 q10, d2
   1350         vmovl.u8 q11, d3
   1351 
   1352 
   1353         vsub.i16 q6, q7, q11        // q6 = 1 - dst.a
   1354         vmul.i16 q12, q12, q6
   1355         vmul.i16 q13, q13, q6
   1356         vmul.i16 q14, q14, q6
   1357 
   1358         vmla.i16 q12, q8, q15
   1359         vmla.i16 q13, q9, q15
   1360         vmla.i16 q14, q10, q15
   1361 
   1362 
   1363         vshrn.i16 d0, q12, #8
   1364         vshrn.i16 d1, q13, #8
   1365         vshrn.i16 d2, q14, #8
   1366         //vshrn.i16 d3, q15, #8
   1367         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1368         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1369         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1370         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1371         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1372         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1373         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1374         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1375 
   1376         subs r2, r2, #1
   1377         bne 1b
   1378 
   1379         vpop            {q4-q7}
   1380         ldmfd           sp!, {r4, lr}
   1381         bx              lr
   1382 END(rsdIntrinsicBlendDstAtop_K)
   1383 
   1384 /*
   1385         dst = dst ^ src
   1386 
   1387         r0 = dst
   1388         r1 = src
   1389         r2 = length
   1390 */
   1391 ENTRY(rsdIntrinsicBlendXor_K)
   1392         .save           {r4, lr}
   1393         stmfd           sp!, {r4, lr}
   1394         vpush           {q4-q7}
   1395 
   1396         mov r4, #255
   1397         vdup.16 q7, r4
   1398 
   1399         mov r4, r0
   1400 1:
   1401 
   1402         /* src */
   1403         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1404         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1405         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1406         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1407         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1408         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1409         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1410         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1411         vmov.u8 d4, d0
   1412         vmov.u8 d5, d1
   1413         vmov.u8 d6, d2
   1414         vmov.u8 d7, d3
   1415 
   1416         /* dst */
   1417         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1418         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1419         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1420         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1421         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1422         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1423         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1424         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1425 
   1426         veor d0, d0, d4
   1427         veor d1, d1, d5
   1428         veor d2, d2, d6
   1429         veor d3, d3, d7
   1430 
   1431         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1432         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1433         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1434         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1435         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1436         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1437         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1438         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1439 
   1440         subs r2, r2, #1
   1441         bne 1b
   1442 
   1443         vpop            {q4-q7}
   1444         ldmfd           sp!, {r4, lr}
   1445         bx              lr
   1446 END(rsdIntrinsicBlendXor_K)
   1447 
   1448 /*
   1449         dst = dst * src
   1450 
   1451         r0 = dst
   1452         r1 = src
   1453         r2 = length
   1454 */
   1455 ENTRY(rsdIntrinsicBlendMultiply_K)
   1456         .save           {r4, lr}
   1457         stmfd           sp!, {r4, lr}
   1458         vpush           {q4-q7}
   1459 
   1460         mov r4, #255
   1461         vdup.16 q7, r4
   1462 
   1463         mov r4, r0
   1464 1:
   1465 
   1466         /* src */
   1467         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1468         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1469         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1470         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1471         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1472         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1473         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1474         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1475         vmovl.u8 q12, d0
   1476         vmovl.u8 q13, d1
   1477         vmovl.u8 q14, d2
   1478         vmovl.u8 q15, d3
   1479 
   1480         /* dst */
   1481         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1482         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1483         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1484         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1485         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1486         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1487         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1488         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1489         vmovl.u8 q8, d0
   1490         vmovl.u8 q9, d1
   1491         vmovl.u8 q10, d2
   1492         vmovl.u8 q11, d3
   1493 
   1494 
   1495         vmul.i16 q8, q8, q12
   1496         vmul.i16 q9, q9, q13
   1497         vmul.i16 q10, q10, q14
   1498         vmul.i16 q11, q11, q15
   1499 
   1500         vshrn.i16 d0, q8, #8
   1501         vshrn.i16 d1, q9, #8
   1502         vshrn.i16 d2, q10, #8
   1503         vshrn.i16 d3, q11, #8
   1504         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1505         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1506         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1507         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1508         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1509         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1510         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1511         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1512 
   1513         subs r2, r2, #1
   1514         bne 1b
   1515 
   1516         vpop            {q4-q7}
   1517         ldmfd           sp!, {r4, lr}
   1518         bx              lr
   1519 END(rsdIntrinsicBlendMultiply_K)
   1520 
   1521 /*
   1522         dst = min(src + dst, 1.0)
   1523 
   1524         r0 = dst
   1525         r1 = src
    1526         r2 = length / 8  (8 pixels per loop iteration)
   1527 */
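/*
        For reference, a minimal scalar C sketch of the saturating add performed
        per channel (illustrative names, not part of this file); in the kernel
        below the clamp to 255 comes from vqmovun.s16:

        #include <stddef.h>
        #include <stdint.h>

        // Additive blend: per-channel sum, clamped to 255.
        static void blend_add_ref(uint8_t *dst, const uint8_t *src, size_t pixels)
        {
            for (size_t i = 0; i < pixels * 4; i++) {
                uint16_t sum = (uint16_t)dst[i] + (uint16_t)src[i];
                dst[i] = (uint8_t)(sum > 255 ? 255 : sum);
            }
        }
*/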
   1528 ENTRY(rsdIntrinsicBlendAdd_K)
   1529         .save           {r4, lr}
   1530         stmfd           sp!, {r4, lr}
   1531         vpush           {q4-q7}
   1532 
   1533         mov r4, #255
   1534         vdup.16 q7, r4
   1535 
   1536         mov r4, r0
   1537 1:
   1538 
   1539         /* src */
   1540         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1541         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1542         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1543         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1544         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1545         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1546         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1547         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1548         vmovl.u8 q12, d0
   1549         vmovl.u8 q13, d1
   1550         vmovl.u8 q14, d2
   1551         vmovl.u8 q15, d3
   1552 
   1553         /* dst */
   1554         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1555         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1556         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1557         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1558         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1559         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1560         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1561         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1562         vmovl.u8 q8, d0
   1563         vmovl.u8 q9, d1
   1564         vmovl.u8 q10, d2
   1565         vmovl.u8 q11, d3
   1566 
   1567 
   1568         vadd.i16 q8, q8, q12
   1569         vadd.i16 q9, q9, q13
   1570         vadd.i16 q10, q10, q14
   1571         vadd.i16 q11, q11, q15
   1572 
   1573         vqmovun.s16 d0, q8
   1574         vqmovun.s16 d1, q9
   1575         vqmovun.s16 d2, q10
   1576         vqmovun.s16 d3, q11
   1577         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1578         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1579         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1580         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1581         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1582         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1583         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1584         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1585 
   1586         subs r2, r2, #1
   1587         bne 1b
   1588 
   1589         vpop            {q4-q7}
   1590         ldmfd           sp!, {r4, lr}
   1591         bx              lr
   1592 END(rsdIntrinsicBlendAdd_K)
   1593 
   1594 
   1595 /*
   1596         dst = max(dst - src, 0.0)
   1597 
   1598         r0 = dst
   1599         r1 = src
    1600         r2 = length / 8  (8 pixels per loop iteration)
   1601 */
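/*
        For reference, a minimal scalar C sketch of the clamped subtraction
        performed per channel (illustrative names, not part of this file); in
        the kernel below the clamp to 0 comes from vqmovun.s16:

        #include <stddef.h>
        #include <stdint.h>

        // Subtractive blend: per-channel difference, clamped to 0.
        static void blend_sub_ref(uint8_t *dst, const uint8_t *src, size_t pixels)
        {
            for (size_t i = 0; i < pixels * 4; i++) {
                int16_t diff = (int16_t)dst[i] - (int16_t)src[i];
                dst[i] = (uint8_t)(diff < 0 ? 0 : diff);
            }
        }
*/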
   1602 ENTRY(rsdIntrinsicBlendSub_K)
   1603         .save           {r4, lr}
   1604         stmfd           sp!, {r4, lr}
   1605         vpush           {q4-q7}
   1606 
   1607         mov r4, #255
   1608         vdup.16 q7, r4
   1609 
   1610         mov r4, r0
   1611 1:
   1612 
   1613         /* src */
   1614         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
   1615         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
   1616         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
   1617         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!
   1618         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r1]!
   1619         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r1]!
   1620         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r1]!
   1621         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r1]!
   1622         vmovl.u8 q12, d0
   1623         vmovl.u8 q13, d1
   1624         vmovl.u8 q14, d2
   1625         vmovl.u8 q15, d3
   1626 
   1627         /* dst */
   1628         vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
   1629         vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
   1630         vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
   1631         vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!
   1632         vld4.8 {d0[4],d1[4],d2[4],d3[4]}, [r0]!
   1633         vld4.8 {d0[5],d1[5],d2[5],d3[5]}, [r0]!
   1634         vld4.8 {d0[6],d1[6],d2[6],d3[6]}, [r0]!
   1635         vld4.8 {d0[7],d1[7],d2[7],d3[7]}, [r0]!
   1636         vmovl.u8 q8, d0
   1637         vmovl.u8 q9, d1
   1638         vmovl.u8 q10, d2
   1639         vmovl.u8 q11, d3
   1640 
   1641 
   1642         vsub.i16 q8, q8, q12
   1643         vsub.i16 q9, q9, q13
   1644         vsub.i16 q10, q10, q14
   1645         vsub.i16 q11, q11, q15
   1646 
   1647         vqmovun.s16 d0, q8
   1648         vqmovun.s16 d1, q9
   1649         vqmovun.s16 d2, q10
   1650         vqmovun.s16 d3, q11
   1651         vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r4]!
   1652         vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r4]!
   1653         vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r4]!
   1654         vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r4]!
   1655         vst4.8 {d0[4],d1[4],d2[4],d3[4]}, [r4]!
   1656         vst4.8 {d0[5],d1[5],d2[5],d3[5]}, [r4]!
   1657         vst4.8 {d0[6],d1[6],d2[6],d3[6]}, [r4]!
   1658         vst4.8 {d0[7],d1[7],d2[7],d3[7]}, [r4]!
   1659 
   1660         subs r2, r2, #1
   1661         bne 1b
   1662 
   1663         vpop            {q4-q7}
   1664         ldmfd           sp!, {r4, lr}
   1665         bx              lr
   1666 END(rsdIntrinsicBlendSub_K)
   1667 
   1668 
   1669 /* 3D LUT */
   1670 
   1671 /*
   1672         r0 = dst
   1673         r1 = src
   1674         r2 = cube base pointer
   1675         r3 = cube Y stride
   1676         r4 = cube Z stride
   1677         r5 = count
    1678         r10 = pointer to constants
   1679 
   1680         d0  / q0  = weight 1 p1
   1681         d1        = weight 2 p1
   1682 
   1683         d2  / q1  = weight 1 p2
   1684         d3        = weight 2 p2
   1685 
   1686         d4  / q2  = src1
   1687         d5        = src2
   1688 
   1689         d6  / q3  = baseCoord
   1690         d7        = baseCoord
   1691 
   1692         d8  / q4  = coord1 p1
   1693         d9        =
   1694 
   1695         d10 / q5  = coord1 p2
   1696         d11       =
   1697 
    1698         d12 / q6  = scratch accumulator for the interpolation
    1699         d13       =
    1700 
    1701         d14 / q7  = interpolated result, pixel 1
    1702         d15       = interpolated result, pixel 2
   1703 
   1704 
   1705         d16 / q8  = x0 y0 z0
   1706         d17       = x1 y0 z0
   1707         d18 / q9  = x0 y1 z0
   1708         d19       = x1 y1 z0
   1709         d20 / q10 = x0 y0 z1
   1710         d21       = x1 y0 z1
   1711         d22 / q11 = x0 y1 z1
   1712         d23       = x1 y1 z1
   1713 
    1714         d24 / q12 = alpha mask
   1715         d25       = current pixel alpha
   1716         d26 / q13 = 4, y stride
   1717         d27       = z stride, 0
   1718         d28 / q14 = 0x8000
   1719         d29       = 0x7fff
   1720         d30 / q15 = 0, 0, 0, 0xffff
   1721 
   1722 
   1723         d31 = coordMult
   1724 */
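/*
        For reference, a scalar C sketch of the trilinear 3D LUT lookup that
        this kernel performs on two pixels per iteration. The sketch uses
        floats for clarity, while the assembly works in fixed point: each 8-bit
        channel is scaled by coordMult, the integer part (>> 15) selects the
        lattice cell, and the low 15 bits become the interpolation weight. The
        struct, names and float formulation are illustrative assumptions, not
        part of this file; the source alpha is carried through unmodified, as
        in the code below.

        #include <stdint.h>

        typedef struct {
            const uint8_t *data;   // RGBA lattice entries
            int y_stride;          // bytes between rows (cube Y stride, r3)
            int z_stride;          // bytes between slices (cube Z stride, r4)
            float coord_mult[3];   // channel -> cube coordinate scale
        } Cube3D;

        static float lerp(float a, float b, float w) { return a + (b - a) * w; }

        static void lut3d_pixel_ref(uint8_t out[4], const uint8_t in[4], const Cube3D *c)
        {
            float fx = in[0] * c->coord_mult[0];
            float fy = in[1] * c->coord_mult[1];
            float fz = in[2] * c->coord_mult[2];
            int x = (int)fx, y = (int)fy, z = (int)fz;
            float wx = fx - x, wy = fy - y, wz = fz - z;
            const uint8_t *base = c->data + x * 4 + y * c->y_stride + z * c->z_stride;
            for (int ch = 0; ch < 3; ch++) {
                // Eight lattice corners around the coordinate, as in d16..d23.
                float v000 = base[ch];
                float v100 = base[4 + ch];
                float v010 = base[c->y_stride + ch];
                float v110 = base[c->y_stride + 4 + ch];
                float v001 = base[c->z_stride + ch];
                float v101 = base[c->z_stride + 4 + ch];
                float v011 = base[c->y_stride + c->z_stride + ch];
                float v111 = base[c->y_stride + c->z_stride + 4 + ch];
                // Reduce along x, then y, then z (same order as the assembly).
                float vx00 = lerp(v000, v100, wx), vx10 = lerp(v010, v110, wx);
                float vx01 = lerp(v001, v101, wx), vx11 = lerp(v011, v111, wx);
                float vxy0 = lerp(vx00, vx10, wy), vxy1 = lerp(vx01, vx11, wy);
                out[ch] = (uint8_t)(lerp(vxy0, vxy1, wz) + 0.5f);
            }
            out[3] = in[3];   // alpha is passed through
        }
*/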
   1725 
   1726 ENTRY(rsdIntrinsic3DLUT_K)
   1727         push        {r4-r8, r10, r11, lr}
   1728         vpush       {q4-q7}
   1729 
   1730         /* load Z stride in r4 */
   1731         ldr     r4, [sp, #32 + 64]
   1732 
   1733         /* Load count */
   1734         ldr     r5, [sp, #36 + 64]
   1735 
   1736         vmov.u16 d28, #0x8000
   1737         vmov.u16 d29, #0x7fff
   1738         vmov.u32 d24, #0xff000000
   1739 
   1740         /* load constants using r10 */
   1741         ldr     r10, [sp, #40 + 64]
   1742         vld1.32 {d31}, [r10]!
   1743         vld1.32 {d30}, [r10]!
   1744 
   1745         mov r6, #4
   1746         vmov d26, r6, r3
   1747         mov r6, #0
   1748         vmov d27, r4, r6
   1749 
   1750         add r8, r3, r4
   1751 
   1752 
   1753 
   1754 1:
   1755         vld1.8 {d4}, [r1]!
   1756         vand.u8 d25, d4, d24
   1757         vmovl.u8 q2, d4
   1758 
   1759 
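        /* Each widened channel is scaled by coordMult (d31); the 32-bit
           products split into an integer lattice coordinate (>> 15) and a
           15-bit fraction that forms the interpolation weights:
           weight2 = frac & 0x7fff, weight1 = 0x8000 - weight2. */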
   1760         vmull.u16 q3, d4, d31
   1761         vshr.u32 q4, q3, #15       // coord1 p1
   1762         vmovn.u32 d1, q3
   1763         vand.u16 d1, d29           // weight 2
   1764         vsub.u16 d0, d28, d1       // weight 1
   1765         vmul.u32 q4, q4, q13           // q4 = x*4, y*ystride, z*zstride, 0
   1766 
   1767         vmull.u16 q3, d5, d31
   1768         vshr.u32 q5, q3, #15       // coord1 p2
   1769         vmovn.u32 d3, q3
   1770         vand.u16 d3, d29           // weight 2
   1771         vsub.u16 d2, d28, d3       // weight 1
   1772         vmul.u32 q5, q5, q13       // q5 = x*4, y*ystride, z*zstride, 0
   1773 
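        /* Collapse {x*4, y*ystride, z*zstride, 0} for each pixel into a single
           byte offset: after the pairwise adds, d8 holds the offsets for
           pixel 1 and pixel 2, which are moved to r6/r7 and rebased onto the
           cube pointer in r2. */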
   1774         vpadd.u32 d8, d8, d9
   1775         vpadd.u32 d9, d10, d11
   1776         vpadd.u32 d8, d8, d9
   1777         vmov r6, r7, d8            // base pointers
   1778 
   1779         add  r6, r6, r2
   1780         add  r7, r7, r2
   1781 
   1782         vld1.8 {d16}, [r6]
   1783         add r11, r6, r3
   1784         vld1.8 {d18}, [r11]
   1785         add r11, r6, r4
   1786         vld1.8 {d20}, [r11]
   1787         add r11, r6, r8
   1788         vld1.8 {d22}, [r11]
   1789 
   1790         vmovl.u8 q8, d16
   1791         vmovl.u8 q9, d18
   1792         vmovl.u8 q10, d20
   1793         vmovl.u8 q11, d22
   1794 
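        /* Trilinear reduction for pixel 1: blend the x0/x1 pairs with the x
           weights (>> 7 keeps 8 fractional bits), then blend along y and
           along z with >> 15 each time, leaving the interpolated channels in
           d14 with 8 fractional bits. */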
   1795         vmull.u16 q6, d16, d0[0]
   1796         vmlal.u16 q6, d17, d1[0]
   1797         vshrn.u32 d16, q6, #7
   1798         vmull.u16 q6, d18, d0[0]
   1799         vmlal.u16 q6, d19, d1[0]
   1800         vshrn.u32 d18, q6, #7
   1801         vmull.u16 q6, d20, d0[0]
   1802         vmlal.u16 q6, d21, d1[0]
   1803         vshrn.u32 d20, q6, #7
   1804         vmull.u16 q6, d22, d0[0]
   1805         vmlal.u16 q6, d23, d1[0]
   1806         vshrn.u32 d22, q6, #7
   1807 
   1808         vmull.u16 q6, d16, d0[1]
   1809         vmlal.u16 q6, d18, d1[1]
   1810         vshrn.u32 d16, q6, #15
   1811         vmull.u16 q6, d20, d0[1]
   1812         vmlal.u16 q6, d22, d1[1]
   1813         vshrn.u32 d18, q6, #15
   1814 
   1815         vmull.u16 q6, d16, d0[2]
   1816         vmlal.u16 q6, d18, d1[2]
   1817         vshrn.u32 d14, q6, #15
   1818 
   1819 
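        /* Same lattice fetch and x/y/z reduction for pixel 2, using its base
           address in r7 and its weights in d2/d3; the result lands in d15. */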
   1820         vld1.8 {d16}, [r7]
   1821         add r11, r7, r3
   1822         vld1.8 {d18}, [r11]
   1823         add r11, r7, r4
   1824         vld1.8 {d20}, [r11]
   1825         add r11, r7, r8
   1826         vld1.8 {d22}, [r11]
   1827         vmovl.u8 q8, d16
   1828         vmovl.u8 q9, d18
   1829         vmovl.u8 q10, d20
   1830         vmovl.u8 q11, d22
   1831 
   1832         vmull.u16 q6, d16, d2[0]
   1833         vmlal.u16 q6, d17, d3[0]
   1834         vshrn.u32 d16, q6, #7
   1835         vmull.u16 q6, d18, d2[0]
   1836         vmlal.u16 q6, d19, d3[0]
   1837         vshrn.u32 d18, q6, #7
   1838         vmull.u16 q6, d20, d2[0]
   1839         vmlal.u16 q6, d21, d3[0]
   1840         vshrn.u32 d20, q6, #7
   1841         vmull.u16 q6, d22, d2[0]
   1842         vmlal.u16 q6, d23, d3[0]
   1843         vshrn.u32 d22, q6, #7
   1844 
   1845         vmull.u16 q6, d16, d2[1]
   1846         vmlal.u16 q6, d18, d3[1]
   1847         vshrn.u32 d16, q6, #15
   1848         vmull.u16 q6, d20, d2[1]
   1849         vmlal.u16 q6, d22, d3[1]
   1850         vshrn.u32 d18, q6, #15
   1851 
   1852         vmull.u16 q6, d16, d2[2]
   1853         vmlal.u16 q6, d18, d3[2]
   1854         vshrn.u32 d15, q6, #15
   1855 
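        /* q7 (d14/d15) now holds both pixels with 8 fractional bits per
           channel: round-shift down to 8 bits, then splice the original
           source alpha back in via the 0xff000000 mask before storing. */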
   1856         vrshrn.u16 d14, q7, #8
   1857 
   1858         vbic.u8 d14, d14, d24  // mix in alpha
   1859         vorr.u8 d14, d14, d25
   1860         vst1.32 {d14}, [r0]!
   1861 
   1862 
   1863         /* Are we done? */
   1864         subs r5, r5, #1
   1865         bne 1b
   1866 
   1867         /* Yup, bye */
   1868         vpop            {q4-q7}
   1869         pop         {r4-r8, r10, r11, lr}
   1870         bx          lr
   1871 
   1872 END(rsdIntrinsic3DLUT_K)
   1873 
   1874 
   1875