
    .arch armv7-a
    .text
    .global csc_ARGB8888_to_YUV420SP_NEON
    .type   csc_ARGB8888_to_YUV420SP_NEON, %function
csc_ARGB8888_to_YUV420SP_NEON:
    .fnstart

    @r0     pDstY
    @r1     pDstUV
    @r2     pSrcRGB
    @r3     nWidth
    @r4     pDstY2 = pDstY + nWidth
    @r5     pSrcRGB2 = pSrcRGB + nWidthx4
    @r6     temp7, nWidth16m
    @r7     temp6, accumulator
    @r8     temp5, nWidthTemp
    @r9     temp4, raw ARGB8888 word
    @r10    temp3, r,g,b
    @r11    temp2, immediate operand
    @r12    temp1, nHeight
    @r14    temp0, debugging pointer
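
    @ Assumed C-level signature, for reference only (a sketch inferred from the
    @ register comments above and the stack load of nHeight below; the
    @ authoritative declaration lives in the converter's header, not here):
    @   void csc_ARGB8888_to_YUV420SP_NEON(unsigned char *pDstY,
    @                                      unsigned char *pDstUV,
    @                                      unsigned char *pSrcRGB,
    @                                      unsigned int   nWidth,
    @                                      unsigned int   nHeight);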

    .equ CACHE_LINE_SIZE, 32
    .equ PRE_LOAD_OFFSET, 6

    stmfd       sp!, {r4-r12,r14}      @ backup registers
    ldr         r12, [sp, #40]         @ load nHeight
    @ldr        r14, [sp, #44]         @ load pTest
    add         r4, r0, r3             @r4: pDstY2 = pDstY + nWidth
    add         r5, r2, r3, lsl #2     @r5: pSrcRGB2 = pSrcRGB + nWidthx4
    sub         r8, r3, #16            @r8: nWidthTmp = nWidth - 16

    @q0: temp1, R
    @q1: temp2, GB
    @q2: R
    @q3: G
    @q4: B
    @q5: temp3, output

    vmov.u16 q6, #66       @coefficient assignment
    vmov.u16 q7, #129
    vmov.u16 q8, #25
    vmov.u16 q9, #0x8080   @ 128<<8 + 128

    vmov.u16 q10, #0x1000  @ 16<<8 + 128
    vorr.u16 q10, #0x0080

    vmov.u16 q11, #38      @#-38
    vmov.u16 q12, #74      @#-74
    vmov.u16 q13, #112
    vmov.u16 q14, #94      @#-94
    vmov.u16 q15, #18      @#-18
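
    @ For reference: together with the rounding constants above, these
    @ coefficients implement the usual BT.601 fixed-point conversion (a reading
    @ of the code, not a comment from the original author):
    @   Y = (  66*R + 129*G +  25*B + 0x1080) >> 8   @ 0x1080 = (16<<8)  + 128
    @   U = ( -38*R -  74*G + 112*B + 0x8080) >> 8   @ 0x8080 = (128<<8) + 128
    @   V = ( 112*R -  94*G -  18*B + 0x8080) >> 8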
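    @ Loop structure: LOOP_NHEIGHT2 walks the image two rows at a time (the
    @ first row through pDstY/pSrcRGB, the paired row through pDstY2/pSrcRGB2,
    @ so each pass emits one shared UV row). LOOP_NWIDTH16 converts 16 pixels
    @ per iteration with NEON; leftover columns fall through to LOOP_NWIDTH2,
    @ which converts them two pixels at a time with plain ARM code.
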
LOOP_NHEIGHT2:
    stmfd       sp!, {r12}       @ backup nHeight

LOOP_NWIDTH16:
    pld         [r2, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    @-------------------------------------------UV------------------------------------------
    vmov.u16 q14, #94 @#94
    vmov.u16 q15, #18 @#18
    vld4.8   {d0,d1,d2,d3}, [r2]! @load 8 ARGB pixels, de-interleaved by byte
    vld4.8   {d4,d5,d6,d7}, [r2]! @load 8 more ARGB pixels

    vmov.u16 d8,d2
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vand.u16 q4,#0x00FF  @R
    vand.u16 q5,#0x00FF  @G
    vand.u16 q6,#0x00FF  @B

    vmov.u16 q8,q9      @ CalcU(): start from 128<<8 + 128
    vmla.u16 q8,q6,q13  @+(112 * B[k])
    vmls.u16 q8,q4,q11  @-(38 * R[k])
    vmls.u16 q8,q5,q12  @-(74 * G[k])
    vshr.u16 q8,q8, #8  @(128<<8 + 128 + u) >> 8

    vmov.u16 q7,q9      @ CalcV(): start from 128<<8 + 128
    vmla.u16 q7,q4,q13  @+(112 * R[k])
    vmls.u16 q7,q5,q14  @-(94 * G[k])
    vmls.u16 q7,q6,q15  @-(18 * B[k])
    vshr.u16 q7,q7, #8  @(128<<8 + 128 + v) >> 8

    vtrn.8 q8,q7           @interleave U and V bytes into q8
    vst1.8  {q8}, [r1]!    @write interleaved UV to the UV plane

    @-------------------------------------------Y ------------------------------------------

    vmov.u16 q14, #66  @#66
    vmov.u16 q15, #129 @#129
    vmov.u16 q8, #25   @#25

    @CalcY_Y()

    vmul.u16 q7,q4,q14  @q7  = 66 * R[k]
    vmla.u16 q7,q5,q15  @q7 += 129 * G[k]
    vmla.u16 q7,q6,q8   @q7 += 25 * B[k]

    vadd.u16 q7,q7,q10  @+ (16<<8 + 128)
    vshr.u16 q7,q7, #8

    vmov.u16 d8,d2
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vshr.u16 q4,q4,#8  @R
    vshr.u16 q5,q5,#8  @G
    vshr.u16 q6,q6,#8  @B

    vmul.u16 q0,q4,q14  @q0  = 66 * R[k]
    vmla.u16 q0,q5,q15  @q0 += 129 * G[k]
    vmla.u16 q0,q6,q8   @q0 += 25 * B[k]
    vadd.u16 q0,q0,q10  @+ (16<<8 + 128)
    vshr.u16 q0,q0, #8

    vtrn.8 q7,q0        @interleave the two Y streams back into pixel order
    vst1.8  {q7}, [r0]! @write 16 Y bytes to the Y plane


    @-------------------------------------------Y ------------------------------------------

    @---------------------------------------------Y1-------------------------------------------

    pld         [r5, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    vld4.8   {d0,d1,d2,d3}, [r5]! @load 8 ARGB pixels from the second row, de-interleaved by byte
    vld4.8   {d4,d5,d6,d7}, [r5]! @load 8 more ARGB pixels

    vmov.u16 d8,d2
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vand.u16 q4,#0x00FF  @R
    vand.u16 q5,#0x00FF  @G
    vand.u16 q6,#0x00FF  @B

    vmul.u16 q7,q4,q14  @q7  = 66 * R[k]
    vmla.u16 q7,q5,q15  @q7 += 129 * G[k]
    vmla.u16 q7,q6,q8   @q7 += 25 * B[k]
    vadd.u16 q7,q7,q10  @+ (16<<8 + 128)
    vshr.u16 q7,q7, #8

    vmov.u16 d8,d2
    vmov.u16 d9,d6
    vmov.u16 d10,d1
    vmov.u16 d11,d5
    vmov.u16 d12,d0
    vmov.u16 d13,d4

    vshr.u16 q4,q4,#8  @R
    vshr.u16 q5,q5,#8  @G
    vshr.u16 q6,q6,#8  @B

    vmul.u16 q0,q4,q14  @q0  = 66 * R[k]
    vmla.u16 q0,q5,q15  @q0 += 129 * G[k]
    vmla.u16 q0,q6,q8   @q0 += 25 * B[k]
    vadd.u16 q0,q0,q10  @+ (16<<8 + 128)
    vshr.u16 q0,q0, #8

    vtrn.8 q7,q0        @interleave the two Y streams back into pixel order
    vst1.8  {q7}, [r4]! @write 16 Y bytes to the second Y row

    subs r8,r8,#16                   @nWidth16 -= 16
    BPL LOOP_NWIDTH16                @if nWidth16 >= 0
    @-----------------------------------unaligned ---------------------------------------

    adds r8,r8,#16    @restore the leftover pixel count (nWidth modulo 16)
    BEQ NO_UNALIGNED  @in case nWidth is a multiple of 16
LOOP_NWIDTH2:
    @----------------------------------pDstRGB1--Y------------------------------------------
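    @ The scalar path below converts two leftover pixels per pass: the channel
    @ byte of pixel 0 is kept in the low halfword and the matching byte of
    @ pixel 1 in the high halfword of one accumulator, so a single mul/mla per
    @ channel plus one add of 0x10801080 (the (16<<8)+128 rounding term,
    @ duplicated into both halves) produces both Y values at once.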
    @stmfd sp!, {r14} @backup r14

    ldr r9,  [r2], #4  @load one ARGB pixel (word)
    ldr r12, [r2], #4  @load the next ARGB pixel (word)

    mov r10, r9,lsr #16     @copy to r10
    mov r14, r12            @copy to r14

    ldr r6, =0x000000FF
    and r10, r10, r6 @R of pixel 0
    ldr r6, =0x00FF0000
    and r14, r14, r6 @R of pixel 1, kept in the upper halfword
    add r10,r10,r14

    mov r11, #66 @accumulator += R*66
    mul r7, r10, r11

    mov r10, r9,lsr #8      @copy to r10
    mov r14, r12,lsl #8     @copy to r14

    ldr r6, =0x000000FF
    and r10, r10, r6 @G of pixel 0
    ldr r6, =0x00FF0000
    and r14, r14, r6 @G of pixel 1
    add r10,r10,r14

    mov r11, #129 @accumulator += G*129
    mla r7, r10, r11, r7

    mov r10, r9              @copy to r10
    mov r14, r12,lsl #16     @copy to r14

    ldr r6, =0x000000FF
    and r10, r10, r6 @B of pixel 0
    ldr r6, =0x00FF0000
    and r14, r14, r6 @B of pixel 1
    add r10,r10,r14

    mov r11, #25 @accumulator += B*25
    mla r7, r10, r11, r7

    ldr r6, =0x10801080  @(16<<8) + 128, duplicated for both packed pixels
    add  r7, r6

    lsr r7, #8
    strb r7, [r0],#1  @Y of pixel 0
    lsr r7,#16
    strb r7, [r0],#1  @Y of pixel 1
    @ldmfd sp!, {r14} @load r14

    @----------------------------------pDstRGB2--UV------------------------------------------

    mov r10, r9          @copy pixel 0 to r10
    ldr  r7,=0x00008080  @U accumulator = 128<<8 + 128
    mov  r12,r7          @V accumulator = 128<<8 + 128

    ldr r6, =0x000000FF
    and r10, r10, r6 @B

    mov r11, #112 @U accumulator += B*112
    mla r7, r10, r11, r7

    mov r11, #18 @V accumulator -= B*18
    mul r11, r10, r11
    sub r12, r12, r11

    mov r10, r9, lsr #16    @copy to r10
    ldr r6, =0x000000FF
    and r10, r10, r6 @R

    mov r11, #38 @U accumulator -= R*38
    mul r11, r10, r11
    sub r7, r7, r11

    mov r11, #112 @V accumulator += R*112
    mla r12, r10, r11, r12

    mov r10, r9,lsr #8    @copy to r10
    ldr r6, =0x000000FF
    and r10, r10, r6  @G

    mov r11, #74 @U accumulator -= G*74
    mul r11, r10, r11
    sub r7, r7, r11

    mov r11, #94 @V accumulator -= G*94
    mul r11, r10, r11
    sub r12, r12, r11

    lsr r7, #8 @ >>8
    strb r7, [r1],#1   @U
    lsr r12, #8 @ >>8
    strb r12, [r1],#1  @V

    @----------------------------------pDstRGB2--Y------------------------------------------
    @stmfd sp!, {r14} @backup r14

    ldr r9,  [r5], #4  @load one ARGB pixel (word) from the second row
    ldr r12, [r5], #4  @load the next ARGB pixel (word)

    mov r10, r9,lsr #16     @copy to r10
    mov r14, r12            @copy to r14

    ldr r6, =0x000000FF
    and r10, r10, r6 @R of pixel 0
    ldr r6, =0x00FF0000
    and r14, r14, r6 @R of pixel 1, kept in the upper halfword
    add r10,r10,r14

    mov r11, #66 @accumulator += R*66
    mul r7, r10, r11

    mov r10, r9,lsr #8      @copy to r10
    mov r14, r12,lsl #8     @copy to r14

    ldr r6, =0x000000FF
    and r10, r10, r6 @G of pixel 0
    ldr r6, =0x00FF0000
    and r14, r14, r6 @G of pixel 1
    add r10,r10,r14

    mov r11, #129 @accumulator += G*129
    mla r7, r10, r11, r7

    mov r10, r9              @copy to r10
    mov r14, r12,lsl #16     @copy to r14

    ldr r6, =0x000000FF
    and r10, r10, r6 @B of pixel 0
    ldr r6, =0x00FF0000
    and r14, r14, r6 @B of pixel 1
    add r10,r10,r14

    mov r11, #25 @accumulator += B*25
    mla r7, r10, r11, r7

    ldr r6, =0x10801080  @(16<<8) + 128, duplicated for both packed pixels
    add  r7, r6
    lsr r7, #8

    strb r7, [r4],#1  @Y of pixel 0
    lsr r7,#16
    strb r7, [r4],#1  @Y of pixel 1
    @ldmfd sp!, {r14} @load r14


    subs r8,r8,#2                    @nWidth2 -= 2
    BGT LOOP_NWIDTH2                 @if nWidth2 > 0


NO_UNALIGNED: @in case nWidth is a multiple of 16

    @-----------------------------------------------------------------------------
    sub         r8, r3, #16          @r8: nWidthTmp = nWidth - 16
    add r0, r0, r3            @pDstY    += nWidth (skip the row written via pDstY2)
    add r2, r2, r3, lsl #2    @pSrcRGB  += nWidth*4 (skip the row read via pSrcRGB2)
    add r4, r4, r3            @pDstY2   += nWidth
    add r5, r5, r3, lsl #2    @pSrcRGB2 += nWidth*4

    ldmfd sp!, {r12}
    subs r12,r12,#2                  @nHeight -= 2
    BGT LOOP_NHEIGHT2                @if nHeight > 0

    ldmfd       sp!, {r4-r12,pc}     @ restore registers and return
    .fnend
    366