  .global RestoreRegisters_NEON
  .global ReverseLine_NEON
  .global ReverseLineUV_NEON
  .global SaveRegisters_NEON
  .global TransposeWx8_NEON
  .global TransposeUVWx8_NEON
  .type RestoreRegisters_NEON, function
  .type ReverseLine_NEON, function
  .type ReverseLineUV_NEON, function
  .type SaveRegisters_NEON, function
  .type TransposeWx8_NEON, function
  .type TransposeUVWx8_NEON, function

@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
@ r0 const uint8* src
@ r1 uint8* dst
@ r2 width
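@
@ for reference, a rough C equivalent of what this routine computes
@ (a sketch only, assuming uint8 is unsigned char; the asm below
@ works in 16 byte chunks rather than byte by byte):
@
@   void ReverseLine_C(const uint8* src, uint8* dst, int width) {
@     int i;
@     for (i = 0; i < width; ++i) {
@       dst[width - 1 - i] = src[i];
@     }
@   }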
ReverseLine_NEON:

  @ compute where to start writing destination
  add         r1, r2      @ dst + width

  @ work on segments that are multiples of 16
  lsrs        r3, r2, #4

  @ the output is written in two blocks: 8 bytes followed by
  @ another 8.  reading is done sequentially, from left to
  @ right.  writing is done from right to left in block sizes.
  @ r1, the destination pointer, is incremented after writing
  @ the first of the two blocks, so subtract that 8 off along
  @ with 16 to get the next location.
  mov         r3, #-24
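  @ worked example: if an iteration writes its 16 byte chunk starting
  @ at address P, the first 8 byte store advances r1 to P + 8; the
  @ next, lower chunk begins at P - 16, so the post-index applied by
  @ the second store is (P - 16) - (P + 8) = -24.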

  beq         Lline_residuals

  @ back the destination off by the size of the register that
  @ is going to be reversed
  sub         r1, #16

  @ the loop needs to run on blocks of 16.  what will be left
  @ over is either a negative number, the residuals that need
  @ to be done, or 0.  if this isn't subtracted off here the
  @ loop will run one extra time.
  sub         r2, #16

Lsegments_of_16:
    vld1.8      {q0}, [r0]!               @ src += 16

    @ reverse the bytes within each 64 bit segment.  there is no
    @ single instruction to reverse all 128 bits in one go.
    vrev64.8    q0, q0

    @ since the entire 128 bits cannot be reversed at once,
    @ write the two 64 bit segments out in swapped order.
    vst1.8      {d1}, [r1]!
    vst1.8      {d0}, [r1], r3            @ dst -= 16

    subs        r2, #16
    bge         Lsegments_of_16

  @ add 16 back to the counter.  if the result is 0 there are
  @ no residuals, so return
  adds        r2, #16
  bxeq        lr

  add         r1, #16

Lline_residuals:

  mov         r3, #-3

  sub         r1, #2
  subs        r2, #2
  @ check for 16*n+1 scenarios, where segments_of_2 should not
  @ be run but there is still one byte left over (e.g. width 17
  @ leaves r2 = 1, so the subs above yields -1).
  blt         Lsegment_of_1

@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
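@ a note on the approach: single lane NEON loads and stores keep the
@ leftover handling in the NEON register file, avoiding transfers
@ between the ARM and NEON registers.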
Lsegments_of_2:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2

    vst1.8      {d1[0]}, [r1]!
    vst1.8      {d0[0]}, [r1], r3         @ dst -= 2

    subs        r2, #2
    bge         Lsegments_of_2

  adds        r2, #2
  bxeq        lr

Lsegment_of_1:
  add         r1, #1
  vld1.8      {d0[0]}, [r0]
  vst1.8      {d0[0]}, [r1]

  bx          lr

@ void TransposeWx8_NEON (const uint8* src, int src_stride,
@                         uint8* dst, int dst_stride,
@                         int w)
@ r0 const uint8* src
@ r1 int src_stride
@ r2 uint8* dst
@ r3 int dst_stride
@ stack int w
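@
@ rough C equivalent of the routine (a sketch only; dst is treated
@ as w rows of 8 bytes, src as 8 rows of w bytes):
@
@   void TransposeWx8_C(const uint8* src, int src_stride,
@                       uint8* dst, int dst_stride, int w) {
@     int i, j;
@     for (i = 0; i < w; ++i) {
@       for (j = 0; j < 8; ++j) {
@         dst[i * dst_stride + j] = src[j * src_stride + i];
@       }
@     }
@   }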
TransposeWx8_NEON:
  push        {r4,r8,r9,lr}

  ldr         r8, [sp, #16]        @ width

  @ loops are on blocks of 8.  the loop will stop when the
  @ counter goes below 0.  starting the counter at w-8
  @ allows for this
  sub         r8, #8

@ handle 8x8 blocks.  this should be the majority of the plane
Lloop_8x8:
    mov         r9, r0

    vld1.8      {d0}, [r9], r1
    vld1.8      {d1}, [r9], r1
    vld1.8      {d2}, [r9], r1
    vld1.8      {d3}, [r9], r1
    vld1.8      {d4}, [r9], r1
    vld1.8      {d5}, [r9], r1
    vld1.8      {d6}, [r9], r1
    vld1.8      {d7}, [r9]

    vtrn.8      d1, d0
    vtrn.8      d3, d2
    vtrn.8      d5, d4
    vtrn.8      d7, d6

    vtrn.16     d1, d3
    vtrn.16     d0, d2
    vtrn.16     d5, d7
    vtrn.16     d4, d6

    vtrn.32     d1, d5
    vtrn.32     d0, d4
    vtrn.32     d3, d7
    vtrn.32     d2, d6

    vrev16.8    q0, q0
    vrev16.8    q1, q1
    vrev16.8    q2, q2
    vrev16.8    q3, q3
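    @ the three vtrn passes transpose 2x2 sub-blocks of increasing
    @ element size (8, 16, then 32 bit); the final vrev16 swaps the
    @ bytes within each 16 bit lane, compensating for the reversed
    @ operand order used in the vtrn.8 pass and completing the 8x8
    @ byte transpose.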

    mov         r9, r2

    vst1.8      {d1}, [r9], r3
    vst1.8      {d0}, [r9], r3
    vst1.8      {d3}, [r9], r3
    vst1.8      {d2}, [r9], r3
    vst1.8      {d5}, [r9], r3
    vst1.8      {d4}, [r9], r3
    vst1.8      {d7}, [r9], r3
    vst1.8      {d6}, [r9]

    add         r0, #8            @ src += 8
    add         r2, r3, lsl #3    @ dst += 8 * dst_stride
    subs        r8,  #8           @ w   -= 8
    bge         Lloop_8x8

  @ add 8 back to the counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
  beq         Ldone

  @ some residual, so between 1 and 7 lines left to transpose
  cmp         r8, #2
  blt         Lblock_1x8

  cmp         r8, #4
  blt         Lblock_2x8

Lblock_4x8:
  mov         r9, r0
  vld1.32     {d0[0]}, [r9], r1
  vld1.32     {d0[1]}, [r9], r1
  vld1.32     {d1[0]}, [r9], r1
  vld1.32     {d1[1]}, [r9], r1
  vld1.32     {d2[0]}, [r9], r1
  vld1.32     {d2[1]}, [r9], r1
  vld1.32     {d3[0]}, [r9], r1
  vld1.32     {d3[1]}, [r9]

  mov         r9, r2

  adr         r12, vtbl_4x4_transpose
  vld1.8      {q3}, [r12]

  vtbl.8      d4, {d0, d1}, d6
  vtbl.8      d5, {d0, d1}, d7
  vtbl.8      d0, {d2, d3}, d6
  vtbl.8      d1, {d2, d3}, d7

  @ TODO: rework shuffle above to write
  @       out with 4 instead of 8 writes
  vst1.32     {d4[0]}, [r9], r3
  vst1.32     {d4[1]}, [r9], r3
  vst1.32     {d5[0]}, [r9], r3
  vst1.32     {d5[1]}, [r9]

  add         r9, r2, #4
  vst1.32     {d0[0]}, [r9], r3
  vst1.32     {d0[1]}, [r9], r3
  vst1.32     {d1[0]}, [r9], r3
  vst1.32     {d1[1]}, [r9]

  add         r0, #4            @ src += 4
  add         r2, r3, lsl #2    @ dst += 4 * dst_stride
  subs        r8,  #4           @ w   -= 4
  beq         Ldone

  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
  blt         Lblock_1x8

Lblock_2x8:
  mov         r9, r0
  vld1.16     {d0[0]}, [r9], r1
  vld1.16     {d1[0]}, [r9], r1
  vld1.16     {d0[1]}, [r9], r1
  vld1.16     {d1[1]}, [r9], r1
  vld1.16     {d0[2]}, [r9], r1
  vld1.16     {d1[2]}, [r9], r1
  vld1.16     {d0[3]}, [r9], r1
  vld1.16     {d1[3]}, [r9]

  vtrn.8      d0, d1

  mov         r9, r2

  vst1.64     {d0}, [r9], r3
  vst1.64     {d1}, [r9]

  add         r0, #2            @ src += 2
  add         r2, r3, lsl #1    @ dst += 2 * dst_stride
  subs        r8,  #2           @ w   -= 2
  beq         Ldone

Lblock_1x8:
  vld1.8      {d0[0]}, [r0], r1
  vld1.8      {d0[1]}, [r0], r1
  vld1.8      {d0[2]}, [r0], r1
  vld1.8      {d0[3]}, [r0], r1
  vld1.8      {d0[4]}, [r0], r1
  vld1.8      {d0[5]}, [r0], r1
  vld1.8      {d0[6]}, [r0], r1
  vld1.8      {d0[7]}, [r0]

  vst1.64     {d0}, [r2]

Ldone:

  pop         {r4,r8,r9,pc}

vtbl_4x4_transpose:
  .byte  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
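@
@ vtbl.8 selects, for each byte of its index register, the byte of
@ the 16 byte table operand at that index; a sketch of the per byte
@ semantics, with out, table and idx as byte arrays:
@
@   for (i = 0; i < 8; ++i) out[i] = table[idx[i]];
@
@ with {d0, d1} holding a 4x4 block in row major order, the pattern
@ above gathers every 4th byte, producing the block in column major
@ order, i.e. transposed.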

@ void SaveRegisters_NEON (unsigned long long* store)
@ r0 unsigned long long* store
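@
@ d8-d15 are the NEON registers the AAPCS requires a callee to
@ preserve, which is why exactly these eight (8 x 8 = 64 bytes) are
@ spilled; the buffer passed in r0 must have room for them.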
SaveRegisters_NEON:
  vst1.i64    {d8, d9, d10, d11}, [r0]!
  vst1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr

@ void RestoreRegisters_NEON (unsigned long long* store)
@ r0 unsigned long long* store
RestoreRegisters_NEON:
  vld1.i64    {d8, d9, d10, d11}, [r0]!
  vld1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr

@ void ReverseLineUV_NEON (const uint8* src,
@                          uint8* dst_a,
@                          uint8* dst_b,
@                          int width)
@ r0 const uint8* src
@ r1 uint8* dst_a
@ r2 uint8* dst_b
@ r3 width
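@
@ rough C equivalent (a sketch only; src holds interleaved pairs,
@ e.g. UV, and width counts output bytes per plane):
@
@   void ReverseLineUV_C(const uint8* src, uint8* dst_a,
@                        uint8* dst_b, int width) {
@     int i;
@     for (i = 0; i < width; ++i) {
@       dst_a[width - 1 - i] = src[2 * i + 0];
@       dst_b[width - 1 - i] = src[2 * i + 1];
@     }
@   }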
ReverseLineUV_NEON:

  @ compute where to start writing destination
  add         r1, r1, r3      @ dst_a + width
  add         r2, r2, r3      @ dst_b + width

  @ work on input segments that are multiples of 16.  the width
  @ that has been passed counts output segments, which are half
  @ the size of the input, hence the shift by 3.
  lsrs        r12, r3, #3

  beq         Lline_residuals_di

  @ each iteration writes one 8 byte block to each destination,
  @ moving from right to left.
  mov         r12, #-8

  @ back the destinations off by the size of the register that
  @ is going to be reversed
  sub         r1, r1, #8
  sub         r2, r2, #8

  @ the loop needs to run on blocks of 8.  what will be left
  @ over is either a negative number, the residuals that need
  @ to be done, or 0.  if this isn't subtracted off here the
  @ loop will run one extra time.
  sub         r3, r3, #8

Lsegments_of_8_di:
    vld2.8      {d0, d1}, [r0]!         @ src += 16

    @ reverse the bytes in the 64 bit segments
    vrev64.8    q0, q0

    vst1.8      {d0}, [r1], r12         @ dst_a -= 8
    vst1.8      {d1}, [r2], r12         @ dst_b -= 8

    subs        r3, r3, #8
    bge         Lsegments_of_8_di

  @ add 8 back to the counter.  if the result is 0 there are
  @ no residuals, so return
  adds        r3, r3, #8
  bxeq        lr

  add         r1, r1, #8
  add         r2, r2, #8

Lline_residuals_di:

  mov         r12, #-1

  sub         r1, r1, #1
  sub         r2, r2, #1

@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
Lsegments_of_1:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2

    vst1.8      {d0[0]}, [r1], r12        @ dst_a -= 1
    vst1.8      {d1[0]}, [r2], r12        @ dst_b -= 1

    subs        r3, r3, #1
    bgt         Lsegments_of_1

  bx          lr

@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
@                           uint8* dst_a, int dst_stride_a,
@                           uint8* dst_b, int dst_stride_b,
@                           int width)
@ r0 const uint8* src
@ r1 int src_stride
@ r2 uint8* dst_a
@ r3 int dst_stride_a
@ stack uint8* dst_b
@ stack int dst_stride_b
@ stack int width
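@
@ rough C equivalent (a sketch only; src holds 8 rows of interleaved
@ pairs, and each plane is transposed into its own destination):
@
@   void TransposeUVWx8_C(const uint8* src, int src_stride,
@                         uint8* dst_a, int dst_stride_a,
@                         uint8* dst_b, int dst_stride_b, int w) {
@     int i, j;
@     for (i = 0; i < w; ++i) {
@       for (j = 0; j < 8; ++j) {
@         dst_a[i * dst_stride_a + j] = src[j * src_stride + 2 * i + 0];
@         dst_b[i * dst_stride_b + j] = src[j * src_stride + 2 * i + 1];
@       }
@     }
@   }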
TransposeUVWx8_NEON:
  push        {r4-r9,lr}

  ldr         r4, [sp, #28]         @ dst_b
  ldr         r5, [sp, #32]         @ dst_stride_b
  ldr         r8, [sp, #36]         @ width
  @ loops are on blocks of 8.  the loop will stop when the
  @ counter goes below 0.  starting the counter at w-8
  @ allows for this
  sub         r8, #8

@ handle 8x8 blocks.  this should be the majority of the plane
Lloop_8x8_di:
    mov         r9, r0

    vld2.8      {d0,  d1},  [r9], r1
    vld2.8      {d2,  d3},  [r9], r1
    vld2.8      {d4,  d5},  [r9], r1
    vld2.8      {d6,  d7},  [r9], r1
    vld2.8      {d8,  d9},  [r9], r1
    vld2.8      {d10, d11}, [r9], r1
    vld2.8      {d12, d13}, [r9], r1
    vld2.8      {d14, d15}, [r9]

    vtrn.8      q1, q0
    vtrn.8      q3, q2
    vtrn.8      q5, q4
    vtrn.8      q7, q6

    vtrn.16     q1, q3
    vtrn.16     q0, q2
    vtrn.16     q5, q7
    vtrn.16     q4, q6

    vtrn.32     q1, q5
    vtrn.32     q0, q4
    vtrn.32     q3, q7
    vtrn.32     q2, q6

    vrev16.8    q0, q0
    vrev16.8    q1, q1
    vrev16.8    q2, q2
    vrev16.8    q3, q3
    vrev16.8    q4, q4
    vrev16.8    q5, q5
    vrev16.8    q6, q6
    vrev16.8    q7, q7

    mov         r9, r2

    vst1.8      {d2},  [r9], r3
    vst1.8      {d0},  [r9], r3
    vst1.8      {d6},  [r9], r3
    vst1.8      {d4},  [r9], r3
    vst1.8      {d10}, [r9], r3
    vst1.8      {d8},  [r9], r3
    vst1.8      {d14}, [r9], r3
    vst1.8      {d12}, [r9]

    mov         r9, r4

    vst1.8      {d3},  [r9], r5
    vst1.8      {d1},  [r9], r5
    vst1.8      {d7},  [r9], r5
    vst1.8      {d5},  [r9], r5
    vst1.8      {d11}, [r9], r5
    vst1.8      {d9},  [r9], r5
    vst1.8      {d15}, [r9], r5
    vst1.8      {d13}, [r9]

    add         r0, #8*2          @ src   += 8*2
    add         r2, r3, lsl #3    @ dst_a += 8 * dst_stride_a
    add         r4, r5, lsl #3    @ dst_b += 8 * dst_stride_b
    subs        r8,  #8           @ w     -= 8
    bge         Lloop_8x8_di

  @ add 8 back to the counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
  beq         Ldone_di

  @ some residual, so between 1 and 7 lines left to transpose
  cmp         r8, #2
  blt         Lblock_1x8_di

  cmp         r8, #4
  blt         Lblock_2x8_di

@ TODO(frkoenig) : clean this up
Lblock_4x8_di:
  mov         r9, r0
  vld1.64     {d0}, [r9], r1
  vld1.64     {d1}, [r9], r1
  vld1.64     {d2}, [r9], r1
  vld1.64     {d3}, [r9], r1
  vld1.64     {d4}, [r9], r1
  vld1.64     {d5}, [r9], r1
  vld1.64     {d6}, [r9], r1
  vld1.64     {d7}, [r9]

  adr         r12, vtbl_4x4_transpose_di
  vld1.8      {q7}, [r12]

  vtrn.8      q0, q1
  vtrn.8      q2, q3

  vtbl.8      d8,  {d0, d1}, d14
  vtbl.8      d9,  {d0, d1}, d15
  vtbl.8      d10, {d2, d3}, d14
  vtbl.8      d11, {d2, d3}, d15
  vtbl.8      d12, {d4, d5}, d14
  vtbl.8      d13, {d4, d5}, d15
  vtbl.8      d0,  {d6, d7}, d14
  vtbl.8      d1,  {d6, d7}, d15

  mov         r9, r2

  vst1.32     {d8[0]},  [r9], r3
  vst1.32     {d8[1]},  [r9], r3
  vst1.32     {d9[0]},  [r9], r3
  vst1.32     {d9[1]},  [r9], r3

  add         r9, r2, #4
  vst1.32     {d12[0]}, [r9], r3
  vst1.32     {d12[1]}, [r9], r3
  vst1.32     {d13[0]}, [r9], r3
  vst1.32     {d13[1]}, [r9]

  mov         r9, r4

  vst1.32     {d10[0]}, [r9], r5
  vst1.32     {d10[1]}, [r9], r5
  vst1.32     {d11[0]}, [r9], r5
  vst1.32     {d11[1]}, [r9], r5

  add         r9, r4, #4
  vst1.32     {d0[0]},  [r9], r5
  vst1.32     {d0[1]},  [r9], r5
  vst1.32     {d1[0]},  [r9], r5
  vst1.32     {d1[1]},  [r9]

  add         r0, #4*2          @ src   += 4 * 2
  add         r2, r3, lsl #2    @ dst_a += 4 * dst_stride_a
  add         r4, r5, lsl #2    @ dst_b += 4 * dst_stride_b
  subs        r8,  #4           @ w     -= 4
  beq         Ldone_di

  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
  blt         Lblock_1x8_di

Lblock_2x8_di:
  mov         r9, r0
  vld2.16     {d0[0], d2[0]}, [r9], r1
  vld2.16     {d1[0], d3[0]}, [r9], r1
  vld2.16     {d0[1], d2[1]}, [r9], r1
  vld2.16     {d1[1], d3[1]}, [r9], r1
  vld2.16     {d0[2], d2[2]}, [r9], r1
  vld2.16     {d1[2], d3[2]}, [r9], r1
  vld2.16     {d0[3], d2[3]}, [r9], r1
  vld2.16     {d1[3], d3[3]}, [r9]

  vtrn.8      d0, d1
  vtrn.8      d2, d3

  mov         r9, r2

  vst1.64     {d0}, [r9], r3
  vst1.64     {d2}, [r9]

  mov         r9, r4

  vst1.64     {d1}, [r9], r5
  vst1.64     {d3}, [r9]

  add         r0, #2*2          @ src   += 2 * 2
  add         r2, r3, lsl #1    @ dst_a += 2 * dst_stride_a
  add         r4, r5, lsl #1    @ dst_b += 2 * dst_stride_b
  subs        r8,  #2           @ w     -= 2
  beq         Ldone_di

Lblock_1x8_di:
  vld2.8      {d0[0], d1[0]}, [r0], r1
  vld2.8      {d0[1], d1[1]}, [r0], r1
  vld2.8      {d0[2], d1[2]}, [r0], r1
  vld2.8      {d0[3], d1[3]}, [r0], r1
  vld2.8      {d0[4], d1[4]}, [r0], r1
  vld2.8      {d0[5], d1[5]}, [r0], r1
  vld2.8      {d0[6], d1[6]}, [r0], r1
  vld2.8      {d0[7], d1[7]}, [r0]

  vst1.64     {d0}, [r2]
  vst1.64     {d1}, [r4]

Ldone_di:
  pop         {r4-r9, pc}

vtbl_4x4_transpose_di:
  .byte  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
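@
@ each output byte i is table[i] of the 16 byte source pair, so this
@ pattern interleaves the two d registers byte by byte
@ (out = {lo[0], hi[0], lo[1], hi[1], ...}), the same effect as a
@ byte-wise zip of the two halves.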