Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_yv12_copy_frame_yonly_neon|
     13     EXPORT  |vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon|
     14 
     15     ARM
     16     REQUIRE8
     17     PRESERVE8
     18 
     19     INCLUDE asm_com_offsets.asm
     20 
     21     AREA ||.text||, CODE, READONLY, ALIGN=2
     22 ;void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
     23 ; Note: this is a VP8 function, which handles border sizes of 32 and 16. The internal
     24 ; y_width and y_height are always multiples of 16.
     25 
     ;-----------------------------------------------------------------------
     ; void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc,
     ;                                     YV12_BUFFER_CONFIG *dst_ybc)
     ;
     ; Copies the Y plane of src_ybc into dst_ybc, then extends the borders
     ; of the destination Y plane: the left/right-most pixels of every row
     ; are replicated sideways, then the first/last rows (including their
     ; side borders) are replicated into the top/bottom border rows.
     ;
     ; In:     r0 = src_ybc, r1 = dst_ybc
     ; Out:    none
     ; Saved:  r4-r11, lr and d8-d15 (q4-q7) are pushed on entry and
     ;         restored on exit
     ; Scratch: r0-r3, r12, q0-q3, q8-q15, flags
     ;
     ; Assumptions (not checked here):
     ;   - y_width is a multiple of 16 (the tail loop counts down by 16)
     ;   - y_height is a non-zero multiple of 4 (rows are processed two at a
     ;     time for the copy and four at a time for the side borders)
     ;   - destination y_stride = y_width + 2 * border and is a multiple of 16
     ;   - border is a non-zero multiple of 8; a border of 16 gets 16-byte
     ;     side stores, any other value gets 32-byte side stores
     ;
     ; y_width < 128 and destination y_stride < 128 are handled: the
     ; 128-byte chunk loops are skipped and only the 16-byte tail loops run.
     ;-----------------------------------------------------------------------
|vp8_yv12_copy_frame_yonly_neon| PROC
    push            {r4 - r11, lr}
    vpush           {d8 - d15}

    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             r5, [r0, #yv12_buffer_config_y_width]
    ldr             r6, [r0, #yv12_buffer_config_y_stride]       ;src stride
    ldr             r7, [r1, #yv12_buffer_config_y_stride]       ;dst stride
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1

    ; The chunk loop below decrements its counter before testing it, so it
    ; must never be entered with a count of zero (it would wrap around and
    ; run off the end of both buffers). Rows narrower than 128 bytes are
    ; handled entirely by the 16-byte tail copy.
    cmp             r5, #128
    blo             cp_src_to_dst_tail

    ; copy two rows at one time
    mov             lr, r4, lsr #1

cp_src_to_dst_height_loop
    mov             r8, r2                  ;src, even row
    mov             r9, r3                  ;dst, even row
    add             r10, r2, r6             ;src, odd row
    add             r11, r3, r7             ;dst, odd row
    mov             r12, r5, lsr #7         ;128-byte chunks per row (>= 1 here)

cp_src_to_dst_width_loop
    vld1.8          {q0, q1}, [r8]!
    vld1.8          {q8, q9}, [r10]!
    vld1.8          {q2, q3}, [r8]!
    vld1.8          {q10, q11}, [r10]!
    vld1.8          {q4, q5}, [r8]!
    vld1.8          {q12, q13}, [r10]!
    vld1.8          {q6, q7}, [r8]!
    vld1.8          {q14, q15}, [r10]!

    subs            r12, r12, #1            ;flags survive the stores below

    vst1.8          {q0, q1}, [r9]!
    vst1.8          {q8, q9}, [r11]!
    vst1.8          {q2, q3}, [r9]!
    vst1.8          {q10, q11}, [r11]!
    vst1.8          {q4, q5}, [r9]!
    vst1.8          {q12, q13}, [r11]!
    vst1.8          {q6, q7}, [r9]!
    vst1.8          {q14, q15}, [r11]!

    bne             cp_src_to_dst_width_loop

    subs            lr, lr, #1
    add             r2, r2, r6, lsl #1      ;advance src by two rows
    add             r3, r3, r7, lsl #1      ;advance dst by two rows

    bne             cp_src_to_dst_height_loop

;=============================
;copy the columns left over after the 128-byte chunks, 16 bytes at a time
cp_src_to_dst_tail
    ands            r10, r5, #0x7f          ;r10 = y_width mod 128
    beq             end_of_cp_src_to_dst    ;nothing left over

    sub             r11, r5, r10            ;columns already copied
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
    add             r2, r2, r11
    add             r3, r3, r11

    mov             lr, r4, lsr #1          ;two rows at one time

extra_cp_src_to_dst_height_loop
    mov             r8, r2                  ;src, even row
    mov             r9, r3                  ;dst, even row
    add             r0, r8, r6              ;src, odd row (src_ybc is no longer needed)
    add             r11, r9, r7             ;dst, odd row

    mov             r12, r10                ;bytes left in this row

extra_cp_src_to_dst_width_loop
    vld1.8          {q0}, [r8]!
    vld1.8          {q1}, [r0]!

    subs            r12, r12, #16

    vst1.8          {q0}, [r9]!
    vst1.8          {q1}, [r11]!
    bne             extra_cp_src_to_dst_width_loop

    subs            lr, lr, #1

    add             r2, r2, r6, lsl #1
    add             r3, r3, r7, lsl #1

    bne             extra_cp_src_to_dst_height_loop

end_of_cp_src_to_dst

    ;=======================
    ;extend the borders of the destination Y plane
    mov             r0, r1                  ;r0 = dst_ybc
    ;Not need to load y_width, since: y_width = y_stride - 2*border
    ldr             r3, [r0, #yv12_buffer_config_border]
    ldr             r1, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             lr, [r0, #yv12_buffer_config_y_stride]

;Border copy for Y plane
;copy the left and right most columns out
    sub             r5, r1, r3              ;destptr1: start of left border
    add             r6, r1, lr
    sub             r6, r6, r3, lsl #1      ;destptr2: start of right border
    sub             r2, r6, #1              ;srcptr2: right-most pixel of the row

    ;Do four rows at one time
    mov             r12, r4, lsr #2

    cmp             r3, #16
    beq             copy_left_right_y_b16

;=======================
;border = 32: each side receives 32 replicated bytes per row
copy_left_right_y
    vld1.8          {d0[], d1[]}, [r1], lr      ;edge pixel to all 16 lanes
    vld1.8          {d4[], d5[]}, [r2], lr
    vld1.8          {d8[], d9[]}, [r1], lr
    vld1.8          {d12[], d13[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d20[], d21[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d28[], d29[]}, [r2], lr

    vmov            q1, q0                  ;duplicate to fill 32 bytes
    vmov            q3, q2
    vmov            q5, q4
    vmov            q7, q6
    vmov            q9, q8
    vmov            q11, q10
    vmov            q13, q12
    vmov            q15, q14

    subs            r12, r12, #1

    vst1.8          {q0, q1}, [r5], lr
    vst1.8          {q2, q3}, [r6], lr
    vst1.8          {q4, q5}, [r5], lr
    vst1.8          {q6, q7}, [r6], lr
    vst1.8          {q8, q9}, [r5], lr
    vst1.8          {q10, q11}, [r6], lr
    vst1.8          {q12, q13}, [r5], lr
    vst1.8          {q14, q15}, [r6], lr

    bne             copy_left_right_y

    b               extend_top_bottom_y

;=======================
;border = 16: each side receives 16 replicated bytes per row
copy_left_right_y_b16
    vld1.8          {d0[], d1[]}, [r1], lr
    vld1.8          {d4[], d5[]}, [r2], lr
    vld1.8          {d8[], d9[]}, [r1], lr
    vld1.8          {d12[], d13[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d20[], d21[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d28[], d29[]}, [r2], lr

    subs            r12, r12, #1

    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q4}, [r5], lr
    vst1.8          {q6}, [r6], lr
    vst1.8          {q8}, [r5], lr
    vst1.8          {q10}, [r6], lr
    vst1.8          {q12}, [r5], lr
    vst1.8          {q14}, [r6], lr

    bne             copy_left_right_y_b16

;=======================
;Now copy the top and bottom source lines into each line of the respective borders
;(shared by both border sizes; r1 points one row past the last image row here)
extend_top_bottom_y
    ldr             r7, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    mul             r8, r3, lr              ;r8 = border * stride

    sub             r6, r1, r3              ;destptr2: first bottom-border row
    sub             r2, r6, lr              ;srcptr2: last image row, incl. left border
    sub             r1, r7, r3              ;srcptr1: first image row, incl. left border
    sub             r5, r1, r8              ;destptr1: first top-border row

    ; Same guard as for the plane copy: skip the 128-byte chunk loop when the
    ; stride is shorter than 128 bytes so its counter cannot wrap around.
    movs            r12, lr, lsr #7
    beq             check_extra_top_bottom_y

copy_top_bottom_y
    vld1.8          {q0, q1}, [r1]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q2, q3}, [r1]!
    vld1.8          {q10, q11}, [r2]!
    vld1.8          {q4, q5}, [r1]!
    vld1.8          {q12, q13}, [r2]!
    vld1.8          {q6, q7}, [r1]!
    vld1.8          {q14, q15}, [r2]!

    mov             r7, r3                  ;one pass per border row

top_bottom_rows
    subs            r7, r7, #1

    vst1.8          {q0, q1}, [r5]!
    vst1.8          {q8, q9}, [r6]!
    vst1.8          {q2, q3}, [r5]!
    vst1.8          {q10, q11}, [r6]!
    vst1.8          {q4, q5}, [r5]!
    vst1.8          {q12, q13}, [r6]!
    vst1.8          {q6, q7}, [r5]!
    vst1.8          {q14, q15}, [r6]!

    add             r5, r5, lr              ;same columns, next border row
    sub             r5, r5, #128
    add             r6, r6, lr
    sub             r6, r6, #128

    bne             top_bottom_rows

    sub             r5, r1, r8              ;top border, next 128 columns
    add             r6, r2, lr              ;bottom border, next 128 columns

    subs            r12, r12, #1
    bne             copy_top_bottom_y

;=====================
;extra copy part for Y: stride columns left over after the 128-byte chunks
check_extra_top_bottom_y
    mov             r7, lr, lsr #4
    ands            r7, r7, #0x7            ;r7 = leftover 16-byte column groups
    beq             end_of_border_copy_y

extra_top_bottom_y
    vld1.8          {q0}, [r1]!
    vld1.8          {q2}, [r2]!

    mov             r9, r3, lsr #3          ;border rows, eight per pass

extra_top_bottom_rows
    subs            r9, r9, #1

    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    bne             extra_top_bottom_rows

    sub             r5, r1, r8              ;top border, next 16 columns
    add             r6, r2, lr              ;bottom border, next 16 columns
    subs            r7, r7, #1
    bne             extra_top_bottom_y

end_of_border_copy_y

    vpop            {d8 - d15}
    pop             {r4 - r11, pc}

    ENDP
    399 
    400 ;===========================================================
    401 ;In vp8cx_pick_filter_level(), call vp8_yv12_copy_frame_yonly
    402 ;without extend_frame_borders.
    ;-----------------------------------------------------------------------
    ; void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(
    ;         YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
    ;
    ; Copies the Y plane of src_ybc into dst_ybc and leaves the destination
    ; borders untouched.
    ;
    ; In:     r0 = src_ybc, r1 = dst_ybc
    ; Out:    none
    ; Saved:  r4-r11, lr and d8-d15 (q4-q7) are pushed on entry and
    ;         restored on exit
    ; Scratch: r0, r2, r3, r12, q0-q3, q8-q15, flags
    ;
    ; Assumptions (not checked here):
    ;   - y_width is a multiple of 16 (the tail loop counts down by 16)
    ;   - y_height is non-zero and even (rows are copied two at a time)
    ;
    ; y_width < 128 is handled: the 128-byte chunk loop is skipped and the
    ; whole row goes through the 16-byte tail loop.
    ;-----------------------------------------------------------------------
|vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon| PROC
    push            {r4 - r11, lr}
    vpush           {d8 - d15}

    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             r5, [r0, #yv12_buffer_config_y_width]
    ldr             r6, [r0, #yv12_buffer_config_y_stride]       ;src stride
    ldr             r7, [r1, #yv12_buffer_config_y_stride]       ;dst stride
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1

    ; The chunk loop below decrements its counter before testing it, so it
    ; must never be entered with a count of zero (it would wrap around and
    ; run off the end of both buffers).
    cmp             r5, #128
    blo             cp_src_to_dst_tail1

    ; copy two rows at one time
    mov             lr, r4, lsr #1

cp_src_to_dst_height_loop1
    mov             r8, r2                  ;src, even row
    mov             r9, r3                  ;dst, even row
    add             r10, r2, r6             ;src, odd row
    add             r11, r3, r7             ;dst, odd row
    mov             r12, r5, lsr #7         ;128-byte chunks per row (>= 1 here)

cp_src_to_dst_width_loop1
    vld1.8          {q0, q1}, [r8]!
    vld1.8          {q8, q9}, [r10]!
    vld1.8          {q2, q3}, [r8]!
    vld1.8          {q10, q11}, [r10]!
    vld1.8          {q4, q5}, [r8]!
    vld1.8          {q12, q13}, [r10]!
    vld1.8          {q6, q7}, [r8]!
    vld1.8          {q14, q15}, [r10]!

    subs            r12, r12, #1            ;flags survive the stores below

    vst1.8          {q0, q1}, [r9]!
    vst1.8          {q8, q9}, [r11]!
    vst1.8          {q2, q3}, [r9]!
    vst1.8          {q10, q11}, [r11]!
    vst1.8          {q4, q5}, [r9]!
    vst1.8          {q12, q13}, [r11]!
    vst1.8          {q6, q7}, [r9]!
    vst1.8          {q14, q15}, [r11]!

    bne             cp_src_to_dst_width_loop1

    subs            lr, lr, #1
    add             r2, r2, r6, lsl #1      ;advance src by two rows
    add             r3, r3, r7, lsl #1      ;advance dst by two rows

    bne             cp_src_to_dst_height_loop1

;=============================
;copy the columns left over after the 128-byte chunks, 16 bytes at a time
cp_src_to_dst_tail1
    ands            r10, r5, #0x7f          ;r10 = y_width mod 128
    beq             end_of_cp_src_to_dst1   ;nothing left over

    sub             r11, r5, r10            ;columns already copied
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
    add             r2, r2, r11
    add             r3, r3, r11

    mov             lr, r4, lsr #1          ;two rows at one time

extra_cp_src_to_dst_height_loop1
    mov             r8, r2                  ;src, even row
    mov             r9, r3                  ;dst, even row
    add             r0, r8, r6              ;src, odd row (src_ybc is no longer needed)
    add             r11, r9, r7             ;dst, odd row

    mov             r12, r10                ;bytes left in this row

extra_cp_src_to_dst_width_loop1
    vld1.8          {q0}, [r8]!
    vld1.8          {q1}, [r0]!

    subs            r12, r12, #16

    vst1.8          {q0}, [r9]!
    vst1.8          {q1}, [r11]!
    bne             extra_cp_src_to_dst_width_loop1

    subs            lr, lr, #1

    add             r2, r2, r6, lsl #1
    add             r3, r3, r7, lsl #1

    bne             extra_cp_src_to_dst_height_loop1

end_of_cp_src_to_dst1

    vpop            {d8 - d15}
    pop             {r4 - r11, pc}

    ENDP
    499 
    500     END
    501