;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


     12     EXPORT  |vp8_yv12_copy_src_frame_func_neon|
     13     ARM
     14     REQUIRE8
     15     PRESERVE8
     16 
     17     INCLUDE vpx_asm_offsets.asm
     18 
     19     AREA ||.text||, CODE, READONLY, ALIGN=2
     20 ;Note: This function is used to copy source data in src_buffer[i] at beginning of
     21 ;the encoding. The buffer has a width and height of cpi->oxcf.Width and cpi->oxcf.Height,
     22 ;which can be ANY numbers(NOT always multiples of 16 or 4).
     23 
     24 ;void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
     25 
     26 |vp8_yv12_copy_src_frame_func_neon| PROC
     27     push            {r4 - r11, lr}
     28     vpush           {d8 - d15}
     29 
     30     ;Copy Y plane
     31     ldr             r4, [r0, #yv12_buffer_config_y_height]
     32     ldr             r5, [r0, #yv12_buffer_config_y_width]
     33     ldr             r6, [r0, #yv12_buffer_config_y_stride]
     34     ldr             r7, [r1, #yv12_buffer_config_y_stride]
     35     ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
     36     ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
     37 
     38     add             r10, r2, r6             ;second row src
     39     add             r11, r3, r7             ;second row dst
     40     mov             r6, r6, lsl #1
     41     mov             r7, r7, lsl #1
     42     sub             r6, r6, r5              ;adjust stride
     43     sub             r7, r7, r5
     44 
     45     ; copy two rows at one time
     46     mov             lr, r4, lsr #1
     47 
     48 cp_src_to_dst_height_loop
     49     mov             r12, r5
     50 
     51 cp_width_128_loop
     52     vld1.8          {q0, q1}, [r2]!
     53     vld1.8          {q4, q5}, [r10]!
     54     vld1.8          {q2, q3}, [r2]!
     55     vld1.8          {q6, q7}, [r10]!
     56     vld1.8          {q8, q9}, [r2]!
     57     vld1.8          {q12, q13}, [r10]!
     58     vld1.8          {q10, q11}, [r2]!
     59     vld1.8          {q14, q15}, [r10]!
     60     sub             r12, r12, #128
     61     cmp             r12, #128
     62     vst1.8          {q0, q1}, [r3]!
     63     vst1.8          {q4, q5}, [r11]!
     64     vst1.8          {q2, q3}, [r3]!
     65     vst1.8          {q6, q7}, [r11]!
     66     vst1.8          {q8, q9}, [r3]!
     67     vst1.8          {q12, q13}, [r11]!
     68     vst1.8          {q10, q11}, [r3]!
     69     vst1.8          {q14, q15}, [r11]!
     70     bhs             cp_width_128_loop
     71 
     72     cmp             r12, #0
     73     beq             cp_width_done
     74 
     75 cp_width_8_loop
     76     vld1.8          {d0}, [r2]!
     77     vld1.8          {d1}, [r10]!
     78     sub             r12, r12, #8
     79     cmp             r12, #8
     80     vst1.8          {d0}, [r3]!
     81     vst1.8          {d1}, [r11]!
     82     bhs             cp_width_8_loop
     83 
     84     cmp             r12, #0
     85     beq             cp_width_done
     86 
     87 cp_width_1_loop
     88     ldrb            r8, [r2], #1
     89     subs            r12, r12, #1
     90     strb            r8, [r3], #1
     91     ldrb            r8, [r10], #1
     92     strb            r8, [r11], #1
     93     bne             cp_width_1_loop
     94 
     95 cp_width_done
     96     subs            lr, lr, #1
     97     add             r2, r2, r6
     98     add             r3, r3, r7
     99     add             r10, r10, r6
    100     add             r11, r11, r7
    101     bne             cp_src_to_dst_height_loop
    102 
    103 ;copy last line for Y if y_height is odd
    104     tst             r4, #1
    105     beq             cp_width_done_1
    106     mov             r12, r5
    107 
    108 cp_width_128_loop_1
    109     vld1.8          {q0, q1}, [r2]!
    110     vld1.8          {q2, q3}, [r2]!
    111     vld1.8          {q8, q9}, [r2]!
    112     vld1.8          {q10, q11}, [r2]!
    113     sub             r12, r12, #128
    114     cmp             r12, #128
    115     vst1.8          {q0, q1}, [r3]!
    116     vst1.8          {q2, q3}, [r3]!
    117     vst1.8          {q8, q9}, [r3]!
    118     vst1.8          {q10, q11}, [r3]!
    119     bhs             cp_width_128_loop_1
    120 
    121     cmp             r12, #0
    122     beq             cp_width_done_1
    123 
    124 cp_width_8_loop_1
    125     vld1.8          {d0}, [r2]!
    126     sub             r12, r12, #8
    127     cmp             r12, #8
    128     vst1.8          {d0}, [r3]!
    129     bhs             cp_width_8_loop_1
    130 
    131     cmp             r12, #0
    132     beq             cp_width_done_1
    133 
    134 cp_width_1_loop_1
    135     ldrb            r8, [r2], #1
    136     subs            r12, r12, #1
    137     strb            r8, [r3], #1
    138     bne             cp_width_1_loop_1
    139 cp_width_done_1
    140 
    141 ;Copy U & V planes
    142     ldr             r4, [r0, #yv12_buffer_config_uv_height]
    143     ldr             r5, [r0, #yv12_buffer_config_uv_width]
    144     ldr             r6, [r0, #yv12_buffer_config_uv_stride]
    145     ldr             r7, [r1, #yv12_buffer_config_uv_stride]
    146     ldr             r2, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
    147     ldr             r3, [r1, #yv12_buffer_config_u_buffer]       ;dstptr1
    148 
    149     add             r10, r2, r6             ;second row src
    150     add             r11, r3, r7             ;second row dst
    151     mov             r6, r6, lsl #1
    152     mov             r7, r7, lsl #1
    153     sub             r6, r6, r5              ;adjust stride
    154     sub             r7, r7, r5
    155 
    156     mov             r9, #2
    157 
    158 cp_uv_loop
    159     ;copy two rows at one time
    160     mov             lr, r4, lsr #1
    161 
    162 cp_src_to_dst_height_uv_loop
    163     mov             r12, r5
    164 
    165 cp_width_uv_64_loop
    166     vld1.8          {q0, q1}, [r2]!
    167     vld1.8          {q4, q5}, [r10]!
    168     vld1.8          {q2, q3}, [r2]!
    169     vld1.8          {q6, q7}, [r10]!
    170     sub             r12, r12, #64
    171     cmp             r12, #64
    172     vst1.8          {q0, q1}, [r3]!
    173     vst1.8          {q4, q5}, [r11]!
    174     vst1.8          {q2, q3}, [r3]!
    175     vst1.8          {q6, q7}, [r11]!
    176     bhs             cp_width_uv_64_loop
    177 
    178     cmp             r12, #0
    179     beq             cp_width_uv_done
    180 
    181 cp_width_uv_8_loop
    182     vld1.8          {d0}, [r2]!
    183     vld1.8          {d1}, [r10]!
    184     sub             r12, r12, #8
    185     cmp             r12, #8
    186     vst1.8          {d0}, [r3]!
    187     vst1.8          {d1}, [r11]!
    188     bhs             cp_width_uv_8_loop
    189 
    190     cmp             r12, #0
    191     beq             cp_width_uv_done
    192 
    193 cp_width_uv_1_loop
    194     ldrb            r8, [r2], #1
    195     subs            r12, r12, #1
    196     strb            r8, [r3], #1
    197     ldrb            r8, [r10], #1
    198     strb            r8, [r11], #1
    199     bne             cp_width_uv_1_loop
    200 
    201 cp_width_uv_done
    202     subs            lr, lr, #1
    203     add             r2, r2, r6
    204     add             r3, r3, r7
    205     add             r10, r10, r6
    206     add             r11, r11, r7
    207     bne             cp_src_to_dst_height_uv_loop
    208 
    209 ;copy last line for U & V if uv_height is odd
    210     tst             r4, #1
    211     beq             cp_width_uv_done_1
    212     mov             r12, r5
    213 
    214 cp_width_uv_64_loop_1
    215     vld1.8          {q0, q1}, [r2]!
    216     vld1.8          {q2, q3}, [r2]!
    217     sub             r12, r12, #64
    218     cmp             r12, #64
    219     vst1.8          {q0, q1}, [r3]!
    220     vst1.8          {q2, q3}, [r3]!
    221     bhs             cp_width_uv_64_loop_1
    222 
    223     cmp             r12, #0
    224     beq             cp_width_uv_done_1
    225 
    226 cp_width_uv_8_loop_1
    227     vld1.8          {d0}, [r2]!
    228     sub             r12, r12, #8
    229     cmp             r12, #8
    230     vst1.8          {d0}, [r3]!
    231     bhs             cp_width_uv_8_loop_1
    232 
    233     cmp             r12, #0
    234     beq             cp_width_uv_done_1
    235 
    236 cp_width_uv_1_loop_1
    237     ldrb            r8, [r2], #1
    238     subs            r12, r12, #1
    239     strb            r8, [r3], #1
    240     bne             cp_width_uv_1_loop_1
    241 cp_width_uv_done_1
    242 
    243     subs            r9, r9, #1
    244     ldrne           r2, [r0, #yv12_buffer_config_v_buffer]      ;srcptr1
    245     ldrne           r3, [r1, #yv12_buffer_config_v_buffer]      ;dstptr1
    246     ldrne           r10, [r0, #yv12_buffer_config_uv_stride]
    247     ldrne           r11, [r1, #yv12_buffer_config_uv_stride]
    248 
    249     addne           r10, r2, r10                ;second row src
    250     addne           r11, r3, r11                ;second row dst
    251 
    252     bne             cp_uv_loop
    253 
    254     vpop            {d8 - d15}
    255     pop             {r4 - r11, pc}
    256 
    257     ENDP
    258     END
    259