Home | History | Annotate | Download | only in neon
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_yv12_extend_frame_borders_neon|    ; make the routine visible to other objects at link time
     13     ARM                                             ; assemble the following code as ARM (not Thumb) instructions
     14     REQUIRE8                                        ; this code requires an 8-byte aligned stack
     15     PRESERVE8                                       ; this code keeps the stack 8-byte aligned
     16 
     17     INCLUDE vpx_scale_asm_offsets.asm               ; presumably supplies the yv12_buffer_config_* offsets used below
     18 
     19     AREA ||.text||, CODE, READONLY, ALIGN=2         ; read-only code section, aligned to 2^2 = 4 bytes
     20 ;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf);
     21 ; we depend on VP8BORDERINPIXELS being 32
        ;
        ; Purpose:
        ;   Replicates the edge pixels of the Y, U and V planes described by ybf
        ;   outwards into the border area surrounding each plane. The Y plane is
        ;   extended by 32 bytes on the left/right and 32 rows on the top/bottom;
        ;   the U and V planes are extended by 16 bytes and 16 rows.
        ;
        ; In:     r0 = ybf. Only read; fields are fetched through the
        ;         yv12_buffer_config_* offsets.
        ; Out:    nothing in registers; the border bytes are written to memory.
        ; Saved:  r4-r10, lr and d8-d15 are pushed on entry and restored on exit.
        ; Scratch: r1-r3, r12, q0-q3, q8-q15 and the condition flags.
        ;
        ; Per-plane procedure:
        ;   1. Left/right: for every row, the first and last pixel are loaded
        ;      replicated across a vector and stored into the left and right
        ;      border of that row.
        ;   2. Top/bottom: the (already widened) first and last rows are copied
        ;      into every border row above and below the plane. The copy width
        ;      is plane_stride bytes, starting Border bytes left of the plane.
        ;
        ; Assumptions visible in the code (NOTE(review): confirm that the buffer
        ; allocator guarantees them):
        ;   - Row loops decrement their count before testing it, so y_height
        ;     must be a nonzero multiple of 4 and uv_height a nonzero multiple
        ;     of 8; a count of 0 would wrap around.
        ;   - Top/bottom copies move 128-byte then 16-byte chunks for Y and
        ;     64-byte then 8-byte chunks for U/V, so any part of y_stride beyond
        ;     a multiple of 16 (uv_stride: of 8) is not copied.
     22 
     23 |vp8_yv12_extend_frame_borders_neon| PROC
     24     push            {r4 - r10, lr}          ; core registers restored by the pop at the end
     25     vpush           {d8 - d15}              ; q4-q7 are used below, so d8-d15 are saved
     26 
     27     ; Border = 32
     28     ldr             r3, [r0, #yv12_buffer_config_y_width]  ; plane_width
     29     ldr             r1, [r0, #yv12_buffer_config_y_buffer] ; src_ptr1
     30     ldr             r4, [r0, #yv12_buffer_config_y_height] ; plane_height
     31     ldr             lr, [r0, #yv12_buffer_config_y_stride] ; plane_stride
     32 
     33 ; Border copy for Y plane
     34 ; copy the left and right most columns out
        ; r1 = first pixel of the current row, r2 = last pixel of the current row,
        ; r5 = start of the left border, r6 = start of the right border.
     35     add             r6, r1, r3              ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
     36     sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1
     37     sub             r5, r1, #32             ; dest_ptr1 = src_ptr1 - Border
     38 
     39     mov             r12, r4, lsr #2         ; plane_height / 4
     40 
        ; Each pass handles 4 rows. Every load reads one byte and replicates it
        ; across a 16-byte q register, then steps the pointer down one row (lr).
     41 copy_left_right_y
     42     vld1.8          {d0[], d1[]}, [r1], lr
     43     vld1.8          {d4[], d5[]}, [r2], lr
     44     vld1.8          {d8[], d9[]}, [r1], lr
     45     vld1.8          {d12[], d13[]}, [r2], lr
     46     vld1.8          {d16[], d17[]}, [r1], lr
     47     vld1.8          {d20[], d21[]}, [r2], lr
     48     vld1.8          {d24[], d25[]}, [r1], lr
     49     vld1.8          {d28[], d29[]}, [r2], lr
     50 
        ; Duplicate each 16-byte vector into the next q register so that every
        ; store below writes the full 32-byte border of one row.
     51     vmov            q1, q0
     52     vmov            q3, q2
     53     vmov            q5, q4
     54     vmov            q7, q6
     55     vmov            q9, q8
     56     vmov            q11, q10
     57     vmov            q13, q12
     58     vmov            q15, q14
     59 
        ; The flags set here are consumed by the bne after the stores; the NEON
        ; stores in between do not modify them.
     60     subs            r12, r12, #1
     61 
     62     vst1.8          {q0, q1}, [r5], lr
     63     vst1.8          {q2, q3}, [r6], lr
     64     vst1.8          {q4, q5}, [r5], lr
     65     vst1.8          {q6, q7}, [r6], lr
     66     vst1.8          {q8, q9}, [r5], lr
     67     vst1.8          {q10, q11}, [r6], lr
     68     vst1.8          {q12, q13}, [r5], lr
     69     vst1.8          {q14, q15}, [r6], lr
     70 
     71     bne             copy_left_right_y
     72 
     73 ;Now copy the top and bottom source lines into each line of the respective borders
        ; r1 = start of the first row including its left border (source, top),
        ; r2 = start of the last row including its left border (source, bottom),
        ; r5 = same column, Border rows above the first row (destination, top),
        ; r6 = same column, first row below the last row (destination, bottom).
     74     ldr             r1, [r0, #yv12_buffer_config_y_buffer] ; y_buffer
     75     mul             r8, r4, lr              ; plane_height * plane_stride
     76 
     77     ; copy width is plane_stride
     78     movs            r12, lr, lsr #7         ; plane_stride / 128
     79 
        ; The sub/add instructions below have no S suffix, so the flags produced
        ; by the movs above are still intact when the ble is reached.
     80     sub             r1, r1, #32             ; src_ptr1 = y_buffer - Border
     81     add             r6, r1, r8              ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride))
     82     sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
     83     sub             r5, r1, lr, asl #5      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
        ; NOTE(review): ble is taken when Z is set or N != V. The movs with an lsr
        ; operand writes N (always 0 here), Z and C but not V, so this behaves as
        ; "branch if the chunk count is 0" only while V is clear, which is how the
        ; final subs of the loop above leaves it. beq would state the intent
        ; directly.
     84     ble             extra_y_copy_needed     ; plane stride < 128
     85 
        ; One pass per 128-byte column block: load 128 bytes of the top row into
        ; q0-q7 and 128 bytes of the bottom row into q8-q15.
     86 copy_top_bottom_y
     87     vld1.8          {q0, q1}, [r1]!
     88     vld1.8          {q8, q9}, [r2]!
     89     vld1.8          {q2, q3}, [r1]!
     90     vld1.8          {q10, q11}, [r2]!
     91     vld1.8          {q4, q5}, [r1]!
     92     vld1.8          {q12, q13}, [r2]!
     93     vld1.8          {q6, q7}, [r1]!
     94     vld1.8          {q14, q15}, [r2]!
     95 
     96     mov             r7, #32                 ; Border
     97 
        ; Write the loaded block into each of the 32 border rows above (r5) and
        ; below (r6) the plane.
     98 top_bottom_32
     99     subs            r7, r7, #1
    100 
    101     vst1.8          {q0, q1}, [r5]!
    102     vst1.8          {q8, q9}, [r6]!
    103     vst1.8          {q2, q3}, [r5]!
    104     vst1.8          {q10, q11}, [r6]!
    105     vst1.8          {q4, q5}, [r5]!
    106     vst1.8          {q12, q13}, [r6]!
    107     vst1.8          {q6, q7}, [r5]!
    108     vst1.8          {q14, q15}, [r6]!
    109 
        ; The four stores advanced each pointer by 128 bytes; step to the same
        ; column block in the next row. None of these sets flags, so the bne
        ; still tests the subs at the top of the loop.
    110     add             r5, r5, lr              ; dest_ptr1 += plane_stride
    111     sub             r5, r5, #128            ; dest_ptr1 -= 128
    112     add             r6, r6, lr              ; dest_ptr2 += plane_stride
    113     sub             r6, r6, #128            ; dest_ptr2 -= 128
    114 
    115     bne             top_bottom_32
    116 
        ; r1 and r2 were advanced by 128 during the loads, so these recompute the
        ; destination pointers for the next column block.
    117     sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border* plane_stride)
    118     add             r6, r2, lr              ; src_ptr2 + plane_stride
    119 
    120     subs            r12, r12, #1
    121     bne             copy_top_bottom_y
    122 
        ; r7 = (plane_stride / 16) & 7: the number of 16-byte chunks left after the
        ; 128-byte blocks. If nonzero they are copied by extra_top_bottom_y, which
        ; branches back to end_of_border_copy_y.
    123 extra_y_copy_needed
    124     mov             r7, lr, lsr #4          ; check to see if extra copy is needed
    125     ands            r7, r7, #0x7
    126     bne             extra_top_bottom_y
    127 end_of_border_copy_y
    128 
    129 ;Border copy for U, V planes
    130 ; Border = 16
    131     ldr             r7, [r0, #yv12_buffer_config_u_buffer]  ; src_ptr1
    132     ldr             lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride
    133     ldr             r3, [r0, #yv12_buffer_config_uv_width]  ; plane_width
    134     ldr             r4, [r0, #yv12_buffer_config_uv_height] ; plane_height
    135 
        ; r10 counts the chroma planes: the code from border_copy_uv onwards runs
        ; once with r7 = u_buffer and once more with r7 = v_buffer.
    136     mov             r10, #2
    137 
    138 ;copy the left and right most columns out
    139 border_copy_uv
    140     mov             r1, r7                  ; src_ptr1 needs to be saved for second half of loop
    141     sub             r5, r1, #16             ; dest_ptr1 = src_ptr1 - Border
    142     add             r6, r1, r3              ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
    143     sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1
    144 
    145     mov             r12, r4, lsr #3         ; plane_height / 8
    146 
        ; Each pass handles 8 rows. The border is 16 bytes wide, so one replicated
        ; q register per row and side is enough: even q registers hold the left
        ; edge pixel, odd q registers the right edge pixel.
    147 copy_left_right_uv
    148     vld1.8          {d0[], d1[]}, [r1], lr
    149     vld1.8          {d2[], d3[]}, [r2], lr
    150     vld1.8          {d4[], d5[]}, [r1], lr
    151     vld1.8          {d6[], d7[]}, [r2], lr
    152     vld1.8          {d8[], d9[]},  [r1], lr
    153     vld1.8          {d10[], d11[]}, [r2], lr
    154     vld1.8          {d12[], d13[]}, [r1], lr
    155     vld1.8          {d14[], d15[]}, [r2], lr
    156     vld1.8          {d16[], d17[]}, [r1], lr
    157     vld1.8          {d18[], d19[]}, [r2], lr
    158     vld1.8          {d20[], d21[]}, [r1], lr
    159     vld1.8          {d22[], d23[]}, [r2], lr
    160     vld1.8          {d24[], d25[]}, [r1], lr
    161     vld1.8          {d26[], d27[]}, [r2], lr
    162     vld1.8          {d28[], d29[]}, [r1], lr
    163     vld1.8          {d30[], d31[]}, [r2], lr
    164 
        ; Flags from this subs are consumed by the bne after the stores.
    165     subs            r12, r12, #1
    166 
    167     vst1.8          {q0}, [r5], lr
    168     vst1.8          {q1}, [r6], lr
    169     vst1.8          {q2}, [r5], lr
    170     vst1.8          {q3}, [r6], lr
    171     vst1.8          {q4}, [r5], lr
    172     vst1.8          {q5}, [r6], lr
    173     vst1.8          {q6}, [r5], lr
    174     vst1.8          {q7}, [r6], lr
    175     vst1.8          {q8}, [r5], lr
    176     vst1.8          {q9}, [r6], lr
    177     vst1.8          {q10}, [r5], lr
    178     vst1.8          {q11}, [r6], lr
    179     vst1.8          {q12}, [r5], lr
    180     vst1.8          {q13}, [r6], lr
    181     vst1.8          {q14}, [r5], lr
    182     vst1.8          {q15}, [r6], lr
    183 
    184     bne             copy_left_right_uv
    185 
    186 ;Now copy the top and bottom source lines into each line of the respective borders
        ; Same scheme as for the Y plane, with a 16-row border and 64-byte blocks.
        ; r7 is copied to r1 here and afterwards reused as a counter; it is
        ; reloaded with v_buffer before the second pass.
    187     mov             r1, r7
    188     mul             r8, r4, lr              ; plane_height * plane_stride
    189     movs            r12, lr, lsr #6         ; plane_stride / 64
    190 
    191     sub             r1, r1, #16             ; src_ptr1 = u_buffer - Border
    192     add             r6, r1, r8              ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride))
    193     sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
    194     sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
        ; NOTE(review): as in the Y path, ble tests the flags of the movs above
        ; and relies on V being clear; beq would state the intent directly.
    195     ble             extra_uv_copy_needed    ; plane_stride < 64
    196 
        ; One pass per 64-byte column block: q0-q3 hold the top row bytes,
        ; q8-q11 the bottom row bytes.
    197 copy_top_bottom_uv
    198     vld1.8          {q0, q1}, [r1]!
    199     vld1.8          {q8, q9}, [r2]!
    200     vld1.8          {q2, q3}, [r1]!
    201     vld1.8          {q10, q11}, [r2]!
    202 
    203     mov             r7, #16                 ; Border
    204 
    205 top_bottom_16
    206     subs            r7, r7, #1
    207 
    208     vst1.8          {q0, q1}, [r5]!
    209     vst1.8          {q8, q9}, [r6]!
    210     vst1.8          {q2, q3}, [r5]!
    211     vst1.8          {q10, q11}, [r6]!
    212 
        ; The two stores advanced each pointer by 64 bytes; step to the same
        ; column block in the next row without touching the flags.
    213     add             r5, r5, lr              ; dest_ptr1 += plane_stride
    214     sub             r5, r5, #64
    215     add             r6, r6, lr              ; dest_ptr2 += plane_stride
    216     sub             r6, r6, #64
    217 
    218     bne             top_bottom_16
    219 
    220     sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
    221     add             r6, r2, lr              ; dest_ptr2 = src_ptr2 + plane_stride
    222 
    223     subs            r12, r12, #1
    224     bne             copy_top_bottom_uv
        ; r7 = (plane_stride / 8) & 7: the number of 8-byte chunks left after the
        ; 64-byte blocks. If nonzero they are copied by extra_top_bottom_uv, which
        ; branches back to end_of_border_copy_uv.
    225 extra_uv_copy_needed
    226     mov             r7, lr, lsr #3          ; check to see if extra copy is needed
    227     ands            r7, r7, #0x7
    228     bne             extra_top_bottom_uv
    229 
        ; After the first (U) pass r10 is still nonzero: load the V plane pointer
        ; and repeat. After the second pass fall through to the epilogue.
    230 end_of_border_copy_uv
    231     subs            r10, r10, #1
    232     ldrne           r7, [r0, #yv12_buffer_config_v_buffer] ; src_ptr1
    233     bne             border_copy_uv
    234 
    235     vpop            {d8 - d15}
    236     pop             {r4 - r10, pc}          ; restores the saved registers and returns (saved lr is popped into pc)
    237 
    238 ;;;;;;;;;;;;;;;;;;;;;;
        ; Y plane tail: one pass per remaining 16-byte chunk (count in r7).
        ; q0 = 16 bytes of the top row, q2 = 16 bytes of the bottom row.
    239 extra_top_bottom_y
    240     vld1.8          {q0}, [r1]!
    241     vld1.8          {q2}, [r2]!
    242 
    243     mov             r9, #4                  ; 32 >> 3
    244 
        ; 4 passes of 8 store pairs fill the 32 border rows; each store steps its
        ; pointer down one row (lr), so the column position is unchanged.
    245 extra_top_bottom_32
    246     subs            r9, r9, #1
    247 
    248     vst1.8          {q0}, [r5], lr
    249     vst1.8          {q2}, [r6], lr
    250     vst1.8          {q0}, [r5], lr
    251     vst1.8          {q2}, [r6], lr
    252     vst1.8          {q0}, [r5], lr
    253     vst1.8          {q2}, [r6], lr
    254     vst1.8          {q0}, [r5], lr
    255     vst1.8          {q2}, [r6], lr
    256     vst1.8          {q0}, [r5], lr
    257     vst1.8          {q2}, [r6], lr
    258     vst1.8          {q0}, [r5], lr
    259     vst1.8          {q2}, [r6], lr
    260     vst1.8          {q0}, [r5], lr
    261     vst1.8          {q2}, [r6], lr
    262     vst1.8          {q0}, [r5], lr
    263     vst1.8          {q2}, [r6], lr
    264     bne             extra_top_bottom_32
    265 
        ; r1 and r2 were advanced by 16 during the loads, so these recompute the
        ; destination pointers for the next 16-byte chunk.
    266     sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border * plane_stride)
    267     add             r6, r2, lr              ; src_ptr2 + plane_stride
    268     subs            r7, r7, #1
    269     bne             extra_top_bottom_y
    270 
    271     b               end_of_border_copy_y
    272 
        ; U/V plane tail: one pass per remaining 8-byte chunk (count in r7).
        ; d0 = 8 bytes of the top row, d8 = 8 bytes of the bottom row.
    273 extra_top_bottom_uv
    274     vld1.8          {d0}, [r1]!
    275     vld1.8          {d8}, [r2]!
    276 
    277     mov             r9, #2                  ; 16 >> 3
    278 
        ; 2 passes of 8 store pairs fill the 16 border rows.
    279 extra_top_bottom_16
    280     subs            r9, r9, #1
    281 
    282     vst1.8          {d0}, [r5], lr
    283     vst1.8          {d8}, [r6], lr
    284     vst1.8          {d0}, [r5], lr
    285     vst1.8          {d8}, [r6], lr
    286     vst1.8          {d0}, [r5], lr
    287     vst1.8          {d8}, [r6], lr
    288     vst1.8          {d0}, [r5], lr
    289     vst1.8          {d8}, [r6], lr
    290     vst1.8          {d0}, [r5], lr
    291     vst1.8          {d8}, [r6], lr
    292     vst1.8          {d0}, [r5], lr
    293     vst1.8          {d8}, [r6], lr
    294     vst1.8          {d0}, [r5], lr
    295     vst1.8          {d8}, [r6], lr
    296     vst1.8          {d0}, [r5], lr
    297     vst1.8          {d8}, [r6], lr
    298     bne             extra_top_bottom_16
    299 
        ; r1 and r2 were advanced by 8 during the loads, so these recompute the
        ; destination pointers for the next 8-byte chunk.
    300     sub             r5, r1, lr, asl #4      ; src_ptr1 - (Border * plane_stride)
    301     add             r6, r2, lr              ; src_ptr2 + plane_stride
    302     subs            r7, r7, #1
    303     bne             extra_top_bottom_uv
    304 
    305     b               end_of_border_copy_uv
    306 
    307     ENDP
    308     END
    309