; Home | History | Annotate | Download | only in neon
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


     12     EXPORT  |vp8_build_intra_predictors_mby_neon_func|
     13     EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|
     14 
     15     ARM
     16     REQUIRE8
     17     PRESERVE8
     18 
     19     AREA ||.text||, CODE, READONLY, ALIGN=2
     20 ; r0    unsigned char *y_buffer
     21 ; r1    unsigned char *ypred_ptr
     22 ; r2    int y_stride
     23 ; r3    int mode
     24 ; stack int Up
     25 ; stack int Left
     26 
     27 |vp8_build_intra_predictors_mby_neon_func| PROC
     28     push            {r4-r8, lr}
     29 
     30     cmp             r3, #0
     31     beq             case_dc_pred
     32     cmp             r3, #1
     33     beq             case_v_pred
     34     cmp             r3, #2
     35     beq             case_h_pred
     36     cmp             r3, #3
     37     beq             case_tm_pred
     38 
     39 case_dc_pred
     40     ldr             r4, [sp, #24]       ; Up
     41     ldr             r5, [sp, #28]       ; Left
     42 
     43     ; Default the DC average to 128
     44     mov             r12, #128
     45     vdup.u8         q0, r12
     46 
     47     ; Zero out running sum
     48     mov             r12, #0
     49 
     50     ; compute shift and jump
     51     adds            r7, r4, r5
     52     beq             skip_dc_pred_up_left
     53 
     54     ; Load above row, if it exists
     55     cmp             r4, #0
     56     beq             skip_dc_pred_up
     57 
     58     sub             r6, r0, r2
     59     vld1.8          {q1}, [r6]
     60     vpaddl.u8       q2, q1
     61     vpaddl.u16      q3, q2
     62     vpaddl.u32      q4, q3
     63 
     64     vmov.32         r4, d8[0]
     65     vmov.32         r6, d9[0]
     66 
     67     add             r12, r4, r6
     68 
     69     ; Move back to interger registers
     70 
     71 skip_dc_pred_up
     72 
     73     cmp             r5, #0
     74     beq             skip_dc_pred_left
     75 
     76     sub             r0, r0, #1
     77 
     78     ; Load left row, if it exists
     79     ldrb            r3, [r0], r2
     80     ldrb            r4, [r0], r2
     81     ldrb            r5, [r0], r2
     82     ldrb            r6, [r0], r2
     83 
     84     add             r12, r12, r3
     85     add             r12, r12, r4
     86     add             r12, r12, r5
     87     add             r12, r12, r6
     88 
     89     ldrb            r3, [r0], r2
     90     ldrb            r4, [r0], r2
     91     ldrb            r5, [r0], r2
     92     ldrb            r6, [r0], r2
     93 
     94     add             r12, r12, r3
     95     add             r12, r12, r4
     96     add             r12, r12, r5
     97     add             r12, r12, r6
     98 
     99     ldrb            r3, [r0], r2
    100     ldrb            r4, [r0], r2
    101     ldrb            r5, [r0], r2
    102     ldrb            r6, [r0], r2
    103 
    104     add             r12, r12, r3
    105     add             r12, r12, r4
    106     add             r12, r12, r5
    107     add             r12, r12, r6
    108 
    109     ldrb            r3, [r0], r2
    110     ldrb            r4, [r0], r2
    111     ldrb            r5, [r0], r2
    112     ldrb            r6, [r0]
    113 
    114     add             r12, r12, r3
    115     add             r12, r12, r4
    116     add             r12, r12, r5
    117     add             r12, r12, r6
    118 
    119 skip_dc_pred_left
    120     add             r7, r7, #3          ; Shift
    121     sub             r4, r7, #1
    122     mov             r5, #1
    123     add             r12, r12, r5, lsl r4
    124     mov             r5, r12, lsr r7     ; expected_dc
    125 
    126     vdup.u8         q0, r5
    127 
    128 skip_dc_pred_up_left
    129     vst1.u8         {q0}, [r1]!
    130     vst1.u8         {q0}, [r1]!
    131     vst1.u8         {q0}, [r1]!
    132     vst1.u8         {q0}, [r1]!
    133     vst1.u8         {q0}, [r1]!
    134     vst1.u8         {q0}, [r1]!
    135     vst1.u8         {q0}, [r1]!
    136     vst1.u8         {q0}, [r1]!
    137     vst1.u8         {q0}, [r1]!
    138     vst1.u8         {q0}, [r1]!
    139     vst1.u8         {q0}, [r1]!
    140     vst1.u8         {q0}, [r1]!
    141     vst1.u8         {q0}, [r1]!
    142     vst1.u8         {q0}, [r1]!
    143     vst1.u8         {q0}, [r1]!
    144     vst1.u8         {q0}, [r1]!
    145 
    146     pop             {r4-r8,pc}
    147 case_v_pred
    148     ; Copy down above row
    149     sub             r6, r0, r2
    150     vld1.8          {q0}, [r6]
    151 
    152     vst1.u8         {q0}, [r1]!
    153     vst1.u8         {q0}, [r1]!
    154     vst1.u8         {q0}, [r1]!
    155     vst1.u8         {q0}, [r1]!
    156     vst1.u8         {q0}, [r1]!
    157     vst1.u8         {q0}, [r1]!
    158     vst1.u8         {q0}, [r1]!
    159     vst1.u8         {q0}, [r1]!
    160     vst1.u8         {q0}, [r1]!
    161     vst1.u8         {q0}, [r1]!
    162     vst1.u8         {q0}, [r1]!
    163     vst1.u8         {q0}, [r1]!
    164     vst1.u8         {q0}, [r1]!
    165     vst1.u8         {q0}, [r1]!
    166     vst1.u8         {q0}, [r1]!
    167     vst1.u8         {q0}, [r1]!
    168     pop             {r4-r8,pc}
    169 
    170 case_h_pred
    171     ; Load 4x yleft_col
    172     sub             r0, r0, #1
    173 
    174     ldrb            r3, [r0], r2
    175     ldrb            r4, [r0], r2
    176     ldrb            r5, [r0], r2
    177     ldrb            r6, [r0], r2
    178     vdup.u8         q0, r3
    179     vdup.u8         q1, r4
    180     vdup.u8         q2, r5
    181     vdup.u8         q3, r6
    182     vst1.u8         {q0}, [r1]!
    183     vst1.u8         {q1}, [r1]!
    184     vst1.u8         {q2}, [r1]!
    185     vst1.u8         {q3}, [r1]!
    186 
    187     ldrb            r3, [r0], r2
    188     ldrb            r4, [r0], r2
    189     ldrb            r5, [r0], r2
    190     ldrb            r6, [r0], r2
    191     vdup.u8         q0, r3
    192     vdup.u8         q1, r4
    193     vdup.u8         q2, r5
    194     vdup.u8         q3, r6
    195     vst1.u8         {q0}, [r1]!
    196     vst1.u8         {q1}, [r1]!
    197     vst1.u8         {q2}, [r1]!
    198     vst1.u8         {q3}, [r1]!
    199 
    200 
    201     ldrb            r3, [r0], r2
    202     ldrb            r4, [r0], r2
    203     ldrb            r5, [r0], r2
    204     ldrb            r6, [r0], r2
    205     vdup.u8         q0, r3
    206     vdup.u8         q1, r4
    207     vdup.u8         q2, r5
    208     vdup.u8         q3, r6
    209     vst1.u8         {q0}, [r1]!
    210     vst1.u8         {q1}, [r1]!
    211     vst1.u8         {q2}, [r1]!
    212     vst1.u8         {q3}, [r1]!
    213 
    214     ldrb            r3, [r0], r2
    215     ldrb            r4, [r0], r2
    216     ldrb            r5, [r0], r2
    217     ldrb            r6, [r0], r2
    218     vdup.u8         q0, r3
    219     vdup.u8         q1, r4
    220     vdup.u8         q2, r5
    221     vdup.u8         q3, r6
    222     vst1.u8         {q0}, [r1]!
    223     vst1.u8         {q1}, [r1]!
    224     vst1.u8         {q2}, [r1]!
    225     vst1.u8         {q3}, [r1]!
    226 
    227     pop             {r4-r8,pc}
    228 
    229 case_tm_pred
    230     ; Load yabove_row
    231     sub             r3, r0, r2
    232     vld1.8          {q8}, [r3]
    233 
    234     ; Load ytop_left
    235     sub             r3, r3, #1
    236     ldrb            r7, [r3]
    237 
    238     vdup.u16        q7, r7
    239 
    240     ; Compute yabove_row - ytop_left
    241     mov             r3, #1
    242     vdup.u8         q0, r3
    243 
    244     vmull.u8        q4, d16, d0
    245     vmull.u8        q5, d17, d0
    246 
    247     vsub.s16        q4, q4, q7
    248     vsub.s16        q5, q5, q7
    249 
    250     ; Load 4x yleft_col
    251     sub             r0, r0, #1
    252     mov             r12, #4
    253 
    254 case_tm_pred_loop
    255     ldrb            r3, [r0], r2
    256     ldrb            r4, [r0], r2
    257     ldrb            r5, [r0], r2
    258     ldrb            r6, [r0], r2
    259     vdup.u16        q0, r3
    260     vdup.u16        q1, r4
    261     vdup.u16        q2, r5
    262     vdup.u16        q3, r6
    263 
    264     vqadd.s16       q8, q0, q4
    265     vqadd.s16       q9, q0, q5
    266 
    267     vqadd.s16       q10, q1, q4
    268     vqadd.s16       q11, q1, q5
    269 
    270     vqadd.s16       q12, q2, q4
    271     vqadd.s16       q13, q2, q5
    272 
    273     vqadd.s16       q14, q3, q4
    274     vqadd.s16       q15, q3, q5
    275 
    276     vqshrun.s16     d0, q8, #0
    277     vqshrun.s16     d1, q9, #0
    278 
    279     vqshrun.s16     d2, q10, #0
    280     vqshrun.s16     d3, q11, #0
    281 
    282     vqshrun.s16     d4, q12, #0
    283     vqshrun.s16     d5, q13, #0
    284 
    285     vqshrun.s16     d6, q14, #0
    286     vqshrun.s16     d7, q15, #0
    287 
    288     vst1.u8         {q0}, [r1]!
    289     vst1.u8         {q1}, [r1]!
    290     vst1.u8         {q2}, [r1]!
    291     vst1.u8         {q3}, [r1]!
    292 
    293     subs            r12, r12, #1
    294     bne             case_tm_pred_loop
    295 
    296     pop             {r4-r8,pc}
    297 
    298     ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left

    308 |vp8_build_intra_predictors_mby_s_neon_func| PROC
    309     push            {r4-r8, lr}
    310 
    311     mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
    312 
    313     cmp             r3, #0
    314     beq             case_dc_pred_s
    315     cmp             r3, #1
    316     beq             case_v_pred_s
    317     cmp             r3, #2
    318     beq             case_h_pred_s
    319     cmp             r3, #3
    320     beq             case_tm_pred_s
    321 
    322 case_dc_pred_s
    323     ldr             r4, [sp, #24]       ; Up
    324     ldr             r5, [sp, #28]       ; Left
    325 
    326     ; Default the DC average to 128
    327     mov             r12, #128
    328     vdup.u8         q0, r12
    329 
    330     ; Zero out running sum
    331     mov             r12, #0
    332 
    333     ; compute shift and jump
    334     adds            r7, r4, r5
    335     beq             skip_dc_pred_up_left_s
    336 
    337     ; Load above row, if it exists
    338     cmp             r4, #0
    339     beq             skip_dc_pred_up_s
    340 
    341     sub             r6, r0, r2
    342     vld1.8          {q1}, [r6]
    343     vpaddl.u8       q2, q1
    344     vpaddl.u16      q3, q2
    345     vpaddl.u32      q4, q3
    346 
    347     vmov.32         r4, d8[0]
    348     vmov.32         r6, d9[0]
    349 
    350     add             r12, r4, r6
    351 
    352     ; Move back to interger registers
    353 
    354 skip_dc_pred_up_s
    355 
    356     cmp             r5, #0
    357     beq             skip_dc_pred_left_s
    358 
    359     sub             r0, r0, #1
    360 
    361     ; Load left row, if it exists
    362     ldrb            r3, [r0], r2
    363     ldrb            r4, [r0], r2
    364     ldrb            r5, [r0], r2
    365     ldrb            r6, [r0], r2
    366 
    367     add             r12, r12, r3
    368     add             r12, r12, r4
    369     add             r12, r12, r5
    370     add             r12, r12, r6
    371 
    372     ldrb            r3, [r0], r2
    373     ldrb            r4, [r0], r2
    374     ldrb            r5, [r0], r2
    375     ldrb            r6, [r0], r2
    376 
    377     add             r12, r12, r3
    378     add             r12, r12, r4
    379     add             r12, r12, r5
    380     add             r12, r12, r6
    381 
    382     ldrb            r3, [r0], r2
    383     ldrb            r4, [r0], r2
    384     ldrb            r5, [r0], r2
    385     ldrb            r6, [r0], r2
    386 
    387     add             r12, r12, r3
    388     add             r12, r12, r4
    389     add             r12, r12, r5
    390     add             r12, r12, r6
    391 
    392     ldrb            r3, [r0], r2
    393     ldrb            r4, [r0], r2
    394     ldrb            r5, [r0], r2
    395     ldrb            r6, [r0]
    396 
    397     add             r12, r12, r3
    398     add             r12, r12, r4
    399     add             r12, r12, r5
    400     add             r12, r12, r6
    401 
    402 skip_dc_pred_left_s
    403     add             r7, r7, #3          ; Shift
    404     sub             r4, r7, #1
    405     mov             r5, #1
    406     add             r12, r12, r5, lsl r4
    407     mov             r5, r12, lsr r7     ; expected_dc
    408 
    409     vdup.u8         q0, r5
    410 
    411 skip_dc_pred_up_left_s
    412     vst1.u8         {q0}, [r1], r2
    413     vst1.u8         {q0}, [r1], r2
    414     vst1.u8         {q0}, [r1], r2
    415     vst1.u8         {q0}, [r1], r2
    416     vst1.u8         {q0}, [r1], r2
    417     vst1.u8         {q0}, [r1], r2
    418     vst1.u8         {q0}, [r1], r2
    419     vst1.u8         {q0}, [r1], r2
    420     vst1.u8         {q0}, [r1], r2
    421     vst1.u8         {q0}, [r1], r2
    422     vst1.u8         {q0}, [r1], r2
    423     vst1.u8         {q0}, [r1], r2
    424     vst1.u8         {q0}, [r1], r2
    425     vst1.u8         {q0}, [r1], r2
    426     vst1.u8         {q0}, [r1], r2
    427     vst1.u8         {q0}, [r1], r2
    428 
    429     pop             {r4-r8,pc}
    430 case_v_pred_s
    431     ; Copy down above row
    432     sub             r6, r0, r2
    433     vld1.8          {q0}, [r6]
    434 
    435     vst1.u8         {q0}, [r1], r2
    436     vst1.u8         {q0}, [r1], r2
    437     vst1.u8         {q0}, [r1], r2
    438     vst1.u8         {q0}, [r1], r2
    439     vst1.u8         {q0}, [r1], r2
    440     vst1.u8         {q0}, [r1], r2
    441     vst1.u8         {q0}, [r1], r2
    442     vst1.u8         {q0}, [r1], r2
    443     vst1.u8         {q0}, [r1], r2
    444     vst1.u8         {q0}, [r1], r2
    445     vst1.u8         {q0}, [r1], r2
    446     vst1.u8         {q0}, [r1], r2
    447     vst1.u8         {q0}, [r1], r2
    448     vst1.u8         {q0}, [r1], r2
    449     vst1.u8         {q0}, [r1], r2
    450     vst1.u8         {q0}, [r1], r2
    451     pop             {r4-r8,pc}
    452 
    453 case_h_pred_s
    454     ; Load 4x yleft_col
    455     sub             r0, r0, #1
    456 
    457     ldrb            r3, [r0], r2
    458     ldrb            r4, [r0], r2
    459     ldrb            r5, [r0], r2
    460     ldrb            r6, [r0], r2
    461     vdup.u8         q0, r3
    462     vdup.u8         q1, r4
    463     vdup.u8         q2, r5
    464     vdup.u8         q3, r6
    465     vst1.u8         {q0}, [r1], r2
    466     vst1.u8         {q1}, [r1], r2
    467     vst1.u8         {q2}, [r1], r2
    468     vst1.u8         {q3}, [r1], r2
    469 
    470     ldrb            r3, [r0], r2
    471     ldrb            r4, [r0], r2
    472     ldrb            r5, [r0], r2
    473     ldrb            r6, [r0], r2
    474     vdup.u8         q0, r3
    475     vdup.u8         q1, r4
    476     vdup.u8         q2, r5
    477     vdup.u8         q3, r6
    478     vst1.u8         {q0}, [r1], r2
    479     vst1.u8         {q1}, [r1], r2
    480     vst1.u8         {q2}, [r1], r2
    481     vst1.u8         {q3}, [r1], r2
    482 
    483 
    484     ldrb            r3, [r0], r2
    485     ldrb            r4, [r0], r2
    486     ldrb            r5, [r0], r2
    487     ldrb            r6, [r0], r2
    488     vdup.u8         q0, r3
    489     vdup.u8         q1, r4
    490     vdup.u8         q2, r5
    491     vdup.u8         q3, r6
    492     vst1.u8         {q0}, [r1], r2
    493     vst1.u8         {q1}, [r1], r2
    494     vst1.u8         {q2}, [r1], r2
    495     vst1.u8         {q3}, [r1], r2
    496 
    497     ldrb            r3, [r0], r2
    498     ldrb            r4, [r0], r2
    499     ldrb            r5, [r0], r2
    500     ldrb            r6, [r0], r2
    501     vdup.u8         q0, r3
    502     vdup.u8         q1, r4
    503     vdup.u8         q2, r5
    504     vdup.u8         q3, r6
    505     vst1.u8         {q0}, [r1], r2
    506     vst1.u8         {q1}, [r1], r2
    507     vst1.u8         {q2}, [r1], r2
    508     vst1.u8         {q3}, [r1], r2
    509 
    510     pop             {r4-r8,pc}
    511 
    512 case_tm_pred_s
    513     ; Load yabove_row
    514     sub             r3, r0, r2
    515     vld1.8          {q8}, [r3]
    516 
    517     ; Load ytop_left
    518     sub             r3, r3, #1
    519     ldrb            r7, [r3]
    520 
    521     vdup.u16        q7, r7
    522 
    523     ; Compute yabove_row - ytop_left
    524     mov             r3, #1
    525     vdup.u8         q0, r3
    526 
    527     vmull.u8        q4, d16, d0
    528     vmull.u8        q5, d17, d0
    529 
    530     vsub.s16        q4, q4, q7
    531     vsub.s16        q5, q5, q7
    532 
    533     ; Load 4x yleft_col
    534     sub             r0, r0, #1
    535     mov             r12, #4
    536 
    537 case_tm_pred_loop_s
    538     ldrb            r3, [r0], r2
    539     ldrb            r4, [r0], r2
    540     ldrb            r5, [r0], r2
    541     ldrb            r6, [r0], r2
    542     vdup.u16        q0, r3
    543     vdup.u16        q1, r4
    544     vdup.u16        q2, r5
    545     vdup.u16        q3, r6
    546 
    547     vqadd.s16       q8, q0, q4
    548     vqadd.s16       q9, q0, q5
    549 
    550     vqadd.s16       q10, q1, q4
    551     vqadd.s16       q11, q1, q5
    552 
    553     vqadd.s16       q12, q2, q4
    554     vqadd.s16       q13, q2, q5
    555 
    556     vqadd.s16       q14, q3, q4
    557     vqadd.s16       q15, q3, q5
    558 
    559     vqshrun.s16     d0, q8, #0
    560     vqshrun.s16     d1, q9, #0
    561 
    562     vqshrun.s16     d2, q10, #0
    563     vqshrun.s16     d3, q11, #0
    564 
    565     vqshrun.s16     d4, q12, #0
    566     vqshrun.s16     d5, q13, #0
    567 
    568     vqshrun.s16     d6, q14, #0
    569     vqshrun.s16     d7, q15, #0
    570 
    571     vst1.u8         {q0}, [r1], r2
    572     vst1.u8         {q1}, [r1], r2
    573     vst1.u8         {q2}, [r1], r2
    574     vst1.u8         {q3}, [r1], r2
    575 
    576     subs            r12, r12, #1
    577     bne             case_tm_pred_loop_s
    578 
    579     pop             {r4-r8,pc}
    580 
    581     ENDP


    END