Home | History | Annotate | Download | only in armv6
      1 ;
      2 ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_intra4x4_predict_armv6|
     13 
     14     ARM
     15     REQUIRE8
     16     PRESERVE8
     17 
     18     AREA ||.text||, CODE, READONLY, ALIGN=2
     19 
     20 
     21 ;void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft,
     22 ;                                int left_stride, B_PREDICTION_MODE b_mode,
     23 ;                                unsigned char *dst, int dst_stride,
     24 ;                                unsigned char top_left)
     25 
     26 ; r0: *Above
     27 ; r1: *yleft
     28 ; r2: left_stride
     29 ; r3: b_mode
     30 ; sp + #40: dst
     31 ; sp + #44: dst_stride
     32 ; sp + #48: top_left
     33 |vp8_intra4x4_predict_armv6| PROC
        ; Entry: r0 = Above, r1 = yleft, r2 = left_stride, r3 = b_mode.
        ; Ten registers (40 bytes) are pushed here, which is why the stacked
        ; arguments are read at sp+40 (dst), sp+44 (dst_stride) and sp+48
        ; (top_left) by every mode handler.
     34     push        {r4-r12, lr}
     35
        ; Jump-table dispatch on b_mode. In ARM state pc reads as the address
        ; of the current instruction + 8, i.e. the first table entry, so adding
        ; b_mode * 4 selects one 4-byte branch.
        ; The guard uses the unsigned condition LO: a negative b_mode is
        ; treated like any value >= 10 and takes the default return instead of
        ; jumping to an address in front of the table.
     36     cmp         r3, #10
     37     addlo       pc, pc, r3, lsl #2       ; position independent switch
     38     pop         {r4-r12, pc}             ; default: unknown mode, write nothing
     39     b           b_dc_pred                ; b_mode 0
     40     b           b_tm_pred                ; b_mode 1
     41     b           b_ve_pred                ; b_mode 2
     42     b           b_he_pred                ; b_mode 3
     43     b           b_ld_pred                ; b_mode 4
     44     b           b_rd_pred                ; b_mode 5
     45     b           b_vr_pred                ; b_mode 6
     46     b           b_vl_pred                ; b_mode 7
     47     b           b_hd_pred                ; b_mode 8
     48     b           b_hu_pred                ; b_mode 9
     49 
     50 b_dc_pred
        ; DC mode: every pixel of the 4x4 block is set to the rounded mean of
        ; the four Above bytes and the four Left bytes, (sum + 4) >> 3.
        ; In:  r0 = Above, r1 = yleft, r2 = left_stride,
        ;      [sp, #40] = dst, [sp, #44] = dst_stride.
     51     ; load values
     52     ldr         r8, [r0]                 ; Above
     53     ldrb        r4, [r1], r2             ; Left[0]
     54     mov         r9, #0                   ; zero operand for usad8
     55     ldrb        r5, [r1], r2             ; Left[1]
     56     ldrb        r6, [r1], r2             ; Left[2]
     57     usad8       r12, r8, r9              ; sum of |Above[i] - 0| = Above[0]+..+Above[3]
     58     ldrb        r7, [r1]                 ; Left[3]
     59
     60     ; calculate dc
     61     add         r4, r4, r5
     62     add         r4, r4, r6
     63     add         r4, r4, r7
     64     add         r4, r4, r12
     65     add         r4, r4, #4              ; rounding term
     66     ldr         r0, [sp, #44]           ; dst_stride
     67     mov         r12, r4, asr #3         ; dc = (sum of 8 edge bytes + 4) >> 3
     68
     69     add         r12, r12, r12, lsl #8   ; dc fits in a byte: copy into byte 1
     70     ldr         r3, [sp, #40]           ; dst
     71     add         r12, r12, r12, lsl #16  ; copy into bytes 2-3: [dc|dc|dc|dc]
     72
     73     ; store values
     74     str         r12, [r3], r0           ; row 0
     75     str         r12, [r3], r0           ; row 1
     76     str         r12, [r3], r0           ; row 2
     77     str         r12, [r3]               ; row 3
     78
     79     pop        {r4-r12, pc}
     80 
     81 b_tm_pred
        ; TM mode: pixel(row, col) = clamp(Left[row] + Above[col] - top_left)
        ; to 0..255. Columns are processed as two packed halfword pairs,
        ; (col 2, col 0) and (col 3, col 1); lane comments read [high|low].
        ; In:  r0 = Above, r1 = yleft, r2 = left_stride,
        ;      [sp, #40] = dst, [sp, #44] = dst_stride, [sp, #48] = top_left.
     82     ldr         r8, [r0]                ; Above
     83     ldrb        r9, [sp, #48]           ; top_left
     84     ldrb        r4, [r1], r2            ; Left[0]
     85     ldrb        r5, [r1], r2            ; Left[1]
     86     ldrb        r6, [r1], r2            ; Left[2]
     87     ldrb        r7, [r1]                ; Left[3]
     88     ldr         r0, [sp, #44]           ; dst_stride
     89     ldr         r3, [sp, #40]           ; dst
     90
     91     add         r9, r9, r9, lsl #16     ; [tl|tl]
     92     uxtb16      r10, r8                 ; a[2|0]
     93     uxtb16      r11, r8, ror #8         ; a[3|1]
     94     ssub16      r10, r10, r9            ; a[2|0] - [tl|tl]
     95     ssub16      r11, r11, r9            ; a[3|1] - [tl|tl]
     96
     97     add         r4, r4, r4, lsl #16     ; l[0|0]
     98     add         r5, r5, r5, lsl #16     ; l[1|1]
     99     add         r6, r6, r6, lsl #16     ; l[2|2]
    100     add         r7, r7, r7, lsl #16     ; l[3|3]
    101
        ; Each row: add the row's left pixel to both difference pairs, clamp
        ; every halfword to 8 unsigned bits, then interleave the two pairs
        ; into one word. r1/r2 and r4/r5 alternate as row temporaries (the
        ; yleft/left_stride arguments are no longer needed).
    102     sadd16      r1, r4, r10             ; l[0|0] + a[2|0] - [tl|tl]
    103     sadd16      r2, r4, r11             ; l[0|0] + a[3|1] - [tl|tl]
    104     usat16      r1, #8, r1              ; clamp row 0, cols 2|0 to 0..255
    105     usat16      r2, #8, r2              ; clamp row 0, cols 3|1 to 0..255
    106
    107     sadd16      r4, r5, r10             ; l[1|1] + a[2|0] - [tl|tl]
    108     sadd16      r5, r5, r11             ; l[1|1] + a[3|1] - [tl|tl]
    109
    110     add         r12, r1, r2, lsl #8     ; [3|2|1|0]
    111     str         r12, [r3], r0           ; row 0
    112
    113     usat16      r4, #8, r4              ; clamp row 1, cols 2|0
    114     usat16      r5, #8, r5              ; clamp row 1, cols 3|1
    115
    116     sadd16      r1, r6, r10             ; l[2|2] + a[2|0] - [tl|tl]
    117     sadd16      r2, r6, r11             ; l[2|2] + a[3|1] - [tl|tl]
    118
    119     add         r12, r4, r5, lsl #8     ; [3|2|1|0]
    120     str         r12, [r3], r0           ; row 1
    121
    122     usat16      r1, #8, r1              ; clamp row 2, cols 2|0
    123     usat16      r2, #8, r2              ; clamp row 2, cols 3|1
    124
    125     sadd16      r4, r7, r10             ; l[3|3] + a[2|0] - [tl|tl]
    126     sadd16      r5, r7, r11             ; l[3|3] + a[3|1] - [tl|tl]
    127
    128     add         r12, r1, r2, lsl #8     ; [3|2|1|0]
    129
    130     usat16      r4, #8, r4              ; clamp row 3, cols 2|0
    131     usat16      r5, #8, r5              ; clamp row 3, cols 3|1
    132
    133     str         r12, [r3], r0           ; row 2
    134
    135     add         r12, r4, r5, lsl #8     ; [3|2|1|0]
    136     str         r12, [r3]               ; row 3
    137
    138     pop        {r4-r12, pc}
    139 
    140 b_ve_pred
        ; VE mode: all four rows are the same word, the Above row smoothed
        ; with a rounded (1,2,1)/4 filter. top_left is the neighbour to the
        ; left of a[0] and a[4] (Above[4]) the neighbour to the right of a[3].
        ; Lane comments read [high halfword | low halfword].
        ; In:  r0 = Above, [sp, #40] = dst, [sp, #44] = dst_stride,
        ;      [sp, #48] = top_left.
    141     ldr         r8, [r0]                ; a[3|2|1|0]
    142     ldr         r11, c00FF00FF          ; mask: low byte of each halfword
    143     ldrb        r9, [sp, #48]           ; top_left
    144     ldrb        r10, [r0, #4]           ; a[4]
    145
    146     ldr         r0, c00020002           ; rounding term 2 in each halfword
    147
    148     uxtb16      r4, r8                  ; a[2|0]
    149     uxtb16      r5, r8, ror #8          ; a[3|1]
    150     ldr         r2, [sp, #44]           ; dst_stride
    151     pkhbt       r9, r9, r5, lsl #16     ; a[1|-1], where a[-1] is top_left
    152
        ; Even output columns (0 and 2) are accumulated in r9.
    153     add         r9, r9, r4, lsl #1      ;[a[1]+2*a[2]       | tl+2*a[0]       ]
    154     uxtab16     r9, r9, r5              ;[a[1]+2*a[2]+a[3]  | tl+2*a[0]+a[1]  ]
    155     ldr         r3, [sp, #40]           ; dst
    156     uxtab16     r9, r9, r0              ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2]
    157
        ; Odd output columns (1 and 3) are accumulated in r0, then r4.
    158     add         r0, r0, r10, lsl #16    ;[a[4]+2            |                 2]
    159     add         r0, r0, r4, asr #16     ;[a[4]+2            |            a[2]+2]
    160     add         r0, r0, r5, lsl #1      ;[a[4]+2*a[3]+2     |     a[2]+2*a[1]+2]
    161     uadd16      r4, r4, r0              ;[a[4]+2*a[3]+a[2]+2|a[2]+2*a[1]+a[0]+2]
    162
        ; >> 2 on the whole word leaks two bits of the high lane into the top
        ; of the low lane; the 0x00FF00FF mask discards them.
    163     and         r9, r11, r9, asr #2     ; filtered columns [2|0]
    164     and         r4, r11, r4, asr #2     ; filtered columns [3|1]
    165     add         r9, r9, r4, lsl #8      ; interleave: [3|2|1|0]
    166
    167     ; store values
    168     str         r9, [r3], r2            ; row 0
    169     str         r9, [r3], r2            ; row 1
    170     str         r9, [r3], r2            ; row 2
    171     str         r9, [r3]                ; row 3
    172
    173     pop        {r4-r12, pc}
    174 
    175 
    176 b_he_pred
        ; HE mode: row r is filled with one value, the left column smoothed
        ; with a rounded (1,2,1)/4 filter. top_left is the neighbour above
        ; l[0]; l[3] is used twice for the last row (l[2] + 2*l[3] + l[3]).
        ; In:  r1 = yleft, r2 = left_stride, [sp, #40] = dst,
        ;      [sp, #44] = dst_stride, [sp, #48] = top_left.
    177     ldrb        r4, [r1], r2            ; Left[0]
    178     ldrb        r8, [sp, #48]           ; top_left
    179     ldrb        r5, [r1], r2            ; Left[1]
    180     ldrb        r6, [r1], r2            ; Left[2]
    181     ldrb        r7, [r1]                ; Left[3]
    182
    183     add         r8, r8, r4              ; tl   + l[0]
    184     add         r9, r4, r5              ; l[0] + l[1]
    185     add         r10, r5, r6             ; l[1] + l[2]
    186     add         r11, r6, r7             ; l[2] + l[3]
    187
    188     mov         r0, #2<<14              ; rounding term 2, pre-shifted like the sums below
    189
    190     add         r8, r8, r9              ; tl + 2*l[0] + l[1]
    191     add         r4, r9, r10             ; l[0] + 2*l[1] + l[2]
    192     add         r5, r10, r11            ; l[1] + 2*l[2] + l[3]
    193     add         r6, r11, r7, lsl #1     ; l[2] + 2*l[3] + l[3]
    194
    195
        ; (sum + 2) << 14 places (sum + 2) >> 2 in bits 16 and up.
    196     add         r8, r0, r8, lsl #14     ; (tl + 2*l[0] + l[1] + 2)>>2 in top half
    197     add         r9, r0, r4, lsl #14     ; (l[0] + 2*l[1] + l[2] + 2)>>2 in top half
    198     add         r10,r0, r5, lsl #14     ; (l[1] + 2*l[2] + l[3] + 2)>>2 in top half
    199     add         r11,r0, r6, lsl #14     ; (l[2] + 2*l[3] + l[3] + 2)>>2 in top half
    200
        ; Copy the top-half result into the bottom half as well; this also
        ; drops the leftover fraction bits that were in the bottom half.
    201     pkhtb       r8, r8, r8, asr #16     ; l[-|0|-|0]
    202     pkhtb       r9, r9, r9, asr #16     ; l[-|1|-|1]
    203     pkhtb       r10, r10, r10, asr #16  ; l[-|2|-|2]
    204     pkhtb       r11, r11, r11, asr #16  ; l[-|3|-|3]
    205
    206     ldr         r0, [sp, #44]           ; dst_stride
    207     ldr         r3, [sp, #40]           ; dst
    208
    209     add         r8, r8, r8, lsl #8      ; l[0|0|0|0]
    210     add         r9, r9, r9, lsl #8      ; l[1|1|1|1]
    211     add         r10, r10, r10, lsl #8   ; l[2|2|2|2]
    212     add         r11, r11, r11, lsl #8   ; l[3|3|3|3]
    213
    214     ; store values
    215     str         r8, [r3], r0            ; row 0
    216     str         r9, [r3], r0            ; row 1
    217     str         r10, [r3], r0           ; row 2
    218     str         r11, [r3]               ; row 3
    219
    220     pop        {r4-r12, pc}
    221 
    222 b_ld_pred
        ; LD mode: builds seven filtered values
        ;   P[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2   for i = 0..5
        ;   P[6] = (a[6] + 2*a[7]   + a[7]   + 2) >> 2
        ; from the eight bytes Above[0..7]; row r of the block is P[r..r+3].
        ; Lane comments read [high halfword | low halfword].
        ; In:  r0 = Above, [sp, #40] = dst, [sp, #44] = dst_stride.
    223     ldr         r4, [r0]                ; Above[0-3]
    224     ldr         r12, c00020002          ; rounding term 2 per halfword
    225     ldr         r5, [r0, #4]            ; Above[4-7]
    226     ldr         lr,  c00FF00FF          ; mask: low byte of each halfword
    227
    228     uxtb16      r6, r4                  ; a[2|0]
    229     uxtb16      r7, r4, ror #8          ; a[3|1]
    230     uxtb16      r8, r5                  ; a[6|4]
    231     uxtb16      r9, r5, ror #8          ; a[7|5]
    232     pkhtb       r10, r6, r8             ; a[2|4]
    233     pkhtb       r11, r7, r9             ; a[3|5]
    234
    235     add         r4, r6, r7, lsl #1      ; [a2+2*a3      |      a0+2*a1]
    236     add         r4, r4, r10, ror #16    ; [a2+2*a3+a4   |   a0+2*a1+a2]
    237     uxtab16     r4, r4, r12             ; [a2+2*a3+a4+2 | a0+2*a1+a2+2]
    238
        ; ror #15 = swap the halves (ror #16) and double them; the lanes hold
        ; byte values, so nothing wraps around.
    239     add         r5, r7, r10, ror #15    ; [a3+2*a4      |      a1+2*a2]
    240     add         r5, r5, r11, ror #16    ; [a3+2*a4+a5   |   a1+2*a2+a3]
    241     uxtab16     r5, r5, r12             ; [a3+2*a4+a5+2 | a1+2*a2+a3+2]
    242
    243     pkhtb       r7, r9, r8, asr #16     ; a[7|6]
    244     add         r6, r8, r9, lsl #1      ; [a6+2*a7      |      a4+2*a5]
    245     uadd16      r6, r6, r7              ; [a6+2*a7+a7   |   a4+2*a5+a6]
    246     uxtab16     r6, r6, r12             ; [a6+2*a7+a7+2 | a4+2*a5+a6+2]
    247
        ; P[5] is computed as a single scalar. a6 sits in the high lane of r8,
        ; so asr #15 yields 2*a6 (the low lane, a4 < 256, shifts out entirely).
    248     uxth        r7, r9                  ; [                         a5]
    249     add         r7, r7, r8, asr #15     ; [                    a5+2*a6]
    250     add         r7, r7, r9, asr #16     ; [                 a5+2*a6+a7]
    251     uxtah       r7, r7, r12             ; [               a5+2*a6+a7+2]
    252
    253     ldr         r0, [sp, #44]           ; dst_stride
    254     ldr         r3, [sp, #40]           ; dst
    255
    256     ; scale down
    257     and         r4, lr, r4, asr #2      ; [P2|P0]
    258     and         r5, lr, r5, asr #2      ; [P3|P1]
    259     and         r6, lr, r6, asr #2      ; [P6|P4]
    260     mov         r7, r7, asr #2          ; P5
    261
        ; Each following row is the previous one shifted down by one byte with
        ; the next P value inserted as the top byte.
    262     add         r8, r4, r5, lsl #8      ; [3|2|1|0]
    263     str         r8, [r3], r0            ; row 0
    264
    265     mov         r9, r8, lsr #8
    266     add         r9, r9, r6, lsl #24     ; [4|3|2|1]
    267     str         r9, [r3], r0            ; row 1
    268
    269     mov         r10, r9, lsr #8
    270     add         r10, r10, r7, lsl #24   ; [5|4|3|2]
    271     str         r10, [r3], r0           ; row 2
    272
    273     mov         r6, r6, lsr #16         ; P6 moved to the low byte
    274     mov         r11, r10, lsr #8
    275     add         r11, r11, r6, lsl #24   ; [6|5|4|3]
    276     str         r11, [r3]               ; row 3
    277
    278     pop        {r4-r12, pc}
    279 
    280 b_rd_pred
        ; RD mode. The nine edge pixels are viewed as one run
        ;   pp[0..8] = l[3], l[2], l[1], l[0], top_left, Above[0..3]
        ; and seven filtered values are built from it:
        ;   Q[i] = (pp[i] + 2*pp[i+1] + pp[i+2] + 2) >> 2   for i = 0..6
        ; Row r of the block is Q[3-r .. 6-r].
        ; Lane comments read [high halfword | low halfword].
        ; In:  r0 = Above, r1 = yleft, r2 = left_stride, [sp, #40] = dst,
        ;      [sp, #44] = dst_stride, [sp, #48] = top_left.
    281     ldrb        r7, [r1], r2            ; l[0] = pp[3]
    282     ldr         lr, [r0]                ; Above = pp[8|7|6|5]
    283     ldrb        r8, [sp, #48]           ; tl   = pp[4]
    284     ldrb        r6, [r1], r2            ; l[1] = pp[2]
    285     ldrb        r5, [r1], r2            ; l[2] = pp[1]
        ; Last left pixel: r1 is not read again, so no post-index writeback
        ; (matches the final left load of the other mode handlers).
    286     ldrb        r4, [r1]                ; l[3] = pp[0]
    287
    288
    289     uxtb16      r9, lr                  ; p[7|5]
    290     uxtb16      r10, lr, ror #8         ; p[8|6]
    291     add         r4, r4, r6, lsl #16     ; p[2|0]
    292     add         r5, r5, r7, lsl #16     ; p[3|1]
    293     add         r6, r6, r8, lsl #16     ; p[4|2]
    294     pkhbt       r7, r7, r9, lsl #16     ; p[5|3]
    295     pkhbt       r8, r8, r10, lsl #16    ; p[6|4]
    296
    297     ldr         r12, c00020002          ; rounding term 2 per halfword
    298     ldr         lr,  c00FF00FF          ; mask: low byte of each halfword
    299
    300     add         r4, r4, r5, lsl #1      ; [p2+2*p3      |      p0+2*p1]
    301     add         r4, r4, r6              ; [p2+2*p3+p4   |   p0+2*p1+p2]
    302     uxtab16     r4, r4, r12             ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
    303
    304     add         r5, r5, r6, lsl #1      ; [p3+2*p4      |      p1+2*p2]
    305     add         r5, r5, r7              ; [p3+2*p4+p5   |   p1+2*p2+p3]
    306     uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
    307
    308     add         r6, r7, r8, lsl #1      ; [p5+2*p6      |      p3+2*p4]
    309     add         r6, r6, r9              ; [p5+2*p6+p7   |   p3+2*p4+p5]
    310     uxtab16     r6, r6, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
    311
    312     add         r7, r8, r9, lsl #1      ; [p6+2*p7      |      p4+2*p5]
    313     add         r7, r7, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
    314     uxtab16     r7, r7, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
    315
    316     ldr         r0, [sp, #44]           ; dst_stride
    317     ldr         r3, [sp, #40]           ; dst
    318
    319     ; scale down
    320     and         r7, lr, r7, asr #2      ; [Q6|Q4]
    321     and         r6, lr, r6, asr #2      ; [Q5|Q3]
    322     and         r5, lr, r5, asr #2      ; [Q3|Q1]
    323     and         r4, lr, r4, asr #2      ; [Q2|Q0]
    324
        ; Each following row is the previous one shifted up by one byte with
        ; the next lower Q value inserted as the bottom byte.
    325     add         r8, r6, r7, lsl #8      ; [6|5|4|3]
    326     str         r8, [r3], r0            ; row 0
    327
    328     mov         r9, r8, lsl #8          ; [5|4|3|-]
    329     uxtab       r9, r9, r4, ror #16     ; [5|4|3|2]
    330     str         r9, [r3], r0            ; row 1
    331
    332     mov         r10, r9, lsl #8         ; [4|3|2|-]
    333     uxtab       r10, r10, r5            ; [4|3|2|1]
    334     str         r10, [r3], r0           ; row 2
    335
    336     mov         r11, r10, lsl #8        ; [3|2|1|-]
    337     uxtab       r11, r11, r4            ; [3|2|1|0]
    338     str         r11, [r3]               ; row 3
    339
    340     pop        {r4-r12, pc}
    341 
    342 b_vr_pred
        ; VR mode. Edge run: pp[1..8] = l[2], l[1], l[0], top_left,
        ; Above[0..3]. Two kinds of values are built from it:
        ;   2-tap averages (x + y + 1) >> 1        -> E, F, I, J
        ;   3-tap filters  (x + 2*y + z + 2) >> 2  -> A(=B's low lane), C, G, H, K, L
        ; The letters in the comments name the packed lanes defined at the
        ; "scale down" step; lane comments read [high halfword | low halfword].
        ; In:  r0 = Above, r1 = yleft, r2 = left_stride, [sp, #40] = dst,
        ;      [sp, #44] = dst_stride, [sp, #48] = top_left.
    343     ldrb        r7, [r1], r2            ; l[0] = pp[3]
    344     ldr         lr, [r0]                ; Above = pp[8|7|6|5]
    345     ldrb        r8, [sp, #48]           ; tl   = pp[4]
    346     ldrb        r6, [r1], r2            ; l[1] = pp[2]
    347     ldrb        r5, [r1]                ; l[2] = pp[1]
        ; l[3] (pp[0]) is not an input of this mode: nothing below reads it
        ; and r4 is reused for the 0x00010001 constant, so it is not loaded.
    349
    350     add         r5, r5, r7, lsl #16     ; p[3|1]
    351     add         r6, r6, r8, lsl #16     ; p[4|2]
    352     uxtb16      r9, lr                  ; p[7|5]
    353     uxtb16      r10, lr, ror #8         ; p[8|6]
    354     pkhbt       r7, r7, r9, lsl #16     ; p[5|3]
    355     pkhbt       r8, r8, r10, lsl #16    ; p[6|4]
    356
    357     ldr         r4,  c00010001          ; +1 per halfword for the rounded averages
    358     ldr         r12, c00020002          ; +2 per halfword for the 3-tap filters
    359     ldr         lr,  c00FF00FF          ; mask: low byte of each halfword
    360
    361     add         r5, r5, r6, lsl #1      ; [p3+2*p4      |      p1+2*p2]
    362     add         r5, r5, r7              ; [p3+2*p4+p5   |   p1+2*p2+p3]
    363     uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
    364
    365     add         r6, r6, r7, lsl #1      ; [p4+2*p5      |      p2+2*p3]
    366     add         r6, r6, r8              ; [p4+2*p5+p6   |   p2+2*p3+p4]
    367     uxtab16     r6, r6, r12             ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
    368
    369     uadd16      r11, r8, r9             ; [p6+p7        |        p4+p5]
    370     uhadd16     r11, r11, r4            ; [(p6+p7+1)>>1 | (p4+p5+1)>>1]
    371                                         ; [F|E]
    372
    373     add         r7, r7, r8, lsl #1      ; [p5+2*p6      |      p3+2*p4]
    374     add         r7, r7, r9              ; [p5+2*p6+p7   |   p3+2*p4+p5]
    375     uxtab16     r7, r7, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
    376
    377     uadd16      r2, r9, r10             ; [p7+p8        |        p5+p6]
    378     uhadd16     r2, r2, r4              ; [(p7+p8+1)>>1 | (p5+p6+1)>>1]
    379                                         ; [J|I]
    380
    381     add         r8, r8, r9, lsl #1      ; [p6+2*p7      |      p4+2*p5]
    382     add         r8, r8, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
    383     uxtab16     r8, r8, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
    384
    385     ldr         r0, [sp, #44]           ; dst_stride
    386     ldr         r3, [sp, #40]           ; dst
    387
    388     ; scale down
    389     and         r5, lr, r5, asr #2      ; [B|A]
    390     and         r6, lr, r6, asr #2      ; [D|C]
    391     and         r7, lr, r7, asr #2      ; [H|G]
    392     and         r8, lr, r8, asr #2      ; [L|K]
    393
    394     add         r12, r11, r2, lsl #8    ; [J|F|I|E]
    395     str         r12, [r3], r0           ; row 0
    396
    397     add         r12, r7, r8, lsl #8     ; [L|H|K|G]
    398     str         r12, [r3], r0           ; row 1
    399
    400     pkhbt       r2, r6, r2, lsl #16     ; [-|I|-|C]
    401     add         r2, r2, r11, lsl #8     ; [F|I|E|C]
    402
    403     pkhtb       r12, r6, r5             ; [-|D|-|A]
    404     pkhtb       r10, r7, r5, asr #16    ; [-|H|-|B]
    405     str         r2, [r3], r0            ; row 2
    406     add         r12, r12, r10, lsl #8   ; [H|D|B|A]
    407     str         r12, [r3]               ; row 3
    408
    409     pop        {r4-r12, pc}
    410 
    411 b_vl_pred
        ; VL mode, computed from the eight bytes Above[0..7] (p0..p7 below).
        ; Two kinds of values are built:
        ;   2-tap averages (x + y + 1) >> 1        -> A, B, E, F
        ;   3-tap filters  (x + 2*y + z + 2) >> 2  -> C, D, G, H, I, J
        ; The letters name the packed lanes defined where they are produced;
        ; lane comments read [high halfword | low halfword].
        ; In:  r0 = Above, [sp, #40] = dst, [sp, #44] = dst_stride.
    412     ldr         r4, [r0]                ; [3|2|1|0] = Above[0-3]
    413     ldr         r12, c00020002          ; +2 per halfword for the 3-tap filters
    414     ldr         r5, [r0, #4]            ; [7|6|5|4] = Above[4-7]
    415     ldr         lr,  c00FF00FF          ; mask: low byte of each halfword
    416     ldr         r2,  c00010001          ; +1 per halfword for the rounded averages
    417
    418     mov         r0, r4, lsr #16         ; [-|-|3|2]
    419     add         r0, r0, r5, lsl #16     ; [5|4|3|2]
    420     uxtb16      r6, r4                  ; [2|0]
    421     uxtb16      r7, r4, ror #8          ; [3|1]
    422     uxtb16      r8, r0                  ; [4|2]
    423     uxtb16      r9, r0, ror #8          ; [5|3]
    424     uxtb16      r10, r5                 ; [6|4]
    425     uxtb16      r11, r5, ror #8         ; [7|5]
    426
    427     uadd16      r4, r6, r7              ; [p2+p3        |        p0+p1]
    428     uhadd16     r4, r4, r2              ; [(p2+p3+1)>>1 | (p0+p1+1)>>1]
    429                                         ; [B|A]
    430
    431     add         r5, r6, r7, lsl #1      ; [p2+2*p3      |      p0+2*p1]
    432     add         r5, r5, r8              ; [p2+2*p3+p4   |   p0+2*p1+p2]
    433     uxtab16     r5, r5, r12             ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
    434
    435     uadd16      r6, r7, r8              ; [p3+p4        |        p1+p2]
    436     uhadd16     r6, r6, r2              ; [(p3+p4+1)>>1 | (p1+p2+1)>>1]
    437                                         ; [F|E]
    438
    439     add         r7, r7, r8, lsl #1      ; [p3+2*p4      |      p1+2*p2]
    440     add         r7, r7, r9              ; [p3+2*p4+p5   |   p1+2*p2+p3]
    441     uxtab16     r7, r7, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
    442
    443     add         r8, r8, r9, lsl #1      ; [p4+2*p5      |      p2+2*p3]
    444     add         r8, r8, r10             ; [p4+2*p5+p6   |   p2+2*p3+p4]
    445     uxtab16     r8, r8, r12             ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
    446
    447     add         r9, r9, r10, lsl #1     ; [p5+2*p6      |      p3+2*p4]
    448     add         r9, r9, r11             ; [p5+2*p6+p7   |   p3+2*p4+p5]
    449     uxtab16     r9, r9, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
    450
    451     ldr         r0, [sp, #44]           ; dst_stride
    452     ldr         r3, [sp, #40]           ; dst
    453
    454     ; scale down
    455     and         r5, lr, r5, asr #2      ; [D|C]
    456     and         r7, lr, r7, asr #2      ; [H|G]
    457     and         r8, lr, r8, asr #2      ; [I|D]
    458     and         r9, lr, r9, asr #2      ; [J|H]
    459
    460     add         r10, r4, r6, lsl #8     ; [F|B|E|A]
    461     str         r10, [r3], r0           ; row 0
    462
    463     add         r5, r5, r7, lsl #8      ; [H|D|G|C]
    464     str         r5, [r3], r0            ; row 1
    465
    466     pkhtb       r12, r8, r4, asr #16    ; [-|I|-|B]
    467     pkhtb       r10, r9, r8             ; [-|J|-|D]
    468
    469     add         r12, r6, r12, lsl #8    ; [I|F|B|E]
    470     str         r12, [r3], r0           ; row 2
    471
    472     add         r10, r7, r10, lsl #8    ; [J|H|D|G]
    473     str         r10, [r3]               ; row 3
    474
    475     pop        {r4-r12, pc}
    476 
    477 b_hd_pred
        ; HD mode. Edge run: pp[0..7] = l[3], l[2], l[1], l[0], top_left,
        ; Above[0..2] (Above[3] = pp[8] is unpacked with the rest of the word
        ; but only pp[0..7] enter the sums below).
        ; Two kinds of values are built:
        ;   2-tap averages (x + y + 1) >> 1        -> A, B, E, F
        ;   3-tap filters  (x + 2*y + z + 2) >> 2  -> C, D, G, H, I, J
        ; The letters name the packed lanes defined where they are produced;
        ; lane comments read [high halfword | low halfword].
        ; In:  r0 = Above, r1 = yleft, r2 = left_stride, [sp, #40] = dst,
        ;      [sp, #44] = dst_stride, [sp, #48] = top_left.
    478     ldrb        r7, [r1], r2            ; l[0] = pp[3]
    479     ldr         lr, [r0]                ; Above = pp[8|7|6|5]
    480     ldrb        r8, [sp, #48]           ; tl   = pp[4]
    481     ldrb        r6, [r1], r2            ; l[1] = pp[2]
    482     ldrb        r5, [r1], r2            ; l[2] = pp[1]
    483     ldrb        r4, [r1]                ; l[3] = pp[0]
    484
    485     uxtb16      r9, lr                  ; p[7|5]
    486     uxtb16      r10, lr, ror #8         ; p[8|6]
    487
    488     add         r4, r4, r5, lsl #16     ; p[1|0]
    489     add         r5, r5, r6, lsl #16     ; p[2|1]
    490     add         r6, r6, r7, lsl #16     ; p[3|2]
    491     add         r7, r7, r8, lsl #16     ; p[4|3]
    492
    493     ldr         r12, c00020002          ; +2 per halfword for the 3-tap filters
    494     ldr         lr,  c00FF00FF          ; mask: low byte of each halfword
    495     ldr         r2,  c00010001          ; +1 per halfword for the rounded averages
    496
    497     pkhtb       r8, r7, r9              ; p[4|5]
    498     pkhtb       r1, r9, r10             ; p[7|6]
    499     pkhbt       r10, r8, r10, lsl #16   ; p[6|5]
    500
    501     uadd16      r11, r4, r5             ; [p1+p2        |        p0+p1]
    502     uhadd16     r11, r11, r2            ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
    503                                         ; [B|A]
    504
    505     add         r4, r4, r5, lsl #1      ; [p1+2*p2      |      p0+2*p1]
    506     add         r4, r4, r6              ; [p1+2*p2+p3   |   p0+2*p1+p2]
    507     uxtab16     r4, r4, r12             ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
    508
    509     uadd16      r0, r6, r7              ; [p3+p4        |        p2+p3]
    510     uhadd16     r0, r0, r2              ; [(p3+p4+1)>>1 | (p2+p3+1)>>1]
    511                                         ; [F|E]
    512
    513     add         r5, r6, r7, lsl #1      ; [p3+2*p4      |      p2+2*p3]
    514     add         r5, r5, r8, ror #16     ; [p3+2*p4+p5   |   p2+2*p3+p4]
    515     uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p2+2*p3+p4+2]
    516
    517     add         r6, r12, r8, ror #16    ; [p5+2         |         p4+2]
    518     add         r6, r6, r10, lsl #1     ; [p5+2+2*p6    |    p4+2+2*p5]
    519     uxtab16     r6, r6, r1              ; [p5+2+2*p6+p7 | p4+2+2*p5+p6]
    520
    521     ; scale down
    522     and         r4, lr, r4, asr #2      ; [D|C]
    523     and         r5, lr, r5, asr #2      ; [H|G]
    524     and         r6, lr, r6, asr #2      ; [J|I]
    525
        ; lr (the mask) is no longer needed and is reused for dst_stride.
    526     ldr         lr, [sp, #44]           ; dst_stride
    527     ldr         r3, [sp, #40]           ; dst
    528
    529     pkhtb       r2, r0, r6              ; [-|F|-|I]
    530     pkhtb       r12, r6, r5, asr #16    ; [-|J|-|H]
    531     add         r12, r12, r2, lsl #8    ; [F|J|I|H]
    532     add         r2, r0, r5, lsl #8      ; [H|F|G|E]
    533     mov         r12, r12, ror #24       ; [J|I|H|F]
    534     str         r12, [r3], lr           ; row 0
    535
    536     mov         r7, r11, asr #16        ; [-|-|-|B]
    537     str         r2, [r3], lr            ; row 1
    538     add         r7, r7, r0, lsl #16     ; [-|E|-|B]
    539     add         r7, r7, r4, asr #8      ; [-|E|D|B]
    540     add         r7, r7, r5, lsl #24     ; [G|E|D|B]
    541     str         r7, [r3], lr            ; row 2
    542
    543     add         r5, r11, r4, lsl #8     ; [D|B|C|A]
    544     str         r5, [r3]                ; row 3
    545
    546     pop        {r4-r12, pc}
    547 
    548 
    549 
    550 b_hu_pred
        ; HU mode, computed from the left column only (p0..p3 = Left[0..3]):
        ;   A = (p0 + p1 + 1) >> 1      C = (p0 + 2*p1 + p2 + 2) >> 2
        ;   B = (p1 + p2 + 1) >> 1      D = (p1 + 2*p2 + p3 + 2) >> 2
        ;   E = (p2 + p3 + 1) >> 1      F = (p2 + 2*p3 + p3 + 2) >> 2
        ;   G = p3
        ; Rows, left to right: A C B D / B D E F / E F G G / G G G G.
        ; Lane comments read [high halfword | low halfword]; four-part
        ; comments list bytes from most to least significant.
        ; In:  r1 = yleft, r2 = left_stride, [sp, #40] = dst,
        ;      [sp, #44] = dst_stride.
    551     ldrb        r4, [r1], r2            ; Left[0]
    552     ldr         r12, c00020002          ; +2 per halfword for the 3-tap filters
    553     ldrb        r5, [r1], r2            ; Left[1]
    554     ldr         lr,  c00FF00FF          ; mask: low byte of each halfword
    555     ldrb        r6, [r1], r2            ; Left[2]
    556     ldr         r2,  c00010001          ; left_stride is done with; r2 = +1 per halfword
    557     ldrb        r7, [r1]                ; Left[3]
    558
    559     add         r4, r4, r5, lsl #16     ; [1|0]
    560     add         r5, r5, r6, lsl #16     ; [2|1]
    561     add         r9, r6, r7, lsl #16     ; [3|2]
    562
    563     uadd16      r8, r4, r5              ; [p1+p2        |        p0+p1]
    564     uhadd16     r8, r8, r2              ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
    565                                         ; [B|A]
    566
    567     add         r4, r4, r5, lsl #1      ; [p1+2*p2      |      p0+2*p1]
    568     add         r4, r4, r9              ; [p1+2*p2+p3   |   p0+2*p1+p2]
    569     uxtab16     r4, r4, r12             ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
    570     ldr         r2, [sp, #44]           ; dst_stride
    571     ldr         r3, [sp, #40]           ; dst
    572     and         r4, lr, r4, asr #2      ; [D|C]
    573
    574     add         r10, r6, r7             ; [p2+p3]
    575     add         r11, r10, r7, lsl #1    ; [p2+3*p3]
    576     add         r10, r10, #1
    577     add         r11, r11, #2
    578     mov         r10, r10, asr #1        ; [E]
    579     mov         r11, r11, asr #2        ; [F]
    580
        ; r9 = [p3|p2]; asr #8 moves p3 into byte 1 (p2 < 256 shifts out).
    581     add         r9, r7, r9, asr #8      ; [-|-|G|G]
    582     add         r0, r8, r4, lsl #8      ; [D|B|C|A]
    583     add         r7, r9, r9, lsl #16     ; [G|G|G|G]
    584
    585     str         r0, [r3], r2            ; row 0
    586
    587     mov         r1, r8, asr #16         ; [-|-|-|B]
    588     add         r1, r1, r4, asr #8      ; [-|-|D|B]
    589     add         r1, r1, r10, lsl #16    ; [-|E|D|B]
    590     add         r1, r1, r11, lsl #24    ; [F|E|D|B]
    591     str         r1, [r3], r2            ; row 1
    592
        ; Destination written out explicitly (same instruction as the
        ; two-operand shorthand) to match every other add in this file.
    593     add         r10, r10, r11, lsl #8   ; [-|-|F|E]
    594     add         r10, r10, r9, lsl #16   ; [G|G|F|E]
    595     str         r10, [r3], r2           ; row 2
    596
    597     str         r7, [r3]                ; row 3
    598
    599     pop        {r4-r12, pc}
    600
    601     ENDP
    602 
    603 ; constants
        ; Literal words read by the mode handlers with "ldr rX, <label>".
        ; Each holds the same 16-bit value in both halfword lanes so that one
        ; packed instruction applies it to two pixels at once.
    604 c00010001
    605     DCD         0x00010001              ; +1 rounding for the (x + y + 1) >> 1 averages (uhadd16)
    606 c00020002
    607     DCD         0x00020002              ; +2 rounding for the (x + 2*y + z + 2) >> 2 filters
    608 c00FF00FF
    609     DCD         0x00FF00FF              ; keeps the low byte of each halfword after asr #2
    610 
    611     END
    612