;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_loop_filter_horizontal_edge_armv6|
    EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
    EXPORT |vp8_loop_filter_vertical_edge_armv6|
    EXPORT |vp8_mbloop_filter_vertical_edge_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code

    MACRO
    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; a0: 03 02 01 00
    ; a1: 13 12 11 10
    ; a2: 23 22 21 20
    ; a3: 33 32 31 30
    ;     b3 b2 b1 b0

    uxtb16      $b1, $a1                    ; xx 12 xx 10
    uxtb16      $b0, $a0                    ; xx 02 xx 00
    uxtb16      $b3, $a3                    ; xx 32 xx 30
    uxtb16      $b2, $a2                    ; xx 22 xx 20
    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20

    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21

    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3

    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
    MEND
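
; A C sketch of what the macro computes (illustrative only, not part of
; the build): with a0..a3 holding one 4x4 block of bytes, the outputs
; satisfy out[i][j] = in[j][i], i.e. a 4x4 byte-matrix transpose.
; uxtb16 extracts the even bytes of a word (or the odd bytes with
; ror #8) into two halfwords; pkhbt/pkhtb then repack bottom/top
; halfword pairs into the transposed words.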


src         RN  r0
pstep       RN  r1
count       RN  r5

;r0     unsigned char *src_ptr,
;r1     int src_pixel_step,
;r2     const char *blimit,
;r3     const char *limit,
;stack  const char *thresh,
;stack  int  count
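
; AAPCS mapping (for reference): r0-r3 carry the first four arguments;
; thresh and count are fetched from the stack. A C-level prototype would
; look roughly like this (a sketch; the exact declaration lives in the
; project's C headers):
;   void vp8_loop_filter_horizontal_edge_armv6(
;       unsigned char *src_ptr, int src_pixel_step,
;       const char *blimit, const char *limit,
;       const char *thresh, int count);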

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r6, [sp, #36]               ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r9, [src], pstep            ; p3
    ldrb        r4, [r2]                    ; blimit
    ldr         r10, [src], pstep           ; p2
    ldrb        r2, [r3]                    ; limit
    ldr         r11, [src], pstep           ; p1
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r6]                    ; thresh
    orr         r2, r2, r2, lsl #8
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16
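
    ; blimit, limit and thresh are single bytes; the orr/lsl ladder
    ; above replicates each across all four byte lanes of r4/r2/r3 so
    ; the uqsub8 comparisons below can test four pixels per instruction.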

|Hnext8|
    ; vp8_filter_mask() function
    ; calculate breakout conditions
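    ;
    ; In C terms the mask built below mirrors the reference
    ; vp8_filter_mask() (a sketch):
    ;   mask  = (abs(p3-p2) > limit) | (abs(p2-p1) > limit)
    ;         | (abs(p1-p0) > limit) | (abs(q1-q0) > limit)
    ;         | (abs(q2-q1) > limit) | (abs(q3-q2) > limit)
    ;         | (abs(p0-q0)*2 + abs(p1-q1)/2 > blimit);
    ; Each abs() is a pair of saturating uqsub8 results orr'd together,
    ; each '>' a uqsub8 against the replicated limit (nonzero only on
    ; failure); the accumulated byte is zero only where every test
    ; passes, and usub8/sel expand that into a per-pixel 0xFF mask.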
    ldr         r12, [src], pstep           ; p0

    uqsub8      r6, r9, r10                 ; p3 - p2
    uqsub8      r7, r10, r9                 ; p2 - p3
    uqsub8      r8, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2

    orr         r6, r6, r7                  ; abs (p3-p2)
    orr         r8, r8, r10                 ; abs (p2-p1)
    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r8, r8, r2                  ; compare to limit
    uqsub8      r6, r11, r12                ; p1 - p0
    orr         lr, lr, r8
    uqsub8      r7, r12, r11                ; p0 - p1
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1
    orr         r6, r6, r7                  ; abs (p1-p0)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
    orr         lr, lr, r7

    uqsub8      r6, r11, r10                ; p1 - q1
    uqsub8      r7, r10, r11                ; q1 - p1
    uqsub8      r11, r12, r9                ; p0 - q0
    uqsub8      r12, r9, r12                ; q0 - p0
    orr         r6, r6, r7                  ; abs (p1-q1)
    ldr         r7, c0x7F7F7F7F
    orr         r12, r11, r12               ; abs (p0-q0)
    ldr         r11, [src], pstep           ; q2
    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r7, r9, r10                 ; q0 - q1
    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r6, r10, r9                 ; q1 - q0
    uqsub8      r12, r12, r4                ; compare to blimit
    uqsub8      r9, r11, r10                ; q2 - q1

    orr         lr, lr, r12

    ldr         r12, [src], pstep           ; q3
    uqsub8      r10, r10, r11               ; q1 - q2
    orr         r6, r7, r6                  ; abs (q1-q0)
    orr         r10, r9, r10                ; abs (q2-q1)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r10, r10, r2                ; compare to limit
    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
    orr         lr, lr, r7
    orr         lr, lr, r10

    uqsub8      r10, r12, r11               ; q3 - q2
    uqsub8      r9, r11, r12                ; q2 - q3

    mvn         r11, #0                     ; r11 == -1

    orr         r10, r10, r9                ; abs (q3-q2)
    uqsub8      r10, r10, r2                ; compare to limit

    mov         r12, #0
    orr         lr, lr, r10
    sub         src, src, pstep, lsl #2

    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         hskip_filter                ; skip filtering

    sub         src, src, pstep, lsl #1     ; move src pointer down by 6 lines

    ;vp8_hevmask() function
    ;calculate high edge variance
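    ; In C terms (a sketch of vp8_hevmask()): a pixel has high edge
    ; variance when abs(p1-p0) > thresh or abs(q1-q0) > thresh; r8 and
    ; r6 hold exactly those two comparisons, saved during the mask
    ; calculation above.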
    orr         r10, r6, r8                 ; calculate vp8_hevmask

    ldr         r7, [src], pstep            ; p1

    usub8       r10, r12, r10               ; use usub8 instead of ssub8
    sel         r6, r12, r11                ; obtain vp8_hevmask: r6

    ;vp8_filter() function
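    ; Reference arithmetic for this section (a sketch; clamp is the
    ; usual signed-byte saturation):
    ;   vp8_filter = clamp(ps1 - qs1) & hev;
    ;   vp8_filter = clamp(vp8_filter + 3 * (qs0 - ps0)) & mask;
    ;   Filter1    = clamp(vp8_filter + 4) >> 3;   applied to qs0
    ;   Filter2    = clamp(vp8_filter + 3) >> 3;   applied to ps0
    ; The three qadd8s below add (qs0 - ps0) three times instead of
    ; multiplying by 3, saturating per byte at every step.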
    ldr         r8, [src], pstep            ; p0
    ldr         r12, c0x80808080
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev

    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    and         r7, r7, lr                  ; vp8_filter &= mask;

    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    mov         r9, #0
    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
    shadd8      r8 , r8 , r9
    shadd8      r7 , r7 , r9
    shadd8      lr , r8 , r9                ; lr: Filter2
    shadd8      r7 , r7 , r9                ; r7: filter
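
    ; shadd8 with a zero second operand halves each signed byte lane,
    ; so the repeated shadd8-by-zero above implements the per-byte
    ; arithmetic shift right by 3 that a plain asr cannot perform
    ; across packed lanes.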

    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
    ;sel        lr, r11, r9
    ;usub8      r8, r10, r8
    ;sel        r8, r11, r9
    ;and        r8, r8, lr                  ; -1 for each element that equals 4

    ;calculate output
    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)

    ;end of modification for vp8

    mov         lr, #0
    sadd8       r7, r7 , r10                ; vp8_filter += 1
    shadd8      r7, r7, lr                  ; vp8_filter >>= 1

    ldr         r11, [sp, #12]              ; load ps1
    ldr         r10, [sp, #8]               ; load qs1

    bic         r7, r7, r6                  ; vp8_filter &= ~hev
    sub         src, src, pstep, lsl #2

    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    qsub8       r10, r10, r7                ; u = vp8_signed_char_clamp(qs1 - vp8_filter)

    eor         r11, r11, r12               ; *op1 = u^0x80
    str         r11, [src], pstep           ; store op1
    eor         r9, r9, r12                 ; *op0 = u^0x80
    str         r9, [src], pstep            ; store op0 result
    eor         r8, r8, r12                 ; *oq0 = u^0x80
    str         r8, [src], pstep            ; store oq0 result
    eor         r10, r10, r12               ; *oq1 = u^0x80
    str         r10, [src], pstep           ; store oq1

    sub         src, src, pstep, lsl #1

|hskip_filter|
    add         src, src, #4
    sub         src, src, pstep, lsl #2

    subs        count, count, #1

    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1

    bne         Hnext8

    add         sp, sp, #16
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_horizontal_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_mbloop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r6, [sp, #36]               ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r9, [src], pstep            ; p3
    ldrb        r4, [r2]                    ; blimit
    ldr         r10, [src], pstep           ; p2
    ldrb        r2, [r3]                    ; limit
    ldr         r11, [src], pstep           ; p1
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r6]                    ; thresh
    orr         r2, r2, r2, lsl #8
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|MBHnext8|

    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ldr         r12, [src], pstep           ; p0

    uqsub8      r6, r9, r10                 ; p3 - p2
    uqsub8      r7, r10, r9                 ; p2 - p3
    uqsub8      r8, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2

    orr         r6, r6, r7                  ; abs (p3-p2)
    orr         r8, r8, r10                 ; abs (p2-p1)
    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r8, r8, r2                  ; compare to limit

    uqsub8      r6, r11, r12                ; p1 - p0
    orr         lr, lr, r8
    uqsub8      r7, r12, r11                ; p0 - p1
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1
    orr         r6, r6, r7                  ; abs (p1-p0)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
    orr         lr, lr, r7

    uqsub8      r6, r11, r10                ; p1 - q1
    uqsub8      r7, r10, r11                ; q1 - p1
    uqsub8      r11, r12, r9                ; p0 - q0
    uqsub8      r12, r9, r12                ; q0 - p0
    orr         r6, r6, r7                  ; abs (p1-q1)
    ldr         r7, c0x7F7F7F7F
    orr         r12, r11, r12               ; abs (p0-q0)
    ldr         r11, [src], pstep           ; q2
    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r7, r9, r10                 ; q0 - q1
    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r6, r10, r9                 ; q1 - q0
    uqsub8      r12, r12, r4                ; compare to blimit
    uqsub8      r9, r11, r10                ; q2 - q1

    orr         lr, lr, r12

    ldr         r12, [src], pstep           ; q3

    uqsub8      r10, r10, r11               ; q1 - q2
    orr         r6, r7, r6                  ; abs (q1-q0)
    orr         r10, r9, r10                ; abs (q2-q1)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r10, r10, r2                ; compare to limit
    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
    orr         lr, lr, r7
    orr         lr, lr, r10

    uqsub8      r10, r12, r11               ; q3 - q2
    uqsub8      r9, r11, r12                ; q2 - q3

    mvn         r11, #0                     ; r11 == -1

    orr         r10, r10, r9                ; abs (q3-q2)
    uqsub8      r10, r10, r2                ; compare to limit

    mov         r12, #0

    orr         lr, lr, r10

    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         mbhskip_filter              ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance
    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
    sub         src, src, pstep, lsl #1

    orr         r10, r6, r8
    ldr         r7, [src], pstep            ; p1

    usub8       r10, r12, r10
    sel         r6, r12, r11                ; hev mask: r6

    ;vp8_mbfilter() function
    ;p2, q2 are only needed at the end. Don't need to load them in now.
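    ; The wide filter applies three decreasing taps either side of the
    ; edge. In C terms (a sketch of the reference vp8_mbfilter()):
    ;   u = clamp((27 * w + 63) >> 7);  op0 = ps0 + u;  oq0 = qs0 - u;
    ;   u = clamp((18 * w + 63) >> 7);  op1 = ps1 + u;  oq1 = qs1 - u;
    ;   u = clamp(( 9 * w + 63) >> 7);  op2 = ps2 + u;  oq2 = qs2 - u;
    ; where w = vp8_filter & ~hev, i.e. roughly 3/7, 2/7 and 1/7 of the
    ; boundary difference, with saturating adds and subtracts.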
    ldr         r8, [src], pstep            ; p0
    ldr         r12, c0x80808080
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src]                  ; q1

    eor         r7, r7, r12                 ; ps1
    eor         r8, r8, r12                 ; ps0
    eor         r9, r9, r12                 ; qs0
    eor         r10, r10, r12               ; qs1

    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    str         r7, [sp, #12]               ; store ps1 temporarily
    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    str         r10, [sp, #8]               ; store qs1 temporarily
    qadd8       r7, r7, r12
    str         r9, [sp]                    ; store qs0 temporarily
    qadd8       r7, r7, r12
    str         r8, [sp, #4]                ; store ps0 temporarily
    qadd8       r7, r7, r12                 ; vp8_filter: r7

    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
    ldr         r9, c0x04040404

    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)

    mov         r12, r7                     ; Filter2: r12
    and         r12, r12, r6                ; Filter2 &= hev

    ;modify code for vp8
    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

    mov         r10, #0
    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
    shadd8      r8 , r8 , r10
    shadd8      r12 , r12 , r10
    shadd8      r8 , r8 , r10               ; r8: Filter1
    shadd8      r12 , r12 , r10             ; r12: Filter2

    ldr         r9, [sp]                    ; load qs0
    ldr         r11, [sp, #4]               ; load ps0

    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)

    ;save bottom 3 bits so that we round one side +4 and the other +3
    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
    ;mov            r10, #0
    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
    ;sel            lr, r11, r10
    ;shadd8     r12 , r12 , r10
    ;usub8      r8, r9, r8
    ;sel            r8, r11, r10
    ;ldr            r9, [sp]                    ; load qs0
    ;ldr            r11, [sp, #4]               ; load ps0
    ;shadd8     r12 , r12 , r10
    ;and            r8, r8, lr                  ; -1 for each element that equals 4
    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)

    ;end of modification for vp8

    bic         r12, r7, r6                 ; vp8_filter &= ~hev    ( r6 is free)
    ;mov        r12, r7

    ;roughly 3/7th difference across boundary
    mov         lr, #0x1b                   ; 27
    mov         r7, #0x3f                   ; 63
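
    ; ARMv6 has no packed 8-bit multiply, so the four filter bytes in
    ; r12 are sign-extended into 16-bit lanes (sxtb16, plain and
    ; ror #8), each lane gets 27 * w + 63 via smlabb/smlatb (one lane
    ; uses smultb plus a separate add of 63 because r7 is recycled),
    ; and ssat ... asr #7 performs the >>7 with saturation to signed
    ; 8 bits -- the clamp((27 * w + 63) >> 7) of the sketch above.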

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r7, r10, lr, r7
    smultb      r10, r10, lr
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    add         r10, r10, #63
    ssat        r7, #8, r7, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r7, r10, lsl #16
    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
    eor         r8, r8, lr                  ; *oq0 = s^0x80
    str         r8, [src]                   ; store *oq0
    sub         src, src, pstep
    eor         r10, r10, lr                ; *op0 = s^0x80
    str         r10, [src]                  ; store *op0

    ;roughly 2/7th difference across boundary
    mov         lr, #0x12                   ; 18
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r9, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r9, #8, r9, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r9, r10, lsl #16

    ldr         r9, [sp, #8]                ; load qs1
    ldr         r11, [sp, #12]              ; load ps1

    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

    qadd8       r11, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
    eor         r11, r11, lr                ; *op1 = s^0x80
    str         r11, [src], pstep           ; store *op1
    eor         r8, r8, lr                  ; *oq1 = s^0x80
    add         src, src, pstep, lsl #1

    mov         r7, #0x3f                   ; 63

    str         r8, [src], pstep            ; store *oq1

    ;roughly 1/7th difference across boundary
    mov         lr, #0x9                    ; 9
    ldr         r9, [src]                   ; load q2

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r12, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r12, #8, r12, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r12, r10, lsl #16

    sub         src, src, pstep
    ldr         lr, c0x80808080

    ldr         r11, [src]                  ; load p2

    uxtb16      r6, r6
    uxtb16      r10, r10

    eor         r9, r9, lr
    eor         r11, r11, lr

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
    eor         r8, r8, lr                  ; *op2 = s^0x80
    str         r8, [src], pstep, lsl #2    ; store *op2
    add         src, src, pstep
    eor         r10, r10, lr                ; *oq2 = s^0x80
    str         r10, [src], pstep, lsl #1   ; store *oq2

|mbhskip_filter|
    add         src, src, #4
    sub         src, src, pstep, lsl #3
    subs        count, count, #1

    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1

    bne         MBHnext8

    add         sp, sp, #16
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_mbloop_filter_horizontal_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, #4                ; move src pointer down by 4
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
    ldrb        r4, [r2]                    ; blimit
    ldr         r7, [src], pstep
    ldrb        r2, [r3]                    ; limit
    ldr         r8, [src], pstep
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r12]                   ; thresh
    orr         r2, r2, r2, lsl #8
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|Vnext8|

    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; transpose the source data for 4-in-parallel operation
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
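
    ; For vertical edges the pixels of interest lie along each row, so
    ; four words (one per row) are transposed with TRANSPOSE_MATRIX to
    ; put one pixel column in each of r9-r12; the same byte-lane
    ; arithmetic as the horizontal case then applies, and the results
    ; are transposed back before being stored.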

    uqsub8      r7, r9, r10                 ; p3 - p2
    uqsub8      r8, r10, r9                 ; p2 - p3
    uqsub8      r9, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2
    orr         r7, r7, r8                  ; abs (p3-p2)
    orr         r10, r9, r10                ; abs (p2-p1)
    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r10, r10, r2                ; compare to limit

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         lr, lr, r10

    uqsub8      r6, r11, r12                ; p1 - p0
    uqsub8      r7, r12, r11                ; p0 - p1
    add         src, src, #4                ; move src pointer up by 4
    orr         r6, r6, r7                  ; abs (p1-p0)
    str         r11, [sp, #12]              ; save p1
    uqsub8      r10, r6, r2                 ; compare to limit
    uqsub8      r11, r6, r3                 ; compare to thresh
    orr         lr, lr, r10

    ; transpose uses 8 regs (r6 - r12 and lr). Need to save reg values now
    ; transpose the source data for 4-in-parallel operation
    ldr         r6, [src], pstep            ; load source data
    str         r11, [sp]                   ; push r11 to stack
    ldr         r7, [src], pstep
    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
    ldr         r8, [src], pstep
    str         lr, [sp, #8]
    ldr         lr, [src], pstep

    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back (f)limit accumulator

    uqsub8      r6, r12, r11                ; q3 - q2
    uqsub8      r7, r11, r12                ; q2 - q3
    uqsub8      r12, r11, r10               ; q2 - q1
    uqsub8      r11, r10, r11               ; q1 - q2
    orr         r6, r6, r7                  ; abs (q3-q2)
    orr         r7, r12, r11                ; abs (q2-q1)
    uqsub8      r6, r6, r2                  ; compare to limit
    uqsub8      r7, r7, r2                  ; compare to limit
    ldr         r11, [sp, #4]               ; load back p0
    ldr         r12, [sp, #12]              ; load back p1
    orr         lr, lr, r6
    orr         lr, lr, r7

    uqsub8      r6, r11, r9                 ; p0 - q0
    uqsub8      r7, r9, r11                 ; q0 - p0
    uqsub8      r8, r12, r10                ; p1 - q1
    uqsub8      r11, r10, r12               ; q1 - p1
    orr         r6, r6, r7                  ; abs (p0-q0)
    ldr         r7, c0x7F7F7F7F
    orr         r8, r8, r11                 ; abs (p1-q1)
    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r11, r10, r9                ; q1 - q0
    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r12, r9, r10                ; q0 - q1
    uqsub8      r6, r6, r4                  ; compare to blimit

    orr         r9, r11, r12                ; abs (q1-q0)
    uqsub8      r8, r9, r2                  ; compare to limit
    uqsub8      r10, r9, r3                 ; compare to thresh
    orr         lr, lr, r6
    orr         lr, lr, r8

    mvn         r11, #0                     ; r11 == -1
    mov         r12, #0

    usub8       lr, r12, lr
    ldr         r9, [sp]                    ; load the compared result
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         vskip_filter                ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r9, r9, r10

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    usub8       r9, r12, r9
    sel         r6, r12, r11                ; hev mask: r6

    ;vp8_filter() function
    ; load source data to r6, r11, r12, lr
    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    pkhbt       r12, r7, r8, lsl #16

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    pkhbt       r11, r9, r10, lsl #16

    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    ; Transpose needs 8 regs (r6 - r12, and lr). Save r6 and lr first
    str         r6, [sp]
    str         lr, [sp, #4]

    pkhbt       r6, r7, r8, lsl #16
    pkhbt       lr, r9, r10, lsl #16

    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10

    ;load back hev_mask r6 and filter_mask lr
    ldr         r12, c0x80808080
    ldr         r6, [sp]
    ldr         lr, [sp, #4]

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev (r7 : filter)

    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    ;mvn         r11, #0                     ; r11 == -1

    and         r7, r7, lr                  ; vp8_filter &= mask

    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    mov         r9, #0
    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
    shadd8      r8 , r8 , r9
    shadd8      r7 , r7 , r9
    shadd8      lr , r8 , r9                ; lr: Filter2
    shadd8      r7 , r7 , r9                ; r7: filter

    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
    ;sel            lr, r11, r9
    ;usub8      r8, r10, r8
    ;sel            r8, r11, r9
    ;and            r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s

    ;calculate output
    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
    ;end of modification for vp8

    eor         r8, r8, r12
    eor         r9, r9, r12

    mov         lr, #0

    sadd8       r7, r7, r10
    shadd8      r7, r7, lr

    ldr         r10, [sp, #8]               ; load qs1
    ldr         r11, [sp, #12]              ; load ps1

    bic         r7, r7, r6                  ; r7: vp8_filter

    qsub8       r10 , r10, r7               ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    eor         r10, r10, r12
    eor         r11, r11, r12

    sub         src, src, pstep, lsl #2

    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
    ;output is b0, b1, b2, b3
    ;b0: 03 02 01 00
    ;b1: 13 12 11 10
    ;b2: 23 22 21 20
    ;b3: 33 32 31 30
    ;    p1 p0 q0 q1
    ;   (a3 a2 a1 a0)
    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr

    strh        r6, [src, #-2]              ; store the result
    mov         r6, r6, lsr #16
    strh        r6, [src], pstep

    strh        r7, [src, #-2]
    mov         r7, r7, lsr #16
    strh        r7, [src], pstep

    strh        r12, [src, #-2]
    mov         r12, r12, lsr #16
    strh        r12, [src], pstep

    strh        lr, [src, #-2]
    mov         lr, lr, lsr #16
    strh        lr, [src], pstep

|vskip_filter|
    sub         src, src, #4
    subs        count, count, #1

    ldrne       r6, [src], pstep            ; load source data
    ldrne       r7, [src], pstep
    ldrne       r8, [src], pstep
    ldrne       lr, [src], pstep

    bne         Vnext8

    add         sp, sp, #16

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_vertical_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_mbloop_filter_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, #4                ; move src pointer down by 4
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
    pld         [src, #23]                  ; preload for next block
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
    ldrb        r4, [r2]                    ; blimit
    pld         [src, #23]
    ldr         r7, [src], pstep
    ldrb        r2, [r3]                    ; limit
    pld         [src, #23]
    ldr         r8, [src], pstep
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r12]                   ; thresh
    orr         r2, r2, r2, lsl #8
    pld         [src, #23]
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|MBVnext8|
    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; transpose the source data for 4-in-parallel operation
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    uqsub8      r7, r9, r10                 ; p3 - p2
    uqsub8      r8, r10, r9                 ; p2 - p3
    uqsub8      r9, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2
    orr         r7, r7, r8                  ; abs (p3-p2)
    orr         r10, r9, r10                ; abs (p2-p1)
    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r10, r10, r2                ; compare to limit

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         lr, lr, r10

    uqsub8      r6, r11, r12                ; p1 - p0
    uqsub8      r7, r12, r11                ; p0 - p1
    add         src, src, #4                ; move src pointer up by 4
    orr         r6, r6, r7                  ; abs (p1-p0)
    str         r11, [sp, #12]              ; save p1
    uqsub8      r10, r6, r2                 ; compare to limit
    uqsub8      r11, r6, r3                 ; compare to thresh
    orr         lr, lr, r10

    ; transpose uses 8 regs (r6 - r12 and lr). Need to save reg values now
    ; transpose the source data for 4-in-parallel operation
    ldr         r6, [src], pstep            ; load source data
    str         r11, [sp]                   ; push r11 to stack
    ldr         r7, [src], pstep
    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
    ldr         r8, [src], pstep
    str         lr, [sp, #8]
    ldr         lr, [src], pstep

    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back (f)limit accumulator

    uqsub8      r6, r12, r11                ; q3 - q2
    uqsub8      r7, r11, r12                ; q2 - q3
    uqsub8      r12, r11, r10               ; q2 - q1
    uqsub8      r11, r10, r11               ; q1 - q2
    orr         r6, r6, r7                  ; abs (q3-q2)
    orr         r7, r12, r11                ; abs (q2-q1)
    uqsub8      r6, r6, r2                  ; compare to limit
    uqsub8      r7, r7, r2                  ; compare to limit
    ldr         r11, [sp, #4]               ; load back p0
    ldr         r12, [sp, #12]              ; load back p1
    orr         lr, lr, r6
    orr         lr, lr, r7

    uqsub8      r6, r11, r9                 ; p0 - q0
    uqsub8      r7, r9, r11                 ; q0 - p0
    uqsub8      r8, r12, r10                ; p1 - q1
    uqsub8      r11, r10, r12               ; q1 - p1
    orr         r6, r6, r7                  ; abs (p0-q0)
    ldr         r7, c0x7F7F7F7F
    orr         r8, r8, r11                 ; abs (p1-q1)
    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r11, r10, r9                ; q1 - q0
    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r12, r9, r10                ; q0 - q1
    uqsub8      r6, r6, r4                  ; compare to blimit

    orr         r9, r11, r12                ; abs (q1-q0)
    uqsub8      r8, r9, r2                  ; compare to limit
    uqsub8      r10, r9, r3                 ; compare to thresh
    orr         lr, lr, r6
    orr         lr, lr, r8

    mvn         r11, #0                     ; r11 == -1
    mov         r12, #0

    usub8       lr, r12, lr
    ldr         r9, [sp]                    ; load the compared result
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         mbvskip_filter              ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r9, r9, r10

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    usub8       r9, r12, r9
    sel         r6, r12, r11                ; hev mask: r6

    ; vp8_mbfilter() function
    ; p2, q2 are only needed at the end. Don't need to load them in now.
    ; Transpose needs 8 regs (r6 - r12, and lr). Save r6 and lr first
    ; load source data to r6, r11, r12, lr
    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    pkhbt       r12, r7, r8, lsl #16

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    pkhbt       r11, r9, r10, lsl #16

    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    str         r6, [sp]                    ; save r6
    str         lr, [sp, #4]                ; save lr

    pkhbt       r6, r7, r8, lsl #16
    pkhbt       lr, r9, r10, lsl #16

    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10

    ;load back hev_mask r6 and filter_mask lr
    ldr         r12, c0x80808080
    ldr         r6, [sp]
    ldr         lr, [sp, #4]

    eor         r7, r7, r12                 ; ps1
    eor         r8, r8, r12                 ; ps0
    eor         r9, r9, r12                 ; qs0
    eor         r10, r10, r12               ; qs1

    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    str         r7, [sp, #12]               ; store ps1 temporarily
    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    str         r10, [sp, #8]               ; store qs1 temporarily
    qadd8       r7, r7, r12
    str         r9, [sp]                    ; store qs0 temporarily
    qadd8       r7, r7, r12
    str         r8, [sp, #4]                ; store ps0 temporarily
    qadd8       r7, r7, r12                 ; vp8_filter: r7

    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
    ldr         r9, c0x04040404
    ;mvn         r11, #0                     ; r11 == -1

    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)

    mov         r12, r7                     ; Filter2: r12
    and         r12, r12, r6                ; Filter2 &= hev

    ;modify code for vp8
    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

    mov         r10, #0
    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
    shadd8      r8 , r8 , r10
    shadd8      r12 , r12 , r10
    shadd8      r8 , r8 , r10               ; r8: Filter1
    shadd8      r12 , r12 , r10             ; r12: Filter2

    ldr         r9, [sp]                    ; load qs0
    ldr         r11, [sp, #4]               ; load ps0

    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)

    ;save bottom 3 bits so that we round one side +4 and the other +3
    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
    ;mov            r10, #0
    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
    ;sel            lr, r11, r10
    ;shadd8     r12 , r12 , r10
    ;usub8      r8, r9, r8
    ;sel            r8, r11, r10
    ;ldr            r9, [sp]                    ; load qs0
    ;ldr            r11, [sp, #4]               ; load ps0
    ;shadd8     r12 , r12 , r10
    ;and            r8, r8, lr                  ; -1 for each element that equals 4
    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)

    ;end of modification for vp8

    bic         r12, r7, r6                 ; vp8_filter &= ~hev    ( r6 is free)
    ;mov            r12, r7

    ;roughly 3/7th difference across boundary
    mov         lr, #0x1b                   ; 27
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r7, r10, lr, r7
    smultb      r10, r10, lr
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    add         r10, r10, #63
    ssat        r7, #8, r7, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r7, r10, lsl #16
    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
    eor         r8, r8, lr                  ; *oq0 = s^0x80
    eor         r10, r10, lr                ; *op0 = s^0x80

    strb        r10, [src, #-1]             ; store op0 result
    strb        r8, [src], pstep            ; store oq0 result
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep

    ;roughly 2/7th difference across boundary
    mov         lr, #0x12                   ; 18
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r9, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r9, #8, r9, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r9, r10, lsl #16

    ldr         r9, [sp, #8]                ; load qs1
    ldr         r11, [sp, #12]              ; load ps1
    ldr         lr, c0x80808080

    uxtb16      r6, r6
    uxtb16      r10, r10

    add         src, src, #2

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
    eor         r8, r8, lr                  ; *oq1 = s^0x80
    eor         r10, r10, lr                ; *op1 = s^0x80

    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
    strb        r10, [src, #-4]             ; store op1
    strb        r8, [src, #-1]              ; store oq1
    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    orr         r11, r11, r6, lsl #8
    orr         r9, r9, r7, lsl #8

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    orr         r11, r11, r6, lsl #16
    orr         r9, r9, r7, lsl #16

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep
    orr         r11, r11, r6, lsl #24
    orr         r9, r9, r7, lsl #24

    ;roughly 1/7th difference across boundary
    eor         r9, r9, lr
    eor         r11, r11, lr

    mov         lr, #0x9                    ; 9
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r12, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r12, #8, r12, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r12, r10, lsl #16

    uxtb16      r6, r6
    uxtb16      r10, r10

    ldr         lr, c0x80808080

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
    eor         r8, r8, lr                  ; *op2 = s^0x80
    eor         r10, r10, lr                ; *oq2 = s^0x80

    strb        r8, [src, #-5]              ; store *op2
    strb        r10, [src], pstep           ; store *oq2
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep

    ;adjust src pointer for next loop
    sub         src, src, #2

|mbvskip_filter|
    sub         src, src, #4
    subs        count, count, #1

    pld         [src, #23]                  ; preload for next block
    ldrne       r6, [src], pstep            ; load source data
    pld         [src, #23]
    ldrne       r7, [src], pstep
    pld         [src, #23]
    ldrne       r8, [src], pstep
    pld         [src, #23]
    ldrne       lr, [src], pstep

    bne         MBVnext8

    add         sp, sp, #16

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|

; Constant Pool
c0x80808080 DCD     0x80808080
c0x03030303 DCD     0x03030303
c0x04040404 DCD     0x04040404
c0x01010101 DCD     0x01010101
c0x7F7F7F7F DCD     0x7F7F7F7F
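
; Each DCD above emits a 32-bit literal with one byte value replicated
; across all four lanes; the ldr rX, cNAME instructions in the code are
; PC-relative loads of these words.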

    END