;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_loop_filter_horizontal_edge_armv6|
    EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
    EXPORT |vp8_loop_filter_vertical_edge_armv6|
    EXPORT |vp8_mbloop_filter_vertical_edge_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code

    MACRO
    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; a0: 03 02 01 00
    ; a1: 13 12 11 10
    ; a2: 23 22 21 20
    ; a3: 33 32 31 30
    ;     b3 b2 b1 b0

    uxtb16      $b1, $a1                    ; xx 12 xx 10
    uxtb16      $b0, $a0                    ; xx 02 xx 00
    uxtb16      $b3, $a3                    ; xx 32 xx 30
    uxtb16      $b2, $a2                    ; xx 22 xx 20
    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20

    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21

    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3

    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
    MEND
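
; Usage note: the macro is a 4x4 byte transpose. A rough C sketch of the
; same effect (illustrative only, not part of the build):
;   for (i = 0; i < 4; i++)
;     for (j = 0; j < 4; j++)
;       b[j] |= ((a[i] >> (8 * j)) & 0xff) << (8 * i);
; so $b0 collects byte 0 of every input word, $b1 byte 1, and so on.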


src         RN  r0
pstep       RN  r1
count       RN  r5

;r0     unsigned char *src_ptr,
;r1     int src_pixel_step,
;r2     const char *flimit,
;r3     const char *limit,
;stack  const char *thresh,
;stack  int  count

;Note: All 16 elements in flimit are equal, so only one load is needed
;for flimit. The same applies to limit and thresh.
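
;The byte-wise compares below rely on unsigned saturating subtraction:
;per byte, uqsub8 d, a, b gives max(a - b, 0), so a non-zero result byte
;means a > b. ORing these results into lr accumulates the breakout tests;
;any pixel whose accumulated byte is non-zero is excluded from filtering.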

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r6, [sp, #36]               ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r9, [src], pstep            ; p3
    ldr         r4, [r2], #4                ; flimit
    ldr         r10, [src], pstep           ; p2
    ldr         r2, [r3], #4                ; limit
    ldr         r11, [src], pstep           ; p1
    uadd8       r4, r4, r4                  ; flimit * 2
    ldr         r3, [r6], #4                ; thresh
    mov         count, count, lsl #1        ; 4-in-parallel
    uadd8       r4, r4, r2                  ; flimit * 2 + limit

|Hnext8|
    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ldr         r12, [src], pstep           ; p0

    uqsub8      r6, r9, r10                 ; p3 - p2
    uqsub8      r7, r10, r9                 ; p2 - p3
    uqsub8      r8, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2

    orr         r6, r6, r7                  ; abs (p3-p2)
    orr         r8, r8, r10                 ; abs (p2-p1)
    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r8, r8, r2                  ; compare to limit
    uqsub8      r6, r11, r12                ; p1 - p0
    orr         lr, lr, r8
    uqsub8      r7, r12, r11                ; p0 - p1
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1
    orr         r6, r6, r7                  ; abs (p1-p0)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
    orr         lr, lr, r7

    uqsub8      r6, r11, r10                ; p1 - q1
    uqsub8      r7, r10, r11                ; q1 - p1
    uqsub8      r11, r12, r9                ; p0 - q0
    uqsub8      r12, r9, r12                ; q0 - p0
    orr         r6, r6, r7                  ; abs (p1-q1)
    ldr         r7, c0x7F7F7F7F
    orr         r12, r11, r12               ; abs (p0-q0)
    ldr         r11, [src], pstep           ; q2
    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r7, r9, r10                 ; q0 - q1
    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r6, r10, r9                 ; q1 - q0
    uqsub8      r12, r12, r4                ; compare to flimit
    uqsub8      r9, r11, r10                ; q2 - q1

    orr         lr, lr, r12

    ldr         r12, [src], pstep           ; q3
    uqsub8      r10, r10, r11               ; q1 - q2
    orr         r6, r7, r6                  ; abs (q1-q0)
    orr         r10, r9, r10                ; abs (q2-q1)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r10, r10, r2                ; compare to limit
    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
    orr         lr, lr, r7
    orr         lr, lr, r10

    uqsub8      r10, r12, r11               ; q3 - q2
    uqsub8      r9, r11, r12                ; q2 - q3

    mvn         r11, #0                     ; r11 == -1

    orr         r10, r10, r9                ; abs (q3-q2)
    uqsub8      r10, r10, r2                ; compare to limit

    mov         r12, #0
    orr         lr, lr, r10
    sub         src, src, pstep, lsl #2

    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
    sel         lr, r11, r12                ; filter mask: lr
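
    ; usub8 subtracts lr from zero byte-wise and sets each GE flag exactly
    ; where the corresponding lr byte was zero, i.e. where every check
    ; passed. sel then picks 0xFF (r11) for those bytes and 0x00 (r12)
    ; elsewhere, materializing a per-pixel filter mask with no branches.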

    cmp         lr, #0
    beq         hskip_filter                ; skip filtering

    sub         src, src, pstep, lsl #1     ; move src pointer down by 6 lines

    ;vp8_hevmask() function
    ;calculate high edge variance
    orr         r10, r6, r8                 ; calculate vp8_hevmask

    ldr         r7, [src], pstep            ; p1

    usub8       r10, r12, r10               ; use usub8 instead of ssub8
    sel         r6, r12, r11                ; obtain vp8_hevmask: r6

    ;vp8_filter() function
    ldr         r8, [src], pstep            ; p0
    ldr         r12, c0x80808080
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev

    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    and         r7, r7, lr                  ; vp8_filter &= mask;

    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8, r7, r9                  ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7, r7, r10                 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    mov         r9, #0
    shadd8      r8, r8, r9                  ; Filter2 >>= 3
    shadd8      r7, r7, r9                  ; vp8_filter >>= 3
    shadd8      r8, r8, r9
    shadd8      r7, r7, r9
    shadd8      lr, r8, r9                  ; lr: Filter2
    shadd8      r7, r7, r9                  ; r7: filter
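
    ; shadd8 adds byte-wise and halves each signed result, so three
    ; halving adds with zero are a per-byte arithmetic shift right by 3.
    ; Net effect per byte, as in the scalar C filter:
    ;   Filter2    = vp8_signed_char_clamp(vp8_filter + 3) >> 3
    ;   vp8_filter = vp8_signed_char_clamp(vp8_filter + 4) >> 3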

    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
    ;sel        lr, r11, r9
    ;usub8      r8, r10, r8
    ;sel        r8, r11, r9
    ;and        r8, r8, lr                  ; -1 for each element that equals 4

    ;calculate output
    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)

    ;end of modification for vp8

    mov         lr, #0
    sadd8       r7, r7, r10                 ; vp8_filter += 1
    shadd8      r7, r7, lr                  ; vp8_filter >>= 1
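
    ; Outer-tap adjustment: per byte, vp8_filter = (vp8_filter + 1) >> 1;
    ; the bic below then clears it wherever hev is set, so only
    ; low-variance edges adjust p1/q1.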

    ldr         r11, [sp, #12]              ; load ps1
    ldr         r10, [sp, #8]               ; load qs1

    bic         r7, r7, r6                  ; vp8_filter &= ~hev
    sub         src, src, pstep, lsl #2

    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    qsub8       r10, r10, r7                ; u = vp8_signed_char_clamp(qs1 - vp8_filter)

    eor         r11, r11, r12               ; *op1 = u^0x80
    str         r11, [src], pstep           ; store op1
    eor         r9, r9, r12                 ; *op0 = u^0x80
    str         r9, [src], pstep            ; store op0 result
    eor         r8, r8, r12                 ; *oq0 = u^0x80
    str         r8, [src], pstep            ; store oq0 result
    eor         r10, r10, r12               ; *oq1 = u^0x80
    str         r10, [src], pstep           ; store oq1

    sub         src, src, pstep, lsl #1

|hskip_filter|
    add         src, src, #4
    sub         src, src, pstep, lsl #2

    subs        count, count, #1

    ;pld        [src]
    ;pld        [src, pstep]
    ;pld        [src, pstep, lsl #1]
    ;pld        [src, pstep, lsl #2]
    ;pld        [src, pstep, lsl #3]

    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1

    bne         Hnext8

    add         sp, sp, #16
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_horizontal_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_mbloop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r6, [sp, #36]               ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r9, [src], pstep            ; p3
    ldr         r4, [r2], #4                ; flimit
    ldr         r10, [src], pstep           ; p2
    ldr         r2, [r3], #4                ; limit
    ldr         r11, [src], pstep           ; p1
    uadd8       r4, r4, r4                  ; flimit * 2
    ldr         r3, [r6], #4                ; thresh
    mov         count, count, lsl #1        ; 4-in-parallel
    uadd8       r4, r4, r2                  ; flimit * 2 + limit

|MBHnext8|

    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ldr         r12, [src], pstep           ; p0

    uqsub8      r6, r9, r10                 ; p3 - p2
    uqsub8      r7, r10, r9                 ; p2 - p3
    uqsub8      r8, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2

    orr         r6, r6, r7                  ; abs (p3-p2)
    orr         r8, r8, r10                 ; abs (p2-p1)
    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r8, r8, r2                  ; compare to limit

    uqsub8      r6, r11, r12                ; p1 - p0
    orr         lr, lr, r8
    uqsub8      r7, r12, r11                ; p0 - p1
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1
    orr         r6, r6, r7                  ; abs (p1-p0)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
    orr         lr, lr, r7

    uqsub8      r6, r11, r10                ; p1 - q1
    uqsub8      r7, r10, r11                ; q1 - p1
    uqsub8      r11, r12, r9                ; p0 - q0
    uqsub8      r12, r9, r12                ; q0 - p0
    orr         r6, r6, r7                  ; abs (p1-q1)
    ldr         r7, c0x7F7F7F7F
    orr         r12, r11, r12               ; abs (p0-q0)
    ldr         r11, [src], pstep           ; q2
    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r7, r9, r10                 ; q0 - q1
    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r6, r10, r9                 ; q1 - q0
    uqsub8      r12, r12, r4                ; compare to flimit
    uqsub8      r9, r11, r10                ; q2 - q1

    orr         lr, lr, r12

    ldr         r12, [src], pstep           ; q3

    uqsub8      r10, r10, r11               ; q1 - q2
    orr         r6, r7, r6                  ; abs (q1-q0)
    orr         r10, r9, r10                ; abs (q2-q1)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r10, r10, r2                ; compare to limit
    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
    orr         lr, lr, r7
    orr         lr, lr, r10

    uqsub8      r10, r12, r11               ; q3 - q2
    uqsub8      r9, r11, r12                ; q2 - q3

    mvn         r11, #0                     ; r11 == -1

    orr         r10, r10, r9                ; abs (q3-q2)
    uqsub8      r10, r10, r2                ; compare to limit

    mov         r12, #0

    orr         lr, lr, r10

    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         mbhskip_filter              ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance
    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
    sub         src, src, pstep, lsl #1

    orr         r10, r6, r8
    ldr         r7, [src], pstep            ; p1

    usub8       r10, r12, r10
    sel         r6, r12, r11                ; hev mask: r6

    ;vp8_mbfilter() function
    ;p2, q2 are only needed at the end. Don't need to load them in now.
    ldr         r8, [src], pstep            ; p0
    ldr         r12, c0x80808080
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src]                  ; q1

    eor         r7, r7, r12                 ; ps1
    eor         r8, r8, r12                 ; ps0
    eor         r9, r9, r12                 ; qs0
    eor         r10, r10, r12               ; qs1

    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    str         r7, [sp, #12]               ; store ps1 temporarily
    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    str         r10, [sp, #8]               ; store qs1 temporarily
    qadd8       r7, r7, r12
    str         r9, [sp]                    ; store qs0 temporarily
    qadd8       r7, r7, r12
    str         r8, [sp, #4]                ; store ps0 temporarily
    qadd8       r7, r7, r12                 ; vp8_filter: r7

    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
    ldr         r9, c0x04040404

    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)

    mov         r12, r7                     ; Filter2: r12
    and         r12, r12, r6                ; Filter2 &= hev

    ;modify code for vp8
    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8, r12, r9                 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
    qadd8       r12, r12, r10               ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

    mov         r10, #0
    shadd8      r8, r8, r10                 ; Filter1 >>= 3
    shadd8      r12, r12, r10               ; Filter2 >>= 3
    shadd8      r8, r8, r10
    shadd8      r12, r12, r10
    shadd8      r8, r8, r10                 ; r8: Filter1
    shadd8      r12, r12, r10               ; r12: Filter2
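
    ; Per byte this matches the scalar C code:
    ;   Filter1 = vp8_signed_char_clamp(Filter2 + 4) >> 3
    ;   Filter2 = vp8_signed_char_clamp(Filter2 + 3) >> 3
    ; The +4/+3 pair gives the two sides of the edge complementary
    ; rounding, as the comment above notes.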

    ldr         r9, [sp]                    ; load qs0
    ldr         r11, [sp, #4]               ; load ps0

    qsub8       r9, r9, r8                  ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)

    ;save bottom 3 bits so that we round one side +4 and the other +3
    ;and        r8, r12, r10                ; s = Filter2 & 7 (s: r8)
    ;qadd8      r12, r12, r9                ; Filter2 = vp8_signed_char_clamp(Filter2+4)
    ;mov        r10, #0
    ;shadd8     r12, r12, r10               ; Filter2 >>= 3
    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
    ;sel        lr, r11, r10
    ;shadd8     r12, r12, r10
    ;usub8      r8, r9, r8
    ;sel        r8, r11, r10
    ;ldr        r9, [sp]                    ; load qs0
    ;ldr        r11, [sp, #4]               ; load ps0
    ;shadd8     r12, r12, r10
    ;and        r8, r8, lr                  ; -1 for each element that equals 4
    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
    ;qsub8      r9, r9, r12                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)

    ;end of modification for vp8

    bic         r12, r7, r6                 ; vp8_filter &= ~hev    ( r6 is free)
    ;mov        r12, r7

    ;roughly 3/7th difference across boundary
    mov         lr, #0x1b                   ; 27
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r7, r10, lr, r7
    smultb      r10, r10, lr
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    add         r10, r10, #63
    ssat        r7, #8, r7, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r7, r10, lsl #16
    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
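
    ; Each smlabb/smlatb + ssat ... asr #7 pair computes, per 16-bit lane,
    ; vp8_signed_char_clamp((Filter2 * 27 + 63) >> 7). The 18 and 9
    ; multipliers below are the 2/7 and 1/7 counterparts (27:18:9 = 3:2:1),
    ; tapering the adjustment with distance from the edge. (The smultb/add
    ; lane above is the same computation with the 63 added separately.)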

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
    eor         r8, r8, lr                  ; *oq0 = s^0x80
    str         r8, [src]                   ; store *oq0
    sub         src, src, pstep
    eor         r10, r10, lr                ; *op0 = s^0x80
    str         r10, [src]                  ; store *op0

    ;roughly 2/7th difference across boundary
    mov         lr, #0x12                   ; 18
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r9, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r9, #8, r9, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r9, r10, lsl #16

    ldr         r9, [sp, #8]                ; load qs1
    ldr         r11, [sp, #12]              ; load ps1

    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

    qadd8       r11, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
    eor         r11, r11, lr                ; *op1 = s^0x80
    str         r11, [src], pstep           ; store *op1
    eor         r8, r8, lr                  ; *oq1 = s^0x80
    add         src, src, pstep, lsl #1

    mov         r7, #0x3f                   ; 63

    str         r8, [src], pstep            ; store *oq1

    ;roughly 1/7th difference across boundary
    mov         lr, #0x9                    ; 9
    ldr         r9, [src]                   ; load q2

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r12, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r12, #8, r12, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r12, r10, lsl #16

    sub         src, src, pstep
    ldr         lr, c0x80808080

    ldr         r11, [src]                  ; load p2

    uxtb16      r6, r6
    uxtb16      r10, r10

    eor         r9, r9, lr
    eor         r11, r11, lr

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
    eor         r8, r8, lr                  ; *op2 = s^0x80
    str         r8, [src], pstep, lsl #2    ; store *op2
    add         src, src, pstep
    eor         r10, r10, lr                ; *oq2 = s^0x80
    str         r10, [src], pstep, lsl #1   ; store *oq2

|mbhskip_filter|
    add         src, src, #4
    sub         src, src, pstep, lsl #3
    subs        count, count, #1

    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1

    bne         MBHnext8

    add         sp, sp, #16
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_mbloop_filter_horizontal_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, #4                ; move src pointer down by 4
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
    ldr         r4, [r2], #4                ; flimit
    ldr         r7, [src], pstep
    ldr         r2, [r3], #4                ; limit
    ldr         r8, [src], pstep
    uadd8       r4, r4, r4                  ; flimit * 2
    ldr         r3, [r12], #4               ; thresh
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
    uadd8       r4, r4, r2                  ; flimit * 2 + limit

|Vnext8|

    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; transpose the source data for 4-in-parallel operation
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    uqsub8      r7, r9, r10                 ; p3 - p2
    uqsub8      r8, r10, r9                 ; p2 - p3
    uqsub8      r9, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2
    orr         r7, r7, r8                  ; abs (p3-p2)
    orr         r10, r9, r10                ; abs (p2-p1)
    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r10, r10, r2                ; compare to limit

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         lr, lr, r10

    uqsub8      r6, r11, r12                ; p1 - p0
    uqsub8      r7, r12, r11                ; p0 - p1
    add         src, src, #4                ; move src pointer up by 4
    orr         r6, r6, r7                  ; abs (p1-p0)
    str         r11, [sp, #12]              ; save p1
    uqsub8      r10, r6, r2                 ; compare to limit
    uqsub8      r11, r6, r3                 ; compare to thresh
    orr         lr, lr, r10

    ; transpose uses 8 regs (r6 - r12 and lr). Need to save reg value now
    ; transpose the source data for 4-in-parallel operation
    ldr         r6, [src], pstep            ; load source data
    str         r11, [sp]                   ; push r11 to stack
    ldr         r7, [src], pstep
    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
    ldr         r8, [src], pstep
    str         lr, [sp, #8]
    ldr         lr, [src], pstep

    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back (f)limit accumulator

    uqsub8      r6, r12, r11                ; q3 - q2
    uqsub8      r7, r11, r12                ; q2 - q3
    uqsub8      r12, r11, r10               ; q2 - q1
    uqsub8      r11, r10, r11               ; q1 - q2
    orr         r6, r6, r7                  ; abs (q3-q2)
    orr         r7, r12, r11                ; abs (q2-q1)
    uqsub8      r6, r6, r2                  ; compare to limit
    uqsub8      r7, r7, r2                  ; compare to limit
    ldr         r11, [sp, #4]               ; load back p0
    ldr         r12, [sp, #12]              ; load back p1
    orr         lr, lr, r6
    orr         lr, lr, r7

    uqsub8      r6, r11, r9                 ; p0 - q0
    uqsub8      r7, r9, r11                 ; q0 - p0
    uqsub8      r8, r12, r10                ; p1 - q1
    uqsub8      r11, r10, r12               ; q1 - p1
    orr         r6, r6, r7                  ; abs (p0-q0)
    ldr         r7, c0x7F7F7F7F
    orr         r8, r8, r11                 ; abs (p1-q1)
    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r11, r10, r9                ; q1 - q0
    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r12, r9, r10                ; q0 - q1
    uqsub8      r6, r6, r4                  ; compare to flimit

    orr         r9, r11, r12                ; abs (q1-q0)
    uqsub8      r8, r9, r2                  ; compare to limit
    uqsub8      r10, r9, r3                 ; compare to thresh
    orr         lr, lr, r6
    orr         lr, lr, r8

    mvn         r11, #0                     ; r11 == -1
    mov         r12, #0

    usub8       lr, r12, lr
    ldr         r9, [sp]                    ; load the compared result
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         vskip_filter                ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r9, r9, r10

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    usub8       r9, r12, r9
    sel         r6, r12, r11                ; hev mask: r6

    ;vp8_filter() function
    ; load source data to r6, r11, r12, lr
    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    pkhbt       r12, r7, r8, lsl #16

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    pkhbt       r11, r9, r10, lsl #16

    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    ; Transpose needs 8 regs (r6 - r12, and lr). Save r6 and lr first
    str         r6, [sp]
    str         lr, [sp, #4]

    pkhbt       r6, r7, r8, lsl #16
    pkhbt       lr, r9, r10, lsl #16
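
    ; For each row, ldrh picks up the two pixels left of the edge at
    ; [src, #-2] and the two at [src]; pkhbt joins the halfwords so one
    ; register holds that row's p1 p0 q0 q1 bytes. The transpose below
    ; then produces the packed p1, p0, q0, q1 vectors the filter needs.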

    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10

    ;load back hev_mask r6 and filter_mask lr
    ldr         r12, c0x80808080
    ldr         r6, [sp]
    ldr         lr, [sp, #4]

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev (r7 : filter)

    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    ;mvn        r11, #0                     ; r11 == -1

    and         r7, r7, lr                  ; vp8_filter &= mask

    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8, r7, r9                  ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7, r7, r10                 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    mov         r9, #0
    shadd8      r8, r8, r9                  ; Filter2 >>= 3
    shadd8      r7, r7, r9                  ; vp8_filter >>= 3
    shadd8      r8, r8, r9
    shadd8      r7, r7, r9
    shadd8      lr, r8, r9                  ; lr: Filter2
    shadd8      r7, r7, r9                  ; r7: filter

    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
    ;sel        lr, r11, r9
    ;usub8      r8, r10, r8
    ;sel        r8, r11, r9
    ;and        r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s

    ;calculate output
    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
    ;end of modification for vp8

    eor         r8, r8, r12
    eor         r9, r9, r12

    mov         lr, #0

    sadd8       r7, r7, r10
    shadd8      r7, r7, lr

    ldr         r10, [sp, #8]               ; load qs1
    ldr         r11, [sp, #12]              ; load ps1

    bic         r7, r7, r6                  ; r7: vp8_filter

    qsub8       r10, r10, r7                ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    eor         r10, r10, r12
    eor         r11, r11, r12

    sub         src, src, pstep, lsl #2

    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
    ;output is b0, b1, b2, b3
    ;b0: 03 02 01 00
    ;b1: 13 12 11 10
    ;b2: 23 22 21 20
    ;b3: 33 32 31 30
    ;    p1 p0 q0 q1
    ;   (a3 a2 a1 a0)
    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
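
    ; After this transpose each of r6, r7, r12, lr holds one output row as
    ; the byte sequence p1 p0 q0 q1; the strh/lsr pairs below write each
    ; row back as two halfwords straddling the edge at [src, #-2] and [src].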

    strh        r6, [src, #-2]              ; store the result
    mov         r6, r6, lsr #16
    strh        r6, [src], pstep

    strh        r7, [src, #-2]
    mov         r7, r7, lsr #16
    strh        r7, [src], pstep

    strh        r12, [src, #-2]
    mov         r12, r12, lsr #16
    strh        r12, [src], pstep

    strh        lr, [src, #-2]
    mov         lr, lr, lsr #16
    strh        lr, [src], pstep

|vskip_filter|
    sub         src, src, #4
    subs        count, count, #1

    ldrne       r6, [src], pstep            ; load source data
    ldrne       r7, [src], pstep
    ldrne       r8, [src], pstep
    ldrne       lr, [src], pstep

    bne         Vnext8

    add         sp, sp, #16

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_vertical_edge_armv6|



;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_mbloop_filter_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, #4                ; move src pointer down by 4
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
    ldr         r4, [r2], #4                ; flimit
    ldr         r7, [src], pstep
    ldr         r2, [r3], #4                ; limit
    ldr         r8, [src], pstep
    uadd8       r4, r4, r4                  ; flimit * 2
    ldr         r3, [r12], #4               ; thresh
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
    uadd8       r4, r4, r2                  ; flimit * 2 + limit

|MBVnext8|
    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; transpose the source data for 4-in-parallel operation
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    uqsub8      r7, r9, r10                 ; p3 - p2
    uqsub8      r8, r10, r9                 ; p2 - p3
    uqsub8      r9, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2
    orr         r7, r7, r8                  ; abs (p3-p2)
    orr         r10, r9, r10                ; abs (p2-p1)
    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r10, r10, r2                ; compare to limit

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         lr, lr, r10

    uqsub8      r6, r11, r12                ; p1 - p0
    uqsub8      r7, r12, r11                ; p0 - p1
    add         src, src, #4                ; move src pointer up by 4
    orr         r6, r6, r7                  ; abs (p1-p0)
    str         r11, [sp, #12]              ; save p1
    uqsub8      r10, r6, r2                 ; compare to limit
    uqsub8      r11, r6, r3                 ; compare to thresh
    orr         lr, lr, r10

    ; transpose uses 8 regs (r6 - r12 and lr). Need to save reg value now
    ; transpose the source data for 4-in-parallel operation
    ldr         r6, [src], pstep            ; load source data
    str         r11, [sp]                   ; push r11 to stack
    ldr         r7, [src], pstep
    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
    ldr         r8, [src], pstep
    str         lr, [sp, #8]
    ldr         lr, [src], pstep

    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back (f)limit accumulator

    uqsub8      r6, r12, r11                ; q3 - q2
    uqsub8      r7, r11, r12                ; q2 - q3
    uqsub8      r12, r11, r10               ; q2 - q1
    uqsub8      r11, r10, r11               ; q1 - q2
    orr         r6, r6, r7                  ; abs (q3-q2)
    orr         r7, r12, r11                ; abs (q2-q1)
    uqsub8      r6, r6, r2                  ; compare to limit
    uqsub8      r7, r7, r2                  ; compare to limit
    ldr         r11, [sp, #4]               ; load back p0
    ldr         r12, [sp, #12]              ; load back p1
    orr         lr, lr, r6
    orr         lr, lr, r7

    uqsub8      r6, r11, r9                 ; p0 - q0
    uqsub8      r7, r9, r11                 ; q0 - p0
    uqsub8      r8, r12, r10                ; p1 - q1
    uqsub8      r11, r10, r12               ; q1 - p1
    orr         r6, r6, r7                  ; abs (p0-q0)
    ldr         r7, c0x7F7F7F7F
    orr         r8, r8, r11                 ; abs (p1-q1)
    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r11, r10, r9                ; q1 - q0
    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r12, r9, r10                ; q0 - q1
    uqsub8      r6, r6, r4                  ; compare to flimit

    orr         r9, r11, r12                ; abs (q1-q0)
    uqsub8      r8, r9, r2                  ; compare to limit
    uqsub8      r10, r9, r3                 ; compare to thresh
    orr         lr, lr, r6
    orr         lr, lr, r8

    mvn         r11, #0                     ; r11 == -1
    mov         r12, #0

    usub8       lr, r12, lr
    ldr         r9, [sp]                    ; load the compared result
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         mbvskip_filter              ; skip filtering


    ;vp8_hevmask() function
    ;calculate high edge variance

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r9, r9, r10

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    usub8       r9, r12, r9
    sel         r6, r12, r11                ; hev mask: r6


    ; vp8_mbfilter() function
    ; p2, q2 are only needed at the end. Don't need to load them in now.
    ; Transpose needs 8 regs (r6 - r12, and lr). Save r6 and lr first
    ; load source data to r6, r11, r12, lr
    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    pkhbt       r12, r7, r8, lsl #16

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    pkhbt       r11, r9, r10, lsl #16

    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    str         r6, [sp]                    ; save r6
    str         lr, [sp, #4]                ; save lr

    pkhbt       r6, r7, r8, lsl #16
    pkhbt       lr, r9, r10, lsl #16

    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10

    ;load back hev_mask r6 and filter_mask lr
    ldr         r12, c0x80808080
    ldr         r6, [sp]
    ldr         lr, [sp, #4]

    eor         r7, r7, r12                 ; ps1
    eor         r8, r8, r12                 ; ps0
    eor         r9, r9, r12                 ; qs0
    eor         r10, r10, r12               ; qs1

    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    str         r7, [sp, #12]               ; store ps1 temporarily
    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    str         r10, [sp, #8]               ; store qs1 temporarily
    qadd8       r7, r7, r12
    str         r9, [sp]                    ; store qs0 temporarily
    qadd8       r7, r7, r12
    str         r8, [sp, #4]                ; store ps0 temporarily
    qadd8       r7, r7, r12                 ; vp8_filter: r7

    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
    ldr         r9, c0x04040404
    ;mvn        r11, #0                     ; r11 == -1

    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)

    mov         r12, r7                     ; Filter2: r12
    and         r12, r12, r6                ; Filter2 &= hev

    ;modify code for vp8
    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8, r12, r9                 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
    qadd8       r12, r12, r10               ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

    mov         r10, #0
    shadd8      r8, r8, r10                 ; Filter1 >>= 3
    shadd8      r12, r12, r10               ; Filter2 >>= 3
    shadd8      r8, r8, r10
    shadd8      r12, r12, r10
    shadd8      r8, r8, r10                 ; r8: Filter1
    shadd8      r12, r12, r10               ; r12: Filter2

    ldr         r9, [sp]                    ; load qs0
    ldr         r11, [sp, #4]               ; load ps0

    qsub8       r9, r9, r8                  ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)

    ;save bottom 3 bits so that we round one side +4 and the other +3
    ;and        r8, r12, r10                ; s = Filter2 & 7 (s: r8)
    ;qadd8      r12, r12, r9                ; Filter2 = vp8_signed_char_clamp(Filter2+4)
    ;mov        r10, #0
    ;shadd8     r12, r12, r10               ; Filter2 >>= 3
    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
    ;sel        lr, r11, r10
    ;shadd8     r12, r12, r10
    ;usub8      r8, r9, r8
    ;sel        r8, r11, r10
    ;ldr        r9, [sp]                    ; load qs0
    ;ldr        r11, [sp, #4]               ; load ps0
    ;shadd8     r12, r12, r10
    ;and        r8, r8, lr                  ; -1 for each element that equals 4
    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
    ;qsub8      r9, r9, r12                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)

    ;end of modification for vp8

    bic         r12, r7, r6                 ; vp8_filter &= ~hev    ( r6 is free)
    ;mov        r12, r7

    ;roughly 3/7th difference across boundary
    mov         lr, #0x1b                   ; 27
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r7, r10, lr, r7
    smultb      r10, r10, lr
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    add         r10, r10, #63
    ssat        r7, #8, r7, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r7, r10, lsl #16
    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
    eor         r8, r8, lr                  ; *oq0 = s^0x80
    eor         r10, r10, lr                ; *op0 = s^0x80

    strb        r10, [src, #-1]             ; store op0 result
    strb        r8, [src], pstep            ; store oq0 result
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep
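
    ; r10/r8 hold the four rows' op0/oq0 pixels packed one per byte; each
    ; strb writes the low byte and the lsr #8 steps to the next row's
    ; lane, walking down the column without another transpose.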

    ;roughly 2/7th difference across boundary
    mov         lr, #0x12                   ; 18
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r9, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r9, #8, r9, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r9, r10, lsl #16

    ldr         r9, [sp, #8]                ; load qs1
    ldr         r11, [sp, #12]              ; load ps1
    ldr         lr, c0x80808080

    uxtb16      r6, r6
    uxtb16      r10, r10

    add         src, src, #2

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
    eor         r8, r8, lr                  ; *oq1 = s^0x80
    eor         r10, r10, lr                ; *op1 = s^0x80

    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
    strb        r10, [src, #-4]             ; store op1
    strb        r8, [src, #-1]              ; store oq1
    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    orr         r11, r11, r6, lsl #8
    orr         r9, r9, r7, lsl #8

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    orr         r11, r11, r6, lsl #16
    orr         r9, r9, r7, lsl #16

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep
    orr         r11, r11, r6, lsl #24
    orr         r9, r9, r7, lsl #24

    ;roughly 1/7th difference across boundary
    eor         r9, r9, lr
    eor         r11, r11, lr

    mov         lr, #0x9                    ; 9
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r12, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r12, #8, r12, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r12, r10, lsl #16

    uxtb16      r6, r6
    uxtb16      r10, r10

    ldr         lr, c0x80808080

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
    eor         r8, r8, lr                  ; *op2 = s^0x80
    eor         r10, r10, lr                ; *oq2 = s^0x80

    strb        r8, [src, #-5]              ; store *op2
    strb        r10, [src], pstep           ; store *oq2
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep

    ;adjust src pointer for next loop
    sub         src, src, #2

|mbvskip_filter|
    sub         src, src, #4
    subs        count, count, #1

    ldrne       r6, [src], pstep            ; load source data
    ldrne       r7, [src], pstep
    ldrne       r8, [src], pstep
    ldrne       lr, [src], pstep

    bne         MBVnext8

    add         sp, sp, #16

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|

; Constant Pool
c0x80808080 DCD     0x80808080
c0x03030303 DCD     0x03030303
c0x04040404 DCD     0x04040404
c0x01010101 DCD     0x01010101
c0x7F7F7F7F DCD     0x7F7F7F7F
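
; These literals live in the same code section, so each ldr rX, c0x...
; above assembles to a PC-relative literal load.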

    END