Home | History | Annotate | Download | only in armv6
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|   ; entry point defined in this file
     13     EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|     ; entry point defined in this file
     14 
     15     AREA    |.text|, CODE, READONLY  ; read-only code section; also holds the literal constants at the end of the file
     16 
     17     MACRO
     18     TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
            ; Transpose a 4x4 byte matrix held in four 32-bit registers.
            ; Each input register is one row of four bytes; each output register
            ; receives one column: byte N of every row ends up in $bN, with row 0
            ; in the least significant byte and row 3 in the most significant.
            ;
            ; All four inputs are clobbered. The outputs are written while the
            ; inputs are still being read, so $b0-$b3 must not alias $a0-$a3.
     19     ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
     20     ; a0: 03 02 01 00
     21     ; a1: 13 12 11 10
     22     ; a2: 23 22 21 20
     23     ; a3: 33 32 31 30
     24     ;     b3 b2 b1 b0
            ; (digits are row,column; the rightmost pair is the least significant byte)
     25 
            ; even columns (0 and 2) of every row
     26     uxtb16      $b1, $a1                    ; xx 12 xx 10
     27     uxtb16      $b0, $a0                    ; xx 02 xx 00
     28     uxtb16      $b3, $a3                    ; xx 32 xx 30
     29     uxtb16      $b2, $a2                    ; xx 22 xx 20
     30     orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
     31     orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
     32 
            ; odd columns (1 and 3) of every row; the inputs are reused as scratch
     33     uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
     34     uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
     35     uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
     36     uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
     37     orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
     38     orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
     39 
            ; Combine halfwords into full columns. In the simple vertical filter
            ; in this file the columns are p1, p0, q0, q1, so $b0-$b3 come out as
            ; p1, p0, q0, q1 there.
     40     pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- column 2
     41     pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- column 0
     42 
     43     pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- column 3
     44     pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- column 1
     45     MEND
     46 
     47 
     48 src         RN  r0                          ; r0: pixel pointer (src_ptr argument)
     49 pstep       RN  r1                          ; r1: byte offset from one pixel row to the next (src_pixel_step argument)
     50 
     51 ;r0     unsigned char *src_ptr,
     52 ;r1     int src_pixel_step,
     53 ;r2     const char *flimit,
     54 ;r3     const char *limit,
     55 ;stack  const char *thresh,
     56 ;stack  int  count
     57 
     58 ; All 16 elements in flimit are equal. So, in the code, only one load is needed
     59 ; for flimit. Same applies to limit. thresh is not used in the simple loop filter.
     60 
     61 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
     62 |vp8_loop_filter_simple_horizontal_edge_armv6| PROC
     63 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
        ; Simple loop filter applied across a horizontal edge.
        ;
        ; Each pass handles four adjacent pixels: one 32-bit word is loaded from
        ; each of the rows p1, p0, q0, q1 (src - 2*pstep, src - pstep, src,
        ; src + pstep) and the four bytes of each word are filtered in parallel.
        ; Only the p0 and q0 rows are written back. The loop runs count * 2 times,
        ; advancing src by 4 bytes per pass.
        ;
        ; Register use after setup:
        ;   r2  = 0x80808080 (per-byte sign-bit mask)
        ;   r9  = remaining passes
        ;   r12 = flimit * 2 + limit in every byte
        ;   lr  = 0 (the return address is restored from the stack on exit)
        ;   r3, r4, r5, r6 = p1, p0, q0, q1 for the current four pixels
        ;
        ; NOTE(review): the loop body runs once before the counter is tested, so
        ; a count of 0 is not treated as "do nothing" - confirm callers never pass 0.
        ; NOTE(review): pixels are accessed with word ldr/str - presumably src and
        ; pstep keep these addresses 4-byte aligned; confirm against the caller.
     64     stmdb       sp!, {r4 - r11, lr}
     65 
     66     ldr         r12, [r3]                   ; limit (4 identical bytes)
     67     ldr         r3, [src, -pstep, lsl #1]   ; p1
     68     ldr         r4, [src, -pstep]           ; p0
     69     ldr         r5, [src]                   ; q0
     70     ldr         r6, [src, pstep]            ; q1
     71     ldr         r7, [r2]                    ; flimit (4 identical bytes)
     72     ldr         r2, c0x80808080
     73     ldr         r9, [sp, #40]               ; count for 8-in-parallel (2nd stacked arg: 36 bytes of saved regs + thresh slot)
     74     uadd8       r7, r7, r7                  ; flimit * 2 (per byte, wraps modulo 256)
     75     mov         r9, r9, lsl #1              ; double the count. we're doing 4 at a time
     76     uadd8       r12, r7, r12                ; flimit * 2 + limit
     77     mov         lr, #0                      ; need 0 in a couple places
     78 
     79 |simple_hnext8|
     80     ; vp8_simple_filter_mask()
            ; mask byte = 0xFF where abs(p0 - q0)*2 + abs(p1 - q1)/2 <= flimit*2 + limit
     81 
     82     uqsub8      r7, r3, r6                  ; p1 - q1 (saturates to 0 when negative)
     83     uqsub8      r8, r6, r3                  ; q1 - p1 (saturates to 0 when negative)
     84     uqsub8      r10, r4, r5                 ; p0 - q0
     85     uqsub8      r11, r5, r4                 ; q0 - p0
     86     orr         r8, r8, r7                  ; abs(p1 - q1): one of the two differences is 0
     87     orr         r10, r10, r11               ; abs(p0 - q0)
     88     uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
     89     uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
     90     uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
     91     mvn         r8, #0                      ; r8 = 0xFFFFFFFF
     92     usub8       r10, r12, r10               ; (flimit*2 + limit) - sum; usub8 sets GE per byte when there is no borrow
     93     sel         r10, r8, lr                 ; filter mask: 0xFF per byte where GE is set, else 0x00
     94     cmp         r10, #0
     95     beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
     96 
     97     ;vp8_simple_filter()
     98 
     99     eor         r3, r3, r2                  ; p1 offset to convert to a signed value
    100     eor         r6, r6, r2                  ; q1 offset to convert to a signed value
    101     eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    102     eor         r5, r5, r2                  ; q0 offset to convert to a signed value
    103 
    104     qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
    105     qsub8       r6, r5, r4                  ; q0 - p0
    106     qadd8       r3, r3, r6                  ; += q0 - p0
    107     ldr         r7, c0x04040404
    108     qadd8       r3, r3, r6                  ; += q0 - p0
    109     ldr         r8, c0x03030303
    110     qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
    111     ;STALL
    112     and         r3, r3, r10                 ; vp8_filter &= mask
    113 
    114     qadd8       r7 , r3 , r7                ; Filter1 = vp8_filter + 4
    115     qadd8       r8 , r3 , r8                ; Filter2 = vp8_filter + 3
    116 
            ; signed halving add with 0 is an arithmetic >> 1 on every byte;
            ; three rounds give >> 3
    117     shadd8      r7 , r7 , lr
    118     shadd8      r8 , r8 , lr
    119     shadd8      r7 , r7 , lr
    120     shadd8      r8 , r8 , lr
    121     shadd8      r7 , r7 , lr                ; Filter1 >>= 3
    122     shadd8      r8 , r8 , lr                ; Filter2 >>= 3
    123 
    124     qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
    125     qadd8       r4, r4, r8                  ; u = p0 + Filter2
    126     eor         r5, r5, r2                  ; *oq0 = u^0x80
    127     str         r5, [src]                   ; store oq0 result
    128     eor         r4, r4, r2                  ; *op0 = u^0x80
    129     str         r4, [src, -pstep]           ; store op0 result
    130 
    131 |simple_hskip_filter|
    132     subs        r9, r9, #1
    133     addne       src, src, #4                ; advance 4 pixels along the edge
    134 
            ; reload only when another pass follows
    135     ldrne       r3, [src, -pstep, lsl #1]   ; p1
    136     ldrne       r4, [src, -pstep]           ; p0
    137     ldrne       r5, [src]                   ; q0
    138     ldrne       r6, [src, pstep]            ; q1
    139 
    140     bne         simple_hnext8
    141 
    142     ldmia       sp!, {r4 - r11, pc}
    143     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|
    144 
    145 
    146 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    147 |vp8_loop_filter_simple_vertical_edge_armv6| PROC
    148 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
        ; Simple loop filter applied across a vertical edge.
        ;
        ; Each pass handles four consecutive rows. For every row the four bytes at
        ; src - 2 .. src + 1 (p1, p0, q0, q1) are loaded as two halfwords and packed
        ; into one word; TRANSPOSE_MATRIX then regroups the four rows so that
        ; r3, r4, r5, r6 hold p1, p0, q0, q1 with one row per byte. After filtering,
        ; p0 and q0 are written back one byte per row. The loop runs count * 2 times.
        ;
        ; Register use after setup:
        ;   r2  = 0x80808080 (per-byte sign-bit mask)
        ;   r11 = remaining passes
        ;   r12 = flimit * 2 + limit in every byte
        ;   lr  = filter mask inside the loop (the return address is restored
        ;         from the stack on exit)
        ;
        ; NOTE(review): the loop body runs once before the counter is tested, so
        ; a count of 0 is not treated as "do nothing" - confirm callers never pass 0.
    149     stmdb       sp!, {r4 - r11, lr}
    150 
    151     ldr         r12, [r2]                   ; r12: flimit
    152     ldr         r2, c0x80808080
    153     ldr         r7, [r3]                    ; limit
    154 
    155     ; load source data: two halfwords per row into r3-r6, then packed into r7, r8, r9, r10
    156     ldrh        r3, [src, #-2]              ; row 0: p1 p0
    157     ldrh        r4, [src], pstep            ; row 0: q0 q1, then src moves to the next row
    158     uadd8       r12, r12, r12               ; flimit * 2
    159 
    160     ldrh        r5, [src, #-2]
    161     ldrh        r6, [src], pstep
    162     uadd8       r12, r12, r7                ; flimit * 2 + limit
    163 
    164     pkhbt       r7, r3, r4, lsl #16         ; row 0 word: q1 q0 p0 p1 (p1 in the low byte)
    165 
    166     ldrh        r3, [src, #-2]
    167     ldrh        r4, [src], pstep
    168     ldr         r11, [sp, #40]              ; count (r11) for 8-in-parallel
    169 
    170     pkhbt       r8, r5, r6, lsl #16         ; row 1 word
    171 
    172     ldrh        r5, [src, #-2]
    173     ldrh        r6, [src], pstep
    174     mov         r11, r11, lsl #1            ; 4-in-parallel (count * 2 passes)
    175 
    176 |simple_vnext8|
    177     ; vp8_simple_filter_mask() function
    178     pkhbt       r9, r3, r4, lsl #16         ; row 2 word
    179     pkhbt       r10, r5, r6, lsl #16        ; row 3 word
    180 
    181     ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
            ; afterwards r3 = p1, r4 = p0, r5 = q0, r6 = q1; r7-r10 are clobbered
    182     TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
    183 
    184     uqsub8      r7, r3, r6                  ; p1 - q1
    185     uqsub8      r8, r6, r3                  ; q1 - p1
    186     uqsub8      r9, r4, r5                  ; p0 - q0
    187     uqsub8      r10, r5, r4                 ; q0 - p0
    188     orr         r7, r7, r8                  ; abs(p1 - q1)
    189     orr         r9, r9, r10                 ; abs(p0 - q0)
    190     mov         r8, #0                      ; r8 = 0, used as the zero operand until the next reload
    191     uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
    192     uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
    193     uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
    194     mvn         r10, #0                     ; r10 == -1
    195 
    196     usub8       r7, r12, r7                 ; (flimit*2 + limit) - sum; sets GE per byte when there is no borrow
    197     sel         lr, r10, r8                 ; filter mask: 0xFF per byte where GE is set, else 0x00
    198 
    199     cmp         lr, #0
    200     beq         simple_vskip_filter         ; skip filtering
    201 
    202     ;vp8_simple_filter() function
    203     eor         r3, r3, r2                  ; p1 offset to convert to a signed value
    204     eor         r6, r6, r2                  ; q1 offset to convert to a signed value
    205     eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    206     eor         r5, r5, r2                  ; q0 offset to convert to a signed value
    207 
    208     qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
    209     qsub8       r6, r5, r4                  ; q0 - p0
    210 
    211     qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
    212     ldr         r9, c0x03030303             ; r9 = 3
    213 
    214     qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
    215     ldr         r7, c0x04040404
    216 
    217     qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
    218     ;STALL
    219     and         r3, r3, lr                  ; vp8_filter &= mask
    220 
    221     qadd8       r9 , r3 , r9                ; Filter2 = vp8_filter + 3
    222     qadd8       r3 , r3 , r7                ; Filter1 = vp8_filter + 4
    223 
            ; signed halving add with r8 (0) is an arithmetic >> 1 on every byte;
            ; three rounds give >> 3
    224     shadd8      r9 , r9 , r8
    225     shadd8      r3 , r3 , r8
    226     shadd8      r9 , r9 , r8
    227     shadd8      r3 , r3 , r8
    228     shadd8      r9 , r9 , r8                ; Filter2 >>= 3
    229     shadd8      r3 , r3 , r8                ; Filter1 >>= 3
    230 
    231     ;calculate output
    232     sub         src, src, pstep, lsl #2     ; rewind src by the 4 rows the loads advanced it
    233 
    234     qadd8       r4, r4, r9                  ; u = p0 + Filter2
    235     qsub8       r5, r5, r3                  ; u = q0 - Filter1
    236     eor         r4, r4, r2                  ; *op0 = u^0x80
    237     eor         r5, r5, r2                  ; *oq0 = u^0x80
    238 
            ; write back one row at a time: low byte of r4 -> p0 at src - 1,
            ; low byte of r5 -> q0 at src, then shift the next row's byte down
    239     strb        r4, [src, #-1]              ; store the result
    240     mov         r4, r4, lsr #8
    241     strb        r5, [src], pstep
    242     mov         r5, r5, lsr #8
    243 
    244     strb        r4, [src, #-1]
    245     mov         r4, r4, lsr #8
    246     strb        r5, [src], pstep
    247     mov         r5, r5, lsr #8
    248 
    249     strb        r4, [src, #-1]
    250     mov         r4, r4, lsr #8
    251     strb        r5, [src], pstep
    252     mov         r5, r5, lsr #8
    253 
    254     strb        r4, [src, #-1]
    255     strb        r5, [src], pstep            ; src is again 4 rows past the start of this pass
    256 
    257 |simple_vskip_filter|
    258     subs        r11, r11, #1
    259 
    260     ; load source data for the next 4 rows (only when another pass follows); packed into r7, r8, r9, r10
    261     ldrneh      r3, [src, #-2]
    262     ldrneh      r4, [src], pstep
    263 
    264     ldrneh      r5, [src, #-2]
    265     ldrneh      r6, [src], pstep
    266 
    267     pkhbt       r7, r3, r4, lsl #16         ; unconditional: on the final pass this packs stale values that are never used
    268 
    269     ldrneh      r3, [src, #-2]
    270     ldrneh      r4, [src], pstep
    271 
    272     pkhbt       r8, r5, r6, lsl #16         ; unconditional, same as above
    273 
    274     ldrneh      r5, [src, #-2]
    275     ldrneh      r6, [src], pstep
    276 
    277     bne         simple_vnext8
    278 
    279     ldmia       sp!, {r4 - r11, pc}
    280     ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|
    281 
    282 ; Constant Pool: per-byte constants loaded by label from both filters
    283 c0x80808080 DCD     0x80808080              ; XORed with pixel words to flip the sign bit of every byte (unsigned <-> signed)
    284 c0x03030303 DCD     0x03030303              ; 3 in every byte: added to vp8_filter to form Filter2
    285 c0x04040404 DCD     0x04040404              ; 4 in every byte: added to vp8_filter to form Filter1
    286 
    287     END
    288