;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
    EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code

    MACRO
    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; a0: 03 02 01 00
    ; a1: 13 12 11 10
    ; a2: 23 22 21 20
    ; a3: 33 32 31 30
    ;     b3 b2 b1 b0

    uxtb16      $b1, $a1                    ; xx 12 xx 10
    uxtb16      $b0, $a0                    ; xx 02 xx 00
    uxtb16      $b3, $a3                    ; xx 32 xx 30
    uxtb16      $b2, $a2                    ; xx 22 xx 20
    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20

    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21

    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3

    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
    MEND
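
; The macro above is a 4x4 byte transpose: each input word holds four
; horizontally adjacent pixels of one row, and each output word holds one
; column (four vertically adjacent pixels). A rough C sketch of the same
; rearrangement (illustrative only, not part of the build):
;
;   /* in[r] holds row r as bytes {x0, x1, x2, x3}, lowest byte first */
;   void transpose_4x4_bytes(const unsigned char in[4][4],
;                            unsigned char out[4][4])
;   {
;       for (int r = 0; r < 4; r++)
;           for (int c = 0; c < 4; c++)
;               out[c][r] = in[r][c];
;   }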



src         RN  r0
pstep       RN  r1

;r0     unsigned char *src_ptr,
;r1     int src_pixel_step,
;r2     const char *blimit

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    ldrb        r12, [r2]                   ; blimit
    ldr         r3, [src, -pstep, lsl #1]   ; p1
    ldr         r4, [src, -pstep]           ; p0
    ldr         r5, [src]                   ; q0
    ldr         r6, [src, pstep]            ; q1
    orr         r12, r12, r12, lsl #8       ; duplicate blimit into bytes 0-1
    ldr         r2, c0x80808080
    orr         r12, r12, r12, lsl #16      ; blimit replicated in all 4 bytes
    mov         r9, #4                      ; 4 iterations of 4 pixels each (16-pixel edge)
    mov         lr, #0                      ; need 0 in a couple places

|simple_hnext8|
    ; vp8_simple_filter_mask()

    uqsub8      r7, r3, r6                  ; p1 - q1
    uqsub8      r8, r6, r3                  ; q1 - p1
    uqsub8      r10, r4, r5                 ; p0 - q0
    uqsub8      r11, r5, r4                 ; q0 - p0
    orr         r8, r8, r7                  ; abs(p1 - q1)
    orr         r10, r10, r11               ; abs(p0 - q0)
    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
    mvn         r8, #0
    usub8       r10, r12, r10               ; compare to blimit. usub8 sets GE flags
    sel         r10, r8, lr                 ; filter mask: F or 0
    cmp         r10, #0
    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00
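
    ; The test above matches the reference vp8_simple_filter_mask(), evaluated
    ; for four pixels at once with saturating byte arithmetic. A rough C sketch
    ; of the per-pixel test (illustrative only, not part of the build):
    ;
    ;   unsigned char simple_mask(unsigned char blimit, unsigned char p1,
    ;                             unsigned char p0, unsigned char q0,
    ;                             unsigned char q1)
    ;   {
    ;       int d = (p0 > q0 ? p0 - q0 : q0 - p0) * 2 +
    ;               (p1 > q1 ? p1 - q1 : q1 - p1) / 2;
    ;       return (d <= blimit) ? 0xff : 0x00;   /* 0xff = filter this pixel */
    ;   }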

    ;vp8_simple_filter()

    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    eor         r5, r5, r2                  ; q0 offset to convert to a signed value

    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
    qsub8       r6, r5, r4                  ; q0 - p0
    qadd8       r3, r3, r6                  ; += q0 - p0
    ldr         r7, c0x04040404
    qadd8       r3, r3, r6                  ; += q0 - p0
    ldr         r8, c0x03030303
    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0)
    ;STALL
    and         r3, r3, r10                 ; vp8_filter &= mask

    qadd8       r7, r3, r7                  ; Filter1 = vp8_filter + 4
    qadd8       r8, r3, r8                  ; Filter2 = vp8_filter + 3

    shadd8      r7, r7, lr
    shadd8      r8, r8, lr
    shadd8      r7, r7, lr
    shadd8      r8, r8, lr
    shadd8      r7, r7, lr                  ; Filter1 >>= 3
    shadd8      r8, r8, lr                  ; Filter2 >>= 3

    qsub8       r5, r5, r7                  ; u = q0 - Filter1
    qadd8       r4, r4, r8                  ; u = p0 + Filter2
    eor         r5, r5, r2                  ; *oq0 = u^0x80
    str         r5, [src]                   ; store oq0 result
    eor         r4, r4, r2                  ; *op0 = u^0x80
    str         r4, [src, -pstep]           ; store op0 result
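
    ; The block above parallels the reference vp8_simple_filter(): the filter
    ; value saturates to signed-byte range at every step (the q*8 ops), and the
    ; >>3 is done as three signed halving adds with zero. A rough C sketch
    ; (illustrative only; clamp_s8 is a stand-in for libvpx's signed-char clamp):
    ;
    ;   static signed char clamp_s8(int t)
    ;   {
    ;       return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
    ;   }
    ;
    ;   void simple_filter_px(signed char mask, unsigned char p1, unsigned char q1,
    ;                         unsigned char *op0, unsigned char *oq0)
    ;   {
    ;       signed char ps1 = (signed char)(p1 ^ 0x80),   qs1 = (signed char)(q1 ^ 0x80);
    ;       signed char ps0 = (signed char)(*op0 ^ 0x80), qs0 = (signed char)(*oq0 ^ 0x80);
    ;       signed char f = clamp_s8(ps1 - qs1);
    ;       f = clamp_s8(f + 3 * (qs0 - ps0));
    ;       f &= mask;
    ;       signed char Filter1 = clamp_s8(f + 4) >> 3;   /* rounds the q0 side */
    ;       signed char Filter2 = clamp_s8(f + 3) >> 3;   /* rounds the p0 side */
    ;       *oq0 = (unsigned char)(clamp_s8(qs0 - Filter1) ^ 0x80);
    ;       *op0 = (unsigned char)(clamp_s8(ps0 + Filter2) ^ 0x80);
    ;   }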

|simple_hskip_filter|
    subs        r9, r9, #1
    addne       src, src, #4                ; next 4 pixels

    ldrne       r3, [src, -pstep, lsl #1]   ; p1
    ldrne       r4, [src, -pstep]           ; p0
    ldrne       r5, [src]                   ; q0
    ldrne       r6, [src, pstep]            ; q1

    bne         simple_hnext8

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    ldrb        r12, [r2]                   ; r12: blimit
    ldr         r2, c0x80808080
    orr         r12, r12, r12, lsl #8       ; duplicate blimit into bytes 0-1

    ; load source data into r7, r8, r9, r10
    ldrh        r3, [src, #-2]
    pld         [src, #23]                  ; preload for next block
    ldrh        r4, [src], pstep
    orr         r12, r12, r12, lsl #16      ; blimit replicated in all 4 bytes

    ldrh        r5, [src, #-2]
    pld         [src, #23]
    ldrh        r6, [src], pstep

    pkhbt       r7, r3, r4, lsl #16

    ldrh        r3, [src, #-2]
    pld         [src, #23]
    ldrh        r4, [src], pstep

    pkhbt       r8, r5, r6, lsl #16

    ldrh        r5, [src, #-2]
    pld         [src, #23]
    ldrh        r6, [src], pstep
    mov         r11, #4                     ; 4 iterations of 4 rows each (16-row edge)

|simple_vnext8|
    ; vp8_simple_filter_mask() function
    pkhbt       r9, r3, r4, lsl #16
    pkhbt       r10, r5, r6, lsl #16

    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6

    uqsub8      r7, r3, r6                  ; p1 - q1
    uqsub8      r8, r6, r3                  ; q1 - p1
    uqsub8      r9, r4, r5                  ; p0 - q0
    uqsub8      r10, r5, r4                 ; q0 - p0
    orr         r7, r7, r8                  ; abs(p1 - q1)
    orr         r9, r9, r10                 ; abs(p0 - q0)
    mov         r8, #0
    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
    mvn         r10, #0                     ; r10 == -1

    usub8       r7, r12, r7                 ; compare to blimit
    sel         lr, r10, r8                 ; filter mask

    cmp         lr, #0
    beq         simple_vskip_filter         ; skip filtering

    ;vp8_simple_filter() function
    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
    eor         r5, r5, r2                  ; q0 offset to convert to a signed value

    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
    qsub8       r6, r5, r4                  ; q0 - p0

    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
    ldr         r9, c0x03030303             ; r9 = 3 in each byte

    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
    ldr         r7, c0x04040404

    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0)
    ;STALL
    and         r3, r3, lr                  ; vp8_filter &= mask

    qadd8       r9, r3, r9                  ; Filter2 = vp8_filter + 3
    qadd8       r3, r3, r7                  ; Filter1 = vp8_filter + 4

    shadd8      r9, r9, r8
    shadd8      r3, r3, r8
    shadd8      r9, r9, r8
    shadd8      r3, r3, r8
    shadd8      r9, r9, r8                  ; Filter2 >>= 3
    shadd8      r3, r3, r8                  ; Filter1 >>= 3

    ;calculate output
    sub         src, src, pstep, lsl #2     ; rewind src to the first of the 4 rows

    qadd8       r4, r4, r9                  ; u = p0 + Filter2
    qsub8       r5, r5, r3                  ; u = q0 - Filter1
    eor         r4, r4, r2                  ; *op0 = u^0x80
    eor         r5, r5, r2                  ; *oq0 = u^0x80

    strb        r4, [src, #-1]              ; store the result
    mov         r4, r4, lsr #8
    strb        r5, [src], pstep
    mov         r5, r5, lsr #8

    strb        r4, [src, #-1]
    mov         r4, r4, lsr #8
    strb        r5, [src], pstep
    mov         r5, r5, lsr #8

    strb        r4, [src, #-1]
    mov         r4, r4, lsr #8
    strb        r5, [src], pstep
    mov         r5, r5, lsr #8

    strb        r4, [src, #-1]
    strb        r5, [src], pstep
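
    ; The stores above write the transposed results back a byte at a time:
    ; each byte of r4/r5 is the filtered p0/q0 for one of the four rows
    ; (lowest byte = first row), so p0 goes to column -1 and q0 to column 0
    ; of each row, advancing src by one row per pair of stores.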

|simple_vskip_filter|
    subs        r11, r11, #1

    ; load source data into r7, r8, r9, r10
    ldrneh      r3, [src, #-2]
    pld         [src, #23]                  ; preload for next block
    ldrneh      r4, [src], pstep

    ldrneh      r5, [src, #-2]
    pld         [src, #23]
    ldrneh      r6, [src], pstep

    pkhbt       r7, r3, r4, lsl #16

    ldrneh      r3, [src, #-2]
    pld         [src, #23]
    ldrneh      r4, [src], pstep

    pkhbt       r8, r5, r6, lsl #16

    ldrneh      r5, [src, #-2]
    pld         [src, #23]
    ldrneh      r6, [src], pstep

    bne         simple_vnext8

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|

; Constant Pool
c0x80808080 DCD     0x80808080
c0x03030303 DCD     0x03030303
c0x04040404 DCD     0x04040404

    END