Home | History | Annotate | Download | only in armv6
      1 ;
      2 ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     EXPORT  |vp8_variance_halfpixvar16x16_v_armv6|   ; make the entry point visible to the linker
     13 
     14     ARM                 ; assemble the following code as ARM (not Thumb) instructions
     15     REQUIRE8            ; this code requires 8-byte stack alignment from its callers
     16     PRESERVE8           ; this code keeps the stack 8-byte aligned
     17 
     18     AREA ||.text||, CODE, READONLY, ALIGN=2   ; read-only code section, aligned to 2^2 = 4 bytes
     19 
     20 ; r0    unsigned char *src_ptr      (source block; rows 0..16 are read)
     21 ; r1    int source_stride           (byte step between source rows)
     22 ; r2    unsigned char *ref_ptr      (reference block; rows 0..15 are read)
     23 ; r3    int  recon_stride           (byte step between reference rows)
     24 ; stack unsigned int *sse           (receives the sum of squared differences)
        ;
        ; Purpose: variance of a 16x16 block against a reference, where the
        ; source is first interpolated half a pixel vertically. Each predicted
        ; pixel is the rounded average of a source pixel and the pixel one
        ; source_stride below it:  pred = (src[y][x] + src[y+1][x] + 1) >> 1.
        ; With diff = pred - ref accumulated over all 256 pixels:
        ;     sum = SUM(diff),  sse = SUM(diff * diff)
        ; Stores sse through the stack argument and returns, in r0,
        ;     sse - ((sum * sum) >> 8)
        ;
        ; Register roles inside the loop:
        ;     r0  = source row pointer        r2  = reference row pointer
        ;     r9  = pointer to next source row (r0 + r1)
        ;     r8  = signed running sum        r11 = running sse
        ;     r10 = 0x80808080                r12 = rows remaining
        ;     lr  = 0 (zero operand for sel / usad8)
        ;     r4-r7 = scratch
        ;
        ; NOTE(review): pixels are fetched four at a time with word loads (ldr);
        ; this presumably relies on the pointers being word aligned or on
        ; unaligned access being enabled - confirm against the callers.
     25 |vp8_variance_halfpixvar16x16_v_armv6| PROC
     26 
     27     stmfd   sp!, {r4-r12, lr}   ; save 10 registers (40 bytes) on the stack
     28 
     29     pld     [r0, r1, lsl #0]    ; preload source row 1 (r0 + source_stride)
     30     pld     [r2, r3, lsl #0]    ; preload reference row 1 (r2 + recon_stride)
     31 
     32     mov     r8, #0              ; initialize sum = 0
     33     ldr     r10, c80808080      ; r10 = 0x80 in every byte lane (used by the averaging trick)
     34     mov     r11, #0             ; initialize sse = 0
     35     mov     r12, #16            ; set loop counter to 16 (=block height)
     36     mov     lr, #0              ; constant zero
        ; One iteration handles one 16-pixel row as four groups of 4 pixels.
     37 loop
     38     add     r9, r0, r1          ; r9 = address of the next source row
     39     ; 1st 4 pixels
     40     ldr     r4, [r0, #0]        ; load 4 src pixels
     41     ldr     r6, [r9, #0]        ; load 4 src pixels from next row
     42     ldr     r5, [r2, #0]        ; load 4 ref pixels
     43 
     44     ; bilinear interpolation: per byte, r4 = (a + b + 1) >> 1 where
            ; a = current-row pixel and b = next-row pixel.
            ; ~b = 255 - b, so the halving subtract yields
            ; (a + b - 255) >> 1 = ((a + b + 1) >> 1) - 128 (mod 256);
            ; flipping bit 7 of each byte adds the 128 back.
     45     mvn     r6, r6              ; r6 = ~b in each byte lane
     46     uhsub8  r4, r4, r6          ; per byte: (a - ~b) >> 1
     47     eor     r4, r4, r10         ; per byte: + 128 -> rounded average
     48 
     49     usub8   r6, r4, r5          ; pred - ref per byte; sets the per-byte GE flags
     50     pld     [r0, r1, lsl #1]    ; preload the source row two rows ahead
     51     sel     r7, r6, lr          ; keep bytes where pred >= ref, others become 0
     52     usub8   r6, r5, r4          ; ref - pred per byte; sets the per-byte GE flags
     53     pld     [r2, r3, lsl #1]    ; preload the reference row two rows ahead
     54     sel     r6, r6, lr          ; keep bytes where ref >= pred, others become 0
     55 
     56     ; calculate partial sums (usad8 against zero adds up the four bytes)
     57     usad8   r4, r7, lr          ; calculate sum of positive differences
     58     usad8   r5, r6, lr          ; calculate sum of negative differences
     59     orr     r6, r6, r7          ; |diff| of all 4 pixels (at most one side is non-zero per byte)
     60     ; calculate total sum
            ; NOTE(review): the flag-setting forms below differ from the plain
            ; add/sub used by the other three groups; no instruction before the
            ; next subs/bne appears to consume these condition flags.
     61     adds    r8, r8, r4          ; add positive differences to sum
     62     subs    r8, r8, r5          ; subtract negative differences from sum
     63 
     64     ; calculate sse
     65     uxtb16  r5, r6              ; bytes 0 and 2 of |diff| widened to halfwords
     66     uxtb16  r7, r6, ror #8      ; bytes 1 and 3 of |diff| widened to halfwords
     67     smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2 (1)
     68 
     69     ; 2nd 4 pixels
     70     ldr     r4, [r0, #4]        ; load 4 src pixels
     71     ldr     r6, [r9, #4]        ; load 4 src pixels from next row
     72     ldr     r5, [r2, #4]        ; load 4 ref pixels
     73 
     74     ; bilinear interpolation (same rounded-average sequence as the 1st group)
     75     mvn     r6, r6
     76     uhsub8  r4, r4, r6
     77     eor     r4, r4, r10
     78 
            ; deferred from the previous group: r7 still holds its bytes 1 and 3
            ; (presumably placed here to overlap with the loads above)
     79     smlad   r11, r7, r7, r11    ; sse += d1*d1 + d3*d3 of the 1st group (2)
     80 
     81     usub8   r6, r4, r5          ; pred - ref per byte; sets the per-byte GE flags
     82     sel     r7, r6, lr          ; keep bytes where pred >= ref, others become 0
     83     usub8   r6, r5, r4          ; ref - pred per byte; sets the per-byte GE flags
     84     sel     r6, r6, lr          ; keep bytes where ref >= pred, others become 0
     85 
     86     ; calculate partial sums
     87     usad8   r4, r7, lr          ; calculate sum of positive differences
     88     usad8   r5, r6, lr          ; calculate sum of negative differences
     89     orr     r6, r6, r7          ; |diff| of all 4 pixels
     90 
     91     ; calculate total sum
     92     add     r8, r8, r4          ; add positive differences to sum
     93     sub     r8, r8, r5          ; subtract negative differences from sum
     94 
     95     ; calculate sse
     96     uxtb16  r5, r6              ; bytes 0 and 2 of |diff| widened to halfwords
     97     uxtb16  r7, r6, ror #8      ; bytes 1 and 3 of |diff| widened to halfwords
     98     smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2 (1)
     99 
    100     ; 3rd 4 pixels
    101     ldr     r4, [r0, #8]        ; load 4 src pixels
    102     ldr     r6, [r9, #8]        ; load 4 src pixels from next row
    103     ldr     r5, [r2, #8]        ; load 4 ref pixels
    104 
    105     ; bilinear interpolation (same rounded-average sequence as the 1st group)
    106     mvn     r6, r6
    107     uhsub8  r4, r4, r6
    108     eor     r4, r4, r10
    109 
    110     smlad   r11, r7, r7, r11    ; sse += d1*d1 + d3*d3 of the 2nd group (2)
    111 
    112     usub8   r6, r4, r5          ; pred - ref per byte; sets the per-byte GE flags
    113     sel     r7, r6, lr          ; keep bytes where pred >= ref, others become 0
    114     usub8   r6, r5, r4          ; ref - pred per byte; sets the per-byte GE flags
    115     sel     r6, r6, lr          ; keep bytes where ref >= pred, others become 0
    116 
    117     ; calculate partial sums
    118     usad8   r4, r7, lr          ; calculate sum of positive differences
    119     usad8   r5, r6, lr          ; calculate sum of negative differences
    120     orr     r6, r6, r7          ; |diff| of all 4 pixels
    121 
    122     ; calculate total sum
    123     add     r8, r8, r4          ; add positive differences to sum
    124     sub     r8, r8, r5          ; subtract negative differences from sum
    125 
    126     ; calculate sse
    127     uxtb16  r5, r6              ; bytes 0 and 2 of |diff| widened to halfwords
    128     uxtb16  r7, r6, ror #8      ; bytes 1 and 3 of |diff| widened to halfwords
    129     smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2 (1)
    130 
    131     ; 4th 4 pixels
    132     ldr     r4, [r0, #12]       ; load 4 src pixels
    133     ldr     r6, [r9, #12]       ; load 4 src pixels from next row
    134     ldr     r5, [r2, #12]       ; load 4 ref pixels
    135 
    136     ; bilinear interpolation (same rounded-average sequence as the 1st group)
    137     mvn     r6, r6
    138     uhsub8  r4, r4, r6
    139     eor     r4, r4, r10
    140 
    141     smlad   r11, r7, r7, r11    ; sse += d1*d1 + d3*d3 of the 3rd group (2)
    142 
            ; The row pointers are advanced here, in between the difference
            ; instructions; all loads for this row have already been issued.
    143     usub8   r6, r4, r5          ; pred - ref per byte; sets the per-byte GE flags
    144     add     r0, r0, r1          ; set src_ptr to next row
    145     sel     r7, r6, lr          ; keep bytes where pred >= ref, others become 0
    146     usub8   r6, r5, r4          ; ref - pred per byte; sets the per-byte GE flags
    147     add     r2, r2, r3          ; set ref_ptr to next row
    148     sel     r6, r6, lr          ; keep bytes where ref >= pred, others become 0
    149 
    150     ; calculate partial sums
    151     usad8   r4, r7, lr          ; calculate sum of positive differences
    152     usad8   r5, r6, lr          ; calculate sum of negative differences
    153     orr     r6, r6, r7          ; |diff| of all 4 pixels
    154 
    155     ; calculate total sum
    156     add     r8, r8, r4          ; add positive differences to sum
    157     sub     r8, r8, r5          ; subtract negative differences from sum
    158 
    159     ; calculate sse (both halves are accumulated here; no following group to defer into)
    160     uxtb16  r5, r6              ; bytes 0 and 2 of |diff| widened to halfwords
    161     uxtb16  r7, r6, ror #8      ; bytes 1 and 3 of |diff| widened to halfwords
    162     smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2 (1)
    163     smlad   r11, r7, r7, r11    ; sse += d1*d1 + d3*d3 (2)
    164 
    165 
    166     subs    r12, r12, #1        ; one row done; sets Z when no rows remain
    167 
    168     bne     loop                ; repeat until all 16 rows are processed
    169 
    170     ; return stuff
    171     ldr     r6, [sp, #40]       ; get address of sse (first stack argument, above the 40 saved bytes)
    172     mul     r0, r8, r8          ; sum * sum
    173     str     r11, [r6]           ; store sse
            ; |sum| <= 255 * 256, so sum * sum fits in 32 unsigned bits and a
            ; logical shift is the right one; >> 8 divides by the 256 pixels.
    174     sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
    175 
    176     ldmfd   sp!, {r4-r12, pc}   ; restore saved registers; loading pc returns to the caller
    177 
    178     ENDP
    179 
        ; Literal loaded into r10 by the routine above: 0x80 in each of the
        ; four byte lanes, XORed into the halved difference to finish the
        ; per-byte rounded average.
    180 c80808080
    181     DCD     0x80808080          ; one 32-bit word of data
    182 
    183     END                         ; end of this assembly source file
    184 
    185