;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance8x8_armv6|

    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; vp8_variance8x8_armv6
;
; Computes, over an 8x8 block of pixels:
;     sum = SUM(src[i] - ref[i])
;     sse = SUM((src[i] - ref[i])^2)
; stores sse through the pointer passed on the stack and returns
;     sse - ((sum * sum) >> 6)          ; 6 = log2(64 pixels)
;
; In:
;   r0    unsigned char *src_ptr
;   r1    int source_stride
;   r2    unsigned char *ref_ptr
;   r3    int  recon_stride
;   stack unsigned int *sse
; Out:
;   r0    sse - ((sum * sum) >> 6)
;   *sse  sum of squared differences
;
; Register roles inside the loop:
;   r4  = running signed sum of differences
;   r5  = running sum of squared differences
;   r12 = rows remaining
;   lr  = constant zero (second operand of sel / usad8)
;   r6-r10 = scratch
;
; r4-r10 and lr are saved on entry and restored on exit; r0-r3 and r12
; are overwritten.
;
; NOTE(review): pixels are fetched four at a time with word loads (ldr)
; at src_ptr/ref_ptr + 0 and + 4. If callers can pass pointers or
; strides that are not multiples of 4, this relies on unaligned word
; access being permitted on the target - confirm against the callers.
;-----------------------------------------------------------------------
|vp8_variance8x8_armv6| PROC

    push    {r4-r10, lr}
    mov     r12, #8             ; set loop counter to 8 (=block height)
    mov     r4, #0              ; initialize sum = 0
    mov     r5, #0              ; initialize sse = 0

loop
    ; 1st 4 pixels
    ldr     r6, [r0, #0x0]      ; load 4 src pixels
    ldr     r7, [r2, #0x0]      ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    ; usub8 sets the per-byte GE flags where the first operand's byte is
    ; >= the second's; sel then keeps the difference for those bytes and
    ; substitutes zero (lr) for the others.
    usub8   r8, r6, r7          ; src - ref, per byte
    sel     r10, r8, lr         ; keep bytes where src >= ref (positive diffs)
    usub8   r9, r7, r6          ; ref - src, per byte
    sel     r8, r9, lr          ; keep bytes where ref >= src (negative diffs, as magnitudes)

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; |difference| of all 4 pixels, one per byte
    ; calculate total sum
    add    r4, r4, r6           ; add positive differences to sum
    sub    r4, r4, r7           ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r8, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r5, r7, r7, r5      ; sse += square of each halfword in r7 (1)

    ; 2nd 4 pixels
    ldr     r6, [r0, #0x4]      ; load 4 src pixels
    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
    smlad   r5, r10, r10, r5    ; sse += square of each halfword in r10 (2)

    usub8   r8, r6, r7          ; src - ref, per byte
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r10, r8, lr         ; keep bytes where src >= ref (positive diffs)
    usub8   r9, r7, r6          ; ref - src, per byte
    add     r2, r2, r3          ; set ref_ptr to next row
    sel     r8, r9, lr          ; keep bytes where ref >= src (negative diffs, as magnitudes)

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; |difference| of all 4 pixels, one per byte

    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r8, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r5, r7, r7, r5      ; sse += square of each halfword in r7 (1)
    subs    r12, r12, #1        ; next row; sets Z for the bne below
    smlad   r5, r10, r10, r5    ; sse += square of each halfword in r10 (2)

    bne     loop

    ; return stuff
    ; 8 registers were pushed (8 * 4 = 32 bytes), so the first stack
    ; argument (sse pointer) now lives at sp + 32.
    ldr     r8, [sp, #32]       ; get address of sse
    mul     r1, r4, r4          ; sum * sum
    str     r5, [r8]            ; store sse
    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))

    pop     {r4-r10, pc}

    ENDP

    END