;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_sad16x16_ppc
    .globl vp8_sad16x8_ppc
    .globl vp8_sad8x16_ppc
    .globl vp8_sad8x8_ppc
    .globl vp8_sad4x4_ppc

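;# For reference, this is the scalar computation these routines vectorize,
;# written as a C sketch (illustrative only, not part of this file's build;
;# the name sad_c and its signature are hypothetical):
;#
;#   unsigned int sad_c(unsigned char *src, int src_stride,
;#                      unsigned char *ref, int ref_stride,
;#                      int w, int h) {
;#       unsigned int sad = 0;
;#       int i, j;
;#       for (i = 0; i < h; i++, src += src_stride, ref += ref_stride)
;#           for (j = 0; j < w; j++)
;#               sad += src[j] > ref[j] ? src[j] - ref[j] : ref[j] - src[j];
;#       return sad;
;#   }
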
.macro load_aligned_16 V R O
    lvsl    v3,  0, \R          ;# permute vector for alignment

    lvx     v1,  0, \R
    lvx     v2, \O, \R

    vperm   \V, v1, v2, v3
.endm
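
;# Note: load_aligned_16 is the standard AltiVec unaligned-load idiom. lvx
;# ignores the low four address bits, so the two lvx loads fetch the aligned
;# quadwords on either side of the target address, and vperm splices out the
;# 16 desired bytes using the shift mask produced by lvsl.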

.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffc0    ;# mark v0-v9 as in use
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# load offset and loop counter

    vspltisw v8, 0              ;# zero out total to start
.endm
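
;# The 32 bytes reserved by the prologue serve as a scratch buffer: each
;# routine stores its final vector sum there with stvx and reloads the word
;# at offset 12 (where vsumsws leaves the total) into r3 as the return value.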

.macro epilogue
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# restore old VRSAVE
.endm

.macro SAD_16
    ;# v6 = abs (v4 - v5)
    vsububs v6, v4, v5
    vsububs v7, v5, v4
    vor     v6, v6, v7

    ;# v8 += abs (v4 - v5)
    vsum4ubs v8, v6, v8
.endm
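
;# vsububs saturates at zero, so the two subtractions plus the vor yield the
;# byte-wise absolute difference without branching. vsum4ubs then adds the
;# four bytes of each word into the matching word of v8, keeping four running
;# partial sums that vsumsws folds together at the end of each loop.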
     54 
     55 .macro sad_16_loop loop_label
     56     lvsl    v3,  0, r5          ;# only needs to be done once per block
     57 
     58     ;# preload a line of data before getting into the loop
     59     lvx     v4, 0, r3
     60     lvx     v1,  0, r5
     61     lvx     v2, r10, r5
     62 
     63     add     r5, r5, r6
     64     add     r3, r3, r4
     65 
     66     vperm   v5, v1, v2, v3
     67 
     68     .align 4
     69 \loop_label:
     70     ;# compute difference on first row
     71     vsububs v6, v4, v5
     72     vsububs v7, v5, v4
     73 
     74     ;# load up next set of data
     75     lvx     v9, 0, r3
     76     lvx     v1,  0, r5
     77     lvx     v2, r10, r5
     78 
     79     ;# perform abs() of difference
     80     vor     v6, v6, v7
     81     add     r3, r3, r4
     82 
     83     ;# add to the running tally
     84     vsum4ubs v8, v6, v8
     85 
     86     ;# now onto the next line
     87     vperm   v5, v1, v2, v3
     88     add     r5, r5, r6
     89     lvx     v4, 0, r3
     90 
     91     ;# compute difference on second row
     92     vsububs v6, v9, v5
     93     lvx     v1,  0, r5
     94     vsububs v7, v5, v9
     95     lvx     v2, r10, r5
     96     vor     v6, v6, v7
     97     add     r3, r3, r4
     98     vsum4ubs v8, v6, v8
     99     vperm   v5, v1, v2, v3
    100     add     r5, r5, r6
    101 
    102     bdnz    \loop_label
    103 
    104     vspltisw v7, 0
    105 
    106     vsumsws v8, v8, v7
    107 
    108     stvx    v8, 0, r1
    109     lwz     r3, 12(r1)
    110 .endm
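
;# Because the loads are pipelined one row ahead, the final iteration fetches
;# one row past the end of both blocks; the fetched data is never used in the
;# sum.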

.macro sad_8_loop loop_label
    .align 4
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v6, r3, r10
    load_aligned_16 v7, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    vmrghb  v4, v4, v6
    vmrghb  v5, v5, v7

    SAD_16

    bdnz    \loop_label

    vspltisw v7, 0

    vsumsws v8, v8, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)
.endm
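
;# vmrghb interleaves the leading eight bytes of two consecutive rows into a
;# single 16-byte vector. The interleaving order is harmless here because src
;# and ref are merged identically, so SAD_16 sums two 8-pixel rows per
;# iteration and CTR is set to rows/2.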

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad16x16_ppc:

    prologue

    li      r9, 8               ;# 16 rows, two per loop iteration
    mtctr   r9

    sad_16_loop sad16x16_loop

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad16x8_ppc:

    prologue

    li      r9, 4               ;# 8 rows, two per loop iteration
    mtctr   r9

    sad_16_loop sad16x8_loop

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad8x16_ppc:

    prologue

    li      r9, 8               ;# 16 rows, two per loop iteration
    mtctr   r9

    sad_8_loop sad8x16_loop

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad8x8_ppc:

    prologue

    li      r9, 4               ;# 8 rows, two per loop iteration
    mtctr   r9

    sad_8_loop sad8x8_loop

    epilogue

    blr

;# copy a 4x4 block of bytes, four rows of stride \P starting at \I, into the
;# 16-byte scratch buffer on the stack so it can be read with a single lvx.
.macro transfer_4x4 I P
    lwz     r0, 0(\I)
    add     \I, \I, \P

    lwz     r7, 0(\I)
    add     \I, \I, \P

    lwz     r8, 0(\I)
    add     \I, \I, \P

    lwz     r9, 0(\I)

    stw     r0,  0(r1)
    stw     r7,  4(r1)
    stw     r8,  8(r1)
    stw     r9, 12(r1)
.endm
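
;# Gathering the rows this way turns the whole 4x4 SAD into one vector
;# absolute-difference and one reduction. In practice the lwz loads need not
;# be word-aligned; the AltiVec-era processors this code targets handle
;# unaligned integer loads in hardware.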

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad4x4_ppc:

    prologue

    transfer_4x4 r3, r4
    lvx     v4, 0, r1

    transfer_4x4 r5, r6
    lvx     v5, 0, r1

    vspltisw v8, 0              ;# zero out total to start

    ;# v6 = abs (v4 - v5)
    vsububs v6, v4, v5
    vsububs v7, v5, v4
    vor     v6, v6, v7

    ;# v8 += abs (v4 - v5)
    vsum4ubs v7, v6, v8
    vsumsws v7, v7, v8

    stvx    v7, 0, r1
    lwz     r3, 12(r1)

    epilogue

    blr