Home | History | Annotate | Download | only in ppc
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12     .globl vp8_get8x8var_ppc
     13     .globl vp8_get16x16var_ppc
     14     .globl vp8_mse16x16_ppc
     15     .globl vp8_variance16x16_ppc
     16     .globl vp8_variance16x8_ppc
     17     .globl vp8_variance8x16_ppc
     18     .globl vp8_variance8x8_ppc
     19     .globl vp8_variance4x4_ppc
     20 
     21 .macro load_aligned_16 V R O
     22     lvsl    v3,  0, \R          ;# permutate value for alignment
     23 
     24     lvx     v1,  0, \R
     25     lvx     v2, \O, \R
     26 
     27     vperm   \V, v1, v2, v3
     28 .endm
     29 
     30 .macro prologue
     31     mfspr   r11, 256            ;# get old VRSAVE
     32     oris    r12, r11, 0xffc0
     33     mtspr   256, r12            ;# set VRSAVE
     34 
     35     stwu    r1, -32(r1)         ;# create space on the stack
     36 
     37     li      r10, 16             ;# load offset and loop counter
     38 
     39     vspltisw v7, 0              ;# zero for merging
     40     vspltisw v8, 0              ;# zero out total to start
     41     vspltisw v9, 0              ;# zero out total for dif^2
     42 .endm
     43 
     44 .macro epilogue
     45     addi    r1, r1, 32          ;# recover stack
     46 
     47     mtspr   256, r11            ;# reset old VRSAVE
     48 .endm
     49 
     50 .macro compute_sum_sse
     51     ;# Compute sum first.  Unpack to so signed subract
     52     ;#  can be used.  Only have a half word signed
     53     ;#  subract.  Do high, then low.
     54     vmrghb  v2, v7, v4
     55     vmrghb  v3, v7, v5
     56     vsubshs v2, v2, v3
     57     vsum4shs v8, v2, v8
     58 
     59     vmrglb  v2, v7, v4
     60     vmrglb  v3, v7, v5
     61     vsubshs v2, v2, v3
     62     vsum4shs v8, v2, v8
     63 
     64     ;# Now compute sse.
     65     vsububs v2, v4, v5
     66     vsububs v3, v5, v4
     67     vor     v2, v2, v3
     68 
     69     vmsumubm v9, v2, v2, v9
     70 .endm
     71 
     72 .macro variance_16 DS loop_label store_sum
     73 \loop_label:
     74     ;# only one of the inputs should need to be aligned.
     75     load_aligned_16 v4, r3, r10
     76     load_aligned_16 v5, r5, r10
     77 
     78     ;# move onto the next line
     79     add     r3, r3, r4
     80     add     r5, r5, r6
     81 
     82     compute_sum_sse
     83 
     84     bdnz    \loop_label
     85 
     86     vsumsws v8, v8, v7
     87     vsumsws v9, v9, v7
     88 
     89     stvx    v8, 0, r1
     90     lwz     r3, 12(r1)
     91 
     92     stvx    v9, 0, r1
     93     lwz     r4, 12(r1)
     94 
     95 .if \store_sum
     96     stw     r3, 0(r8)           ;# sum
     97 .endif
     98     stw     r4, 0(r7)           ;# sse
     99 
    100     mullw   r3, r3, r3          ;# sum*sum
    101     srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
    102     subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
    103 .endm
    104 
    105 .macro variance_8 DS loop_label store_sum
    106 \loop_label:
    107     ;# only one of the inputs should need to be aligned.
    108     load_aligned_16 v4, r3, r10
    109     load_aligned_16 v5, r5, r10
    110 
    111     ;# move onto the next line
    112     add     r3, r3, r4
    113     add     r5, r5, r6
    114 
    115     ;# only one of the inputs should need to be aligned.
    116     load_aligned_16 v6, r3, r10
    117     load_aligned_16 v0, r5, r10
    118 
    119     ;# move onto the next line
    120     add     r3, r3, r4
    121     add     r5, r5, r6
    122 
    123     vmrghb  v4, v4, v6
    124     vmrghb  v5, v5, v0
    125 
    126     compute_sum_sse
    127 
    128     bdnz    \loop_label
    129 
    130     vsumsws v8, v8, v7
    131     vsumsws v9, v9, v7
    132 
    133     stvx    v8, 0, r1
    134     lwz     r3, 12(r1)
    135 
    136     stvx    v9, 0, r1
    137     lwz     r4, 12(r1)
    138 
    139 .if \store_sum
    140     stw     r3, 0(r8)           ;# sum
    141 .endif
    142     stw     r4, 0(r7)           ;# sse
    143 
    144     mullw   r3, r3, r3          ;# sum*sum
    145     srlwi   r3, r3, \DS         ;# (sum*sum) >> 8
    146     subf    r3, r3, r4          ;# sse - ((sum*sum) >> 8)
    147 .endm
    148 
    149     .align 2
    150 ;# r3 unsigned char *src_ptr
    151 ;# r4 int  source_stride
    152 ;# r5 unsigned char *ref_ptr
    153 ;# r6 int  recon_stride
    154 ;# r7 unsigned int *SSE
    155 ;# r8 int *Sum
    156 ;#
    157 ;# r3 return value
    158 vp8_get8x8var_ppc:
    159 
    160     prologue
    161 
    162     li      r9, 4
    163     mtctr   r9
    164 
    165     variance_8 6, get8x8var_loop, 1
    166 
    167     epilogue
    168 
    169     blr
    170 
    171     .align 2
    172 ;# r3 unsigned char *src_ptr
    173 ;# r4 int  source_stride
    174 ;# r5 unsigned char *ref_ptr
    175 ;# r6 int  recon_stride
    176 ;# r7 unsigned int *SSE
    177 ;# r8 int *Sum
    178 ;#
    179 ;# r3 return value
    180 vp8_get16x16var_ppc:
    181 
    182     prologue
    183 
    184     mtctr   r10
    185 
    186     variance_16 8, get16x16var_loop, 1
    187 
    188     epilogue
    189 
    190     blr
    191 
    192     .align 2
    193 ;# r3 unsigned char *src_ptr
    194 ;# r4 int  source_stride
    195 ;# r5 unsigned char *ref_ptr
    196 ;# r6 int  recon_stride
    197 ;# r7 unsigned int *sse
    198 ;#
    199 ;# r 3 return value
    200 vp8_mse16x16_ppc:
    201     prologue
    202 
    203     mtctr   r10
    204 
    205 mse16x16_loop:
    206     ;# only one of the inputs should need to be aligned.
    207     load_aligned_16 v4, r3, r10
    208     load_aligned_16 v5, r5, r10
    209 
    210     ;# move onto the next line
    211     add     r3, r3, r4
    212     add     r5, r5, r6
    213 
    214     ;# Now compute sse.
    215     vsububs v2, v4, v5
    216     vsububs v3, v5, v4
    217     vor     v2, v2, v3
    218 
    219     vmsumubm v9, v2, v2, v9
    220 
    221     bdnz    mse16x16_loop
    222 
    223     vsumsws v9, v9, v7
    224 
    225     stvx    v9, 0, r1
    226     lwz     r3, 12(r1)
    227 
    228     stvx    v9, 0, r1
    229     lwz     r3, 12(r1)
    230 
    231     stw     r3, 0(r7)           ;# sse
    232 
    233     epilogue
    234 
    235     blr
    236 
    237     .align 2
    238 ;# r3 unsigned char *src_ptr
    239 ;# r4 int  source_stride
    240 ;# r5 unsigned char *ref_ptr
    241 ;# r6 int  recon_stride
    242 ;# r7 unsigned int *sse
    243 ;#
    244 ;# r3 return value
    245 vp8_variance16x16_ppc:
    246 
    247     prologue
    248 
    249     mtctr   r10
    250 
    251     variance_16 8, variance16x16_loop, 0
    252 
    253     epilogue
    254 
    255     blr
    256 
    257     .align 2
    258 ;# r3 unsigned char *src_ptr
    259 ;# r4 int  source_stride
    260 ;# r5 unsigned char *ref_ptr
    261 ;# r6 int  recon_stride
    262 ;# r7 unsigned int *sse
    263 ;#
    264 ;# r3 return value
    265 vp8_variance16x8_ppc:
    266 
    267     prologue
    268 
    269     li      r9, 8
    270     mtctr   r9
    271 
    272     variance_16 7, variance16x8_loop, 0
    273 
    274     epilogue
    275 
    276     blr
    277 
    278     .align 2
    279 ;# r3 unsigned char *src_ptr
    280 ;# r4 int  source_stride
    281 ;# r5 unsigned char *ref_ptr
    282 ;# r6 int  recon_stride
    283 ;# r7 unsigned int *sse
    284 ;#
    285 ;# r3 return value
    286 vp8_variance8x16_ppc:
    287 
    288     prologue
    289 
    290     li      r9, 8
    291     mtctr   r9
    292 
    293     variance_8 7, variance8x16_loop, 0
    294 
    295     epilogue
    296 
    297     blr
    298 
    299     .align 2
    300 ;# r3 unsigned char *src_ptr
    301 ;# r4 int  source_stride
    302 ;# r5 unsigned char *ref_ptr
    303 ;# r6 int  recon_stride
    304 ;# r7 unsigned int *sse
    305 ;#
    306 ;# r3 return value
    307 vp8_variance8x8_ppc:
    308 
    309     prologue
    310 
    311     li      r9, 4
    312     mtctr   r9
    313 
    314     variance_8 6, variance8x8_loop, 0
    315 
    316     epilogue
    317 
    318     blr
    319 
    320 .macro transfer_4x4 I P
    321     lwz     r0, 0(\I)
    322     add     \I, \I, \P
    323 
    324     lwz     r10,0(\I)
    325     add     \I, \I, \P
    326 
    327     lwz     r8, 0(\I)
    328     add     \I, \I, \P
    329 
    330     lwz     r9, 0(\I)
    331 
    332     stw     r0,  0(r1)
    333     stw     r10, 4(r1)
    334     stw     r8,  8(r1)
    335     stw     r9, 12(r1)
    336 .endm
    337 
    338     .align 2
    339 ;# r3 unsigned char *src_ptr
    340 ;# r4 int  source_stride
    341 ;# r5 unsigned char *ref_ptr
    342 ;# r6 int  recon_stride
    343 ;# r7 unsigned int *sse
    344 ;#
    345 ;# r3 return value
    346 vp8_variance4x4_ppc:
    347 
    348     prologue
    349 
    350     transfer_4x4 r3, r4
    351     lvx     v4, 0, r1
    352 
    353     transfer_4x4 r5, r6
    354     lvx     v5, 0, r1
    355 
    356     compute_sum_sse
    357 
    358     vsumsws v8, v8, v7
    359     vsumsws v9, v9, v7
    360 
    361     stvx    v8, 0, r1
    362     lwz     r3, 12(r1)
    363 
    364     stvx    v9, 0, r1
    365     lwz     r4, 12(r1)
    366 
    367     stw     r4, 0(r7)           ;# sse
    368 
    369     mullw   r3, r3, r3          ;# sum*sum
    370     srlwi   r3, r3, 4           ;# (sum*sum) >> 4
    371     subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
    372 
    373     epilogue
    374 
    375     blr
    376