;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_get8x8var_ppc
    .globl vp8_get16x16var_ppc
    .globl vp8_mse16x16_ppc
    .globl vp8_variance16x16_ppc
    .globl vp8_variance16x8_ppc
    .globl vp8_variance8x16_ppc
    .globl vp8_variance8x8_ppc
    .globl vp8_variance4x4_ppc

.macro load_aligned_16 V R O
    lvsl    v3,  0, \R          ;# permute value for alignment

    lvx     v1,  0, \R
    lvx     v2, \O, \R

    vperm   \V, v1, v2, v3
.endm

.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# load offset and loop counter

    vspltisw v7, 0              ;# zero for merging
    vspltisw v8, 0              ;# zero out total to start
    vspltisw v9, 0              ;# zero out total for dif^2
.endm

.macro epilogue
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE
.endm

.macro compute_sum_sse
    ;# Compute sum first.  Unpack so that a signed subtract
    ;#  can be used; only a halfword signed subtract is
    ;#  available.  Do high, then low.
    vmrghb  v2, v7, v4
    vmrghb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    vmrglb  v2, v7, v4
    vmrglb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    ;# Now compute sse.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9
.endm

.macro variance_16 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm

.macro variance_8 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# only one of the inputs should need to be aligned.
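    ;# Each iteration handles two 8-wide rows: the second row pair is
    ;#  loaded here and the vmrghb pair below interleaves it with the
    ;#  first.  The same interleave is applied to source and reference,
    ;#  so the per-pixel differences (and hence sum and sse) are
    ;#  unchanged, and compute_sum_sse can work on full 16-byte vectors.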
    load_aligned_16 v6, r3, r10
    load_aligned_16 v0, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    vmrghb  v4, v4, v6
    vmrghb  v5, v5, v0

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
vp8_get8x8var_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, get8x8var_loop, 1

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
vp8_get16x16var_ppc:

    prologue

    mtctr   r10

    variance_16 8, get16x16var_loop, 1

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_mse16x16_ppc:
    prologue

    mtctr   r10

mse16x16_loop:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# Now compute sse.
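    ;# Absolute difference of unsigned bytes: saturating-subtract in
    ;#  both directions and OR the results; one direction saturates to
    ;#  zero, the other keeps |src - ref|.  vmsumubm then squares and
    ;#  accumulates the differences into v9.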
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9

    bdnz    mse16x16_loop

    vsumsws v9, v9, v7

    stvx    v9, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r3, 12(r1)

    stw     r3, 0(r7)           ;# sse

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance16x16_ppc:

    prologue

    mtctr   r10

    variance_16 8, variance16x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance16x8_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_16 7, variance16x8_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance8x16_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_8 7, variance8x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance8x8_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, variance8x8_loop, 0

    epilogue

    blr

.macro transfer_4x4 I P
    lwz     r0,  0(\I)
    add     \I, \I, \P

    lwz     r10, 0(\I)
    add     \I, \I, \P

    lwz     r8,  0(\I)
    add     \I, \I, \P

    lwz     r9,  0(\I)

    stw     r0,  0(r1)
    stw     r10, 4(r1)
    stw     r8,  8(r1)
    stw     r9, 12(r1)
.endm

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance4x4_ppc:

    prologue

    transfer_4x4 r3, r4
    lvx     v4, 0, r1

    transfer_4x4 r5, r6
    lvx     v5, 0, r1

    compute_sum_sse

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, 4           ;# (sum*sum) >> 4
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)

    epilogue

    blr
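
;# For reference: each variance routine accumulates the signed sum and the
;#  sum of squared differences (sse) over the block, writes sse (and, for
;#  the get*var entry points, the sum) through the pointer arguments, and
;#  returns sse - ((sum*sum) >> log2(w*h)).  vp8_mse16x16_ppc stores and
;#  returns only the sse.  A minimal C sketch of the same computation is
;#  kept below as a comment; the helper name variance_wxh and its parameter
;#  layout are illustrative only and not part of the VP8 API.
;#
;#  static unsigned int variance_wxh(const unsigned char *src, int src_stride,
;#                                   const unsigned char *ref, int ref_stride,
;#                                   int w, int h, int shift, /* log2(w*h) */
;#                                   unsigned int *sse)
;#  {
;#      int sum = 0;
;#      unsigned int sq = 0;
;#      for (int i = 0; i < h; i++, src += src_stride, ref += ref_stride) {
;#          for (int j = 0; j < w; j++) {
;#              const int d = src[j] - ref[j];   /* signed difference */
;#              sum += d;
;#              sq  += d * d;
;#          }
;#      }
;#      *sse = sq;
;#      /* widen sum*sum so large blocks cannot overflow 32-bit int */
;#      return sq - (unsigned int)(((long long)sum * sum) >> shift);
;#  }
;#
;#  e.g. vp8_variance8x8_ppc corresponds to variance_wxh(..., 8, 8, 6, sse),
;#  and vp8_variance16x16_ppc to variance_wxh(..., 16, 16, 8, sse).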