;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_sad16x16_ppc
    .globl vp8_sad16x8_ppc
    .globl vp8_sad8x16_ppc
    .globl vp8_sad8x8_ppc
    .globl vp8_sad4x4_ppc

.macro load_aligned_16 V R O
    lvsl    v3,  0, \R          ;# permute vector for alignment

    lvx     v1,  0, \R
    lvx     v2, \O, \R

    vperm   \V, v1, v2, v3
.endm

.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# load offset and loop counter

    vspltisw v8, 0              ;# zero out total to start
.endm

.macro epilogue
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE
.endm

.macro SAD_16
    ;# v6 = abs (v4 - v5)
    vsububs v6, v4, v5
    vsububs v7, v5, v4
    vor     v6, v6, v7

    ;# v8 += abs (v4 - v5)
    vsum4ubs v8, v6, v8
.endm

.macro sad_16_loop loop_label
    lvsl    v3,  0, r5          ;# only needs to be done once per block

    ;# preload a line of data before getting into the loop
    lvx     v4, 0, r3
    lvx     v1, 0, r5
    lvx     v2, r10, r5

    add     r5, r5, r6
    add     r3, r3, r4

    vperm   v5, v1, v2, v3

    .align 4
\loop_label:
    ;# compute difference on first row
    vsububs v6, v4, v5
    vsububs v7, v5, v4

    ;# load up next set of data
    lvx     v9, 0, r3
    lvx     v1, 0, r5
    lvx     v2, r10, r5

    ;# perform abs() of difference
    vor     v6, v6, v7
    add     r3, r3, r4

    ;# add to the running tally
    vsum4ubs v8, v6, v8

    ;# now onto the next line
    vperm   v5, v1, v2, v3
    add     r5, r5, r6
    lvx     v4, 0, r3

    ;# compute difference on second row
    vsububs v6, v9, v5
    lvx     v1, 0, r5
    vsububs v7, v5, v9
    lvx     v2, r10, r5
    vor     v6, v6, v7
    add     r3, r3, r4
    vsum4ubs v8, v6, v8
    vperm   v5, v1, v2, v3
    add     r5, r5, r6

    bdnz    \loop_label

    vspltisw v7, 0

    vsumsws v8, v8, v7          ;# collapse the four word partials to one total

    stvx    v8, 0, r1
    lwz     r3, 12(r1)          ;# vsumsws leaves the sum in the last word
.endm

.macro sad_8_loop loop_label
    .align 4
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v6, r3, r10
    load_aligned_16 v7, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# interleave the first 8 bytes of the two rows so a single SAD_16
    ;# covers both of them.
    vmrghb  v4, v4, v6
    vmrghb  v5, v5, v7

    SAD_16

    bdnz    \loop_label

    vspltisw v7, 0

    vsumsws v8, v8, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)
.endm

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad16x16_ppc:

    prologue

    li      r9, 8               ;# 8 iterations, 2 rows each
    mtctr   r9

    sad_16_loop sad16x16_loop

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad16x8_ppc:

    prologue

    li      r9, 4               ;# 4 iterations, 2 rows each
    mtctr   r9

    sad_16_loop sad16x8_loop

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad8x16_ppc:

    prologue

    li      r9, 8               ;# 8 iterations, 2 rows each
    mtctr   r9

    sad_8_loop sad8x16_loop

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad8x8_ppc:

    prologue

    li      r9, 4               ;# 4 iterations, 2 rows each
    mtctr   r9

    sad_8_loop sad8x8_loop

    epilogue

    blr

;# copy a 4x4 block to the stack so it can be read back as one vector
.macro transfer_4x4 I P
    lwz     r0, 0(\I)
    add     \I, \I, \P

    lwz     r7, 0(\I)
    add     \I, \I, \P

    lwz     r8, 0(\I)
    add     \I, \I, \P

    lwz     r9, 0(\I)

    stw     r0,  0(r1)
    stw     r7,  4(r1)
    stw     r8,  8(r1)
    stw     r9, 12(r1)
.endm

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad4x4_ppc:

    prologue

    transfer_4x4 r3, r4
    lvx     v4, 0, r1

    transfer_4x4 r5, r6
    lvx     v5, 0, r1

    vspltisw v8, 0              ;# zero out total to start

    ;# v6 = abs (v4 - v5)
    vsububs v6, v4, v5
    vsububs v7, v5, v4
    vor     v6, v6, v7

    ;# v7 = sum of abs (v4 - v5), accumulated into zeroed v8
    vsum4ubs v7, v6, v8
    vsumsws v7, v7, v8

    stvx    v7, 0, r1
    lwz     r3, 12(r1)

    epilogue

    blr
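
;# For reference, every entry point above computes the same scalar quantity.
;# A minimal C sketch follows (illustrative only, not part of this file's
;# build; the function name is hypothetical):
;#
;#     #include <stdlib.h>
;#
;#     unsigned int block_sad(const unsigned char *src, int src_stride,
;#                            const unsigned char *ref, int ref_stride,
;#                            int width, int height)
;#     {
;#         unsigned int sum = 0;
;#         int r, c;
;#
;#         for (r = 0; r < height; r++, src += src_stride, ref += ref_stride)
;#             for (c = 0; c < width; c++)
;#                 sum += abs(src[c] - ref[c]);
;#
;#         return sum;
;#     }
;#
;# The AltiVec routines get abs(src - ref) per byte by OR-ing the two
;# saturating subtractions (vsububs), accumulate 16 byte differences at a
;# time with vsum4ubs, and collapse the word partials with vsumsws.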