/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"

#include "vpx_dsp/ppc/types_vsx.h"

#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

/* Load 16 bytes each from a and b at the given byte offset, widen both to
 * 16-bit lanes, and accumulate |a - b| into the four 32-bit sums in v_sad. */
#define PROCESS16(offset)           \
  v_a = vec_vsx_ld(offset, a);      \
  v_b = vec_vsx_ld(offset, b);      \
  v_ah = unpack_to_s16_h(v_a);      \
  v_al = unpack_to_s16_l(v_a);      \
  v_bh = unpack_to_s16_h(v_b);      \
  v_bl = unpack_to_s16_l(v_b);      \
  v_subh = vec_sub(v_ah, v_bh);     \
  v_subl = vec_sub(v_al, v_bl);     \
  v_absh = vec_abs(v_subh);         \
  v_absl = vec_abs(v_subl);         \
  v_sad = vec_sum4s(v_absh, v_sad); \
  v_sad = vec_sum4s(v_absl, v_sad);

/* SAD16/SAD32/SAD64 emit vpx_sadWxH_vsx() for blocks 16, 32 and 64 pixels
 * wide: one, two or four 16-byte vectors per row. The four 32-bit partial
 * sums in v_sad are stored to memory and reduced to a scalar at the end. */
#define SAD16(height)                                                     \
  unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride,   \
                                        const uint8_t *b, int b_stride) { \
    int y;                                                                \
    unsigned int sad[4];                                                  \
    uint8x16_t v_a, v_b;                                                  \
    int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl;     \
    int32x4_t v_sad = vec_splat_s32(0);                                   \
                                                                          \
    for (y = 0; y < height; y++) {                                        \
      PROCESS16(0);                                                       \
                                                                          \
      a += a_stride;                                                      \
      b += b_stride;                                                      \
    }                                                                     \
    vec_vsx_st((uint32x4_t)v_sad, 0, sad);                                \
                                                                          \
    return sad[3] + sad[2] + sad[1] + sad[0];                             \
  }

#define SAD32(height)                                                     \
  unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride,   \
                                        const uint8_t *b, int b_stride) { \
    int y;                                                                \
    unsigned int sad[4];                                                  \
    uint8x16_t v_a, v_b;                                                  \
    int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl;     \
    int32x4_t v_sad = vec_splat_s32(0);                                   \
                                                                          \
    for (y = 0; y < height; y++) {                                        \
      PROCESS16(0);                                                       \
      PROCESS16(16);                                                      \
                                                                          \
      a += a_stride;                                                      \
      b += b_stride;                                                      \
    }                                                                     \
    vec_vsx_st((uint32x4_t)v_sad, 0, sad);                                \
                                                                          \
    return sad[3] + sad[2] + sad[1] + sad[0];                             \
  }

#define SAD64(height)                                                     \
  unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride,   \
                                        const uint8_t *b, int b_stride) { \
    int y;                                                                \
    unsigned int sad[4];                                                  \
    uint8x16_t v_a, v_b;                                                  \
    int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl;     \
    int32x4_t v_sad = vec_splat_s32(0);                                   \
                                                                          \
    for (y = 0; y < height; y++) {                                        \
      PROCESS16(0);                                                       \
      PROCESS16(16);                                                      \
      PROCESS16(32);                                                      \
      PROCESS16(48);                                                      \
                                                                          \
      a += a_stride;                                                      \
      b += b_stride;                                                      \
    }                                                                     \
    vec_vsx_st((uint32x4_t)v_sad, 0, sad);                                \
                                                                          \
    return sad[3] + sad[2] + sad[1] + sad[0];                             \
  }

SAD16(8);
SAD16(16);
SAD16(32);
SAD32(16);
SAD32(32);
SAD32(64);
SAD64(32);
SAD64(64);

/* The _avg variants first average second_pred into ref (via
 * vpx_comp_avg_pred_vsx) and then run the plain SAD kernel on the result. */
#define SAD16AVG(height)                                                      \
  unsigned int vpx_sad16x##height##_avg_vsx(                                  \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                           \
    DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * height]);                     \
    vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref,            \
                          ref_stride);                                        \
                                                                              \
    return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16);          \
  }

#define SAD32AVG(height)                                                      \
  unsigned int vpx_sad32x##height##_avg_vsx(                                  \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                           \
    DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * height]);                     \
    vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref,            \
                          ref_stride);                                        \
                                                                              \
    return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32);          \
  }

#define SAD64AVG(height)                                                      \
  unsigned int vpx_sad64x##height##_avg_vsx(                                  \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                           \
    DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * height]);                     \
    vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref,            \
                          ref_stride);                                        \
    return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64);          \
  }

SAD16AVG(8);
SAD16AVG(16);
SAD16AVG(32);
SAD32AVG(16);
SAD32AVG(32);
SAD32AVG(64);
SAD64AVG(32);
SAD64AVG(64);

/* The x4d variants compare one source block against four reference blocks.
 * Each source row is unpacked once (UNPACK_SRC) and reused for all four
 * references (PROCESS16_4D). */
#define PROCESS16_4D(offset, ref, v_h, v_l) \
  v_b = vec_vsx_ld(offset, ref);            \
  v_bh = unpack_to_s16_h(v_b);              \
  v_bl = unpack_to_s16_l(v_b);              \
  v_subh = vec_sub(v_h, v_bh);              \
  v_subl = vec_sub(v_l, v_bl);              \
  v_absh = vec_abs(v_subh);                 \
  v_absl = vec_abs(v_subl);                 \
  v_sad = vec_sum4s(v_absh, v_sad);         \
  v_sad = vec_sum4s(v_absl, v_sad);

#define UNPACK_SRC(offset, srcv_h, srcv_l) \
  v_a = vec_vsx_ld(offset, src);           \
  srcv_h = unpack_to_s16_h(v_a);           \
  srcv_l = unpack_to_s16_l(v_a);

#define SAD16_4D(height)                                                  \
  void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride,    \
                                   const uint8_t *const ref_array[],      \
                                   int ref_stride, uint32_t *sad_array) { \
    int i;                                                                \
    int y;                                                                \
    unsigned int sad[4];                                                  \
    uint8x16_t v_a, v_b;                                                  \
    int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl;     \
                                                                          \
    for (i = 0; i < 4; i++) sad_array[i] = 0;                             \
                                                                          \
    for (y = 0; y < height; y++) {                                        \
      UNPACK_SRC(y * src_stride, v_ah, v_al);                             \
      for (i = 0; i < 4; i++) {                                           \
        int32x4_t v_sad = vec_splat_s32(0);                               \
        PROCESS16_4D(y * ref_stride, ref_array[i], v_ah, v_al);           \
                                                                          \
        vec_vsx_st((uint32x4_t)v_sad, 0, sad);                            \
        sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]);              \
      }                                                                   \
    }                                                                     \
  }

#define SAD32_4D(height)                                                  \
  void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride,    \
                                   const uint8_t *const ref_array[],      \
                                   int ref_stride, uint32_t *sad_array) { \
    int i;                                                                \
    int y;                                                                \
    unsigned int sad[4];                                                  \
    uint8x16_t v_a, v_b;                                                  \
    int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl;                     \
    int16x8_t v_absh, v_absl, v_subh, v_subl;                             \
                                                                          \
    for (i = 0; i < 4; i++) sad_array[i] = 0;                             \
                                                                          \
    for (y = 0; y < height; y++) {                                        \
      UNPACK_SRC(y * src_stride, v_ah1, v_al1);                           \
      UNPACK_SRC(y * src_stride + 16, v_ah2, v_al2);                      \
      for (i = 0; i < 4; i++) {                                           \
        int32x4_t v_sad = vec_splat_s32(0);                               \
        PROCESS16_4D(y * ref_stride, ref_array[i], v_ah1, v_al1);         \
        PROCESS16_4D(y * ref_stride + 16, ref_array[i], v_ah2, v_al2);    \
                                                                          \
        vec_vsx_st((uint32x4_t)v_sad, 0, sad);                            \
        sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]);              \
      }                                                                   \
    }                                                                     \
  }

#define SAD64_4D(height)                                                  \
  void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride,    \
                                   const uint8_t *const ref_array[],      \
                                   int ref_stride, uint32_t *sad_array) { \
    int i;                                                                \
    int y;                                                                \
    unsigned int sad[4];                                                  \
    uint8x16_t v_a, v_b;                                                  \
    int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl;                     \
    int16x8_t v_ah3, v_al3, v_ah4, v_al4;                                 \
    int16x8_t v_absh, v_absl, v_subh, v_subl;                             \
                                                                          \
    for (i = 0; i < 4; i++) sad_array[i] = 0;                             \
                                                                          \
    for (y = 0; y < height; y++) {                                        \
      UNPACK_SRC(y * src_stride, v_ah1, v_al1);                           \
      UNPACK_SRC(y * src_stride + 16, v_ah2, v_al2);                      \
      UNPACK_SRC(y * src_stride + 32, v_ah3, v_al3);                      \
      UNPACK_SRC(y * src_stride + 48, v_ah4, v_al4);                      \
      for (i = 0; i < 4; i++) {                                           \
        int32x4_t v_sad = vec_splat_s32(0);                               \
        PROCESS16_4D(y * ref_stride, ref_array[i], v_ah1, v_al1);         \
        PROCESS16_4D(y * ref_stride + 16, ref_array[i], v_ah2, v_al2);    \
        PROCESS16_4D(y * ref_stride + 32, ref_array[i], v_ah3, v_al3);    \
        PROCESS16_4D(y * ref_stride + 48, ref_array[i], v_ah4, v_al4);    \
                                                                          \
        vec_vsx_st((uint32x4_t)v_sad, 0, sad);                            \
        sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]);              \
      }                                                                   \
    }                                                                     \
  }

SAD16_4D(8);
SAD16_4D(16);
SAD16_4D(32);
SAD32_4D(16);
SAD32_4D(32);
SAD32_4D(64);
SAD64_4D(32);
SAD64_4D(64);
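/* Illustrative reference only, not part of the upstream file: a minimal
 * scalar sketch of what every kernel above computes -- the sum of absolute
 * differences over a width x height block. The function name sad_ref_c is
 * hypothetical; a sketch like this is handy for cross-checking the VSX
 * output against plain C. Kept under #if 0 so it does not affect the build.
 */
#if 0
static unsigned int sad_ref_c(const uint8_t *a, int a_stride, const uint8_t *b,
                              int b_stride, int width, int height) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
    a += a_stride;
    b += b_stride;
  }
  return sad;
}
#endif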