/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>

#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"

void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
  u0 = _mm_setzero_si128();
  // Row 0
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff0 = _mm_max_epi16(diff, negdiff);
  // Row 1
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
  // Row 2
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 3
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 4
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 5
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 6
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  // Row 7
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
  diff = _mm_subs_epi16(s0, d0);
  negdiff = _mm_subs_epi16(u0, diff);
  absdiff = _mm_max_epi16(diff, negdiff);
  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);

  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}
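
/* The kernel above widens each row to 16-bit lanes, forms the absolute
 * difference as max(diff, 0 - diff), and finally folds the running
 * min/max vectors in half repeatedly (8 lanes -> 4 -> 2 -> 1) so the
 * result lands in lane 0. A minimal scalar sketch of the same
 * computation (hypothetical reference, for illustration only):
 *
 *   int r, c, v, mn = 255, mx = 0;
 *   for (r = 0; r < 8; ++r) {
 *     for (c = 0; c < 8; ++c) {
 *       v = abs(s[r * p + c] - d[r * dp + c]);
 *       if (v < mn) mn = v;
 *       if (v > mx) mx = v;
 *     }
 *   }
 *   *min = mn;
 *   *max = mx;
 */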

// Average of an 8x8 pixel block: accumulate the eight rows into 16-bit
// lane sums, fold horizontally, then round (+32) and divide by 64.
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}

// Same scheme for a 4x4 block: only the low four lanes carry block
// pixels, so the fold combines just those; round (+8) and divide by 16.
unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}

static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}
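
/* hadamard_col8_sse2 above runs the three butterfly stages of an
 * 8-point Hadamard transform across the eight column lanes of in[0..7]
 * (the basic butterfly being b0 = a0 + a1, b1 = a0 - a1). On the first
 * pass (iter == 0) it also transposes the 8x8 block of 16-bit values
 * via the unpack lo/hi ladder, so vp9_hadamard_8x8_sse2 below simply
 * calls it twice to transform both dimensions. The rows come out in a
 * permuted but consistent order (note the b7/b3/b4 assignments), which
 * is harmless for SATD-style consumers that only sum absolute
 * coefficient values.
 */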

void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  _mm_store_si128((__m128i *)coeff, src[0]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[1]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[2]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[3]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[4]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[5]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[6]);
  coeff += 8;
  _mm_store_si128((__m128i *)coeff, src[7]);
}

void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                + (idx & 0x01) * 8;
    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    _mm_store_si128((__m128i *)coeff, coeff0);
    _mm_store_si128((__m128i *)(coeff + 64), coeff1);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
    _mm_store_si128((__m128i *)(coeff + 192), coeff3);

    coeff += 8;
  }
}
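
/* vp9_hadamard_16x16_sse2 above combines four 8x8 Hadamard transforms
 * with one extra butterfly stage across the sub-blocks; the pre-shift
 * by 1 normalizes that stage and keeps the intermediates within int16
 * range. vp9_satd_sse2 below sums absolute coefficient values, forming
 * abs() on 16-bit lanes from the arithmetic-shift sign mask:
 * sign = x >> 15, abs(x) = (x ^ sign) - sign. For example, x = -5
 * gives sign = -1 (0xFFFF), x ^ sign = 4, and 4 - (-1) = 5.
 */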

int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
  int i;
  __m128i sum = _mm_load_si128((const __m128i *)coeff);
  __m128i sign = _mm_srai_epi16(sum, 15);
  __m128i val = _mm_xor_si128(sum, sign);
  sum = _mm_sub_epi16(val, sign);
  coeff += 8;

  for (i = 8; i < length; i += 8) {
    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
    sign = _mm_srai_epi16(src_line, 15);
    val = _mm_xor_si128(src_line, sign);
    val = _mm_sub_epi16(val, sign);
    sum = _mm_add_epi16(sum, val);
    coeff += 8;
  }

  val = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, val);
  val = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, val);

  return _mm_extract_epi16(sum, 0);
}

// Integral projection of rows: per-column sums of a 16-pixel-wide strip
// accumulated over `height` rows, scaled down by height / 2 (heights of
// 16, 32, and 64 are expected).
void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  int idx;
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
  __m128i t0, t1;
  int height_1 = height - 1;
  ref += ref_stride;

  for (idx = 1; idx < height_1; idx += 2) {
    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;

    src_line = _mm_loadu_si128((const __m128i *)ref);
    t0 = _mm_unpacklo_epi8(src_line, zero);
    t1 = _mm_unpackhi_epi8(src_line, zero);
    s0 = _mm_adds_epu16(s0, t0);
    s1 = _mm_adds_epu16(s1, t1);
    ref += ref_stride;
  }

  src_line = _mm_loadu_si128((const __m128i *)ref);
  t0 = _mm_unpacklo_epi8(src_line, zero);
  t1 = _mm_unpackhi_epi8(src_line, zero);
  s0 = _mm_adds_epu16(s0, t0);
  s1 = _mm_adds_epu16(s1, t1);

  if (height == 64) {
    s0 = _mm_srai_epi16(s0, 5);
    s1 = _mm_srai_epi16(s1, 5);
  } else if (height == 32) {
    s0 = _mm_srai_epi16(s0, 4);
    s1 = _mm_srai_epi16(s1, 4);
  } else {
    s0 = _mm_srai_epi16(s0, 3);
    s1 = _mm_srai_epi16(s1, 3);
  }

  _mm_storeu_si128((__m128i *)hbuf, s0);
  hbuf += 8;
  _mm_storeu_si128((__m128i *)hbuf, s1);
}

// Integral projection of a single row: sum of `width` pixels, using
// _mm_sad_epu8 against zero to add 16 bytes at a time.
int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
  __m128i zero = _mm_setzero_si128();
  __m128i src_line = _mm_load_si128((const __m128i *)ref);
  __m128i s0 = _mm_sad_epu8(src_line, zero);
  __m128i s1;
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    src_line = _mm_load_si128((const __m128i *)ref);
    s1 = _mm_sad_epu8(src_line, zero);
    s0 = _mm_adds_epu16(s0, s1);
  }

  s1 = _mm_srli_si128(s0, 8);
  s0 = _mm_adds_epu16(s0, s1);

  return _mm_extract_epi16(s0, 0);
}
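
/* vp9_vector_var_sse2 below computes a scaled variance of the
 * difference vector d = ref - src over N = 4 << bwl elements. It
 * returns sse - sum^2 / N, where the final shift by (bwl + 2) is the
 * division by N; algebraically this equals N * variance(d). Note the
 * local variable `mean` actually holds the raw sum of differences.
 */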

int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
                        const int bwl) {
  int idx;
  int width = 4 << bwl;
  int16_t mean;
  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
  __m128i v1 = _mm_load_si128((const __m128i *)src);
  __m128i diff = _mm_subs_epi16(v0, v1);
  __m128i sum = diff;
  __m128i sse = _mm_madd_epi16(diff, diff);

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    v0 = _mm_loadu_si128((const __m128i *)ref);
    v1 = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(v0, v1);

    sum = _mm_add_epi16(sum, diff);
    v0 = _mm_madd_epi16(diff, diff);
    sse = _mm_add_epi32(sse, v0);

    ref += 8;
    src += 8;
  }

  v0 = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, v0);
  v0 = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, v0);

  v1 = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, v1);
  v1 = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, v1);

  mean = _mm_extract_epi16(sum, 0);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}
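
/* Toy check of the identity used by vp9_vector_var_sse2 (hypothetical
 * values, for illustration only): with bwl = 1 (N = 8) and differences
 * {3, 1, 3, 1, 3, 1, 3, 1}, sum = 16 and sse = 40, so the function
 * returns 40 - (16 * 16 >> 3) = 40 - 32 = 8, i.e. N times the variance
 * (each element deviates by 1 from the mean of 2).
 */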