/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

// Stage 5 of the 16-point inverse DCT, operating on 32-bit lanes.
//
// in:  stage-4 values; in[0..3], in[5], in[6] and in[8..15] are read.
// out: receives out[0..3], out[5], out[6] and out[8..15].
//
// out[4] and out[7] are intentionally NOT written here: every caller in this
// file passes its step1[] array as |out| and has already stored the stage-5
// values of elements 4 and 7 into it before calling.
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
                                             __m128i *const out) {
  // stage 5
  out[0] = _mm_add_epi32(in[0], in[3]);
  out[1] = _mm_add_epi32(in[1], in[2]);
  out[2] = _mm_sub_epi32(in[1], in[2]);
  out[3] = _mm_sub_epi32(in[0], in[3]);
  // Elements 5 and 6 are produced by the cospi_16 butterfly helper (defined
  // in a header that is not part of this file).
  highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]);
  out[8] = _mm_add_epi32(in[8], in[11]);
  out[9] = _mm_add_epi32(in[9], in[10]);
  out[10] = _mm_sub_epi32(in[9], in[10]);
  out[11] = _mm_sub_epi32(in[8], in[11]);
  out[12] = _mm_sub_epi32(in[15], in[12]);
  out[13] = _mm_sub_epi32(in[14], in[13]);
  out[14] = _mm_add_epi32(in[14], in[13]);
  out[15] = _mm_add_epi32(in[15], in[12]);
}

// Stage 6 of the 16-point inverse DCT, operating on 32-bit lanes.
//
// in:  stage-5 values; all sixteen elements are read (including in[4] and
//      in[7], which the caller must have filled in itself).
// out: all sixteen elements are written.
static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
                                             __m128i *const out) {
  // Even half: mirrored add/sub pairs (k, 7 - k).
  out[0] = _mm_add_epi32(in[0], in[7]);
  out[1] = _mm_add_epi32(in[1], in[6]);
  out[2] = _mm_add_epi32(in[2], in[5]);
  out[3] = _mm_add_epi32(in[3], in[4]);
  out[4] = _mm_sub_epi32(in[3], in[4]);
  out[5] = _mm_sub_epi32(in[2], in[5]);
  out[6] = _mm_sub_epi32(in[1], in[6]);
  out[7] = _mm_sub_epi32(in[0], in[7]);
  // Odd half: 8, 9, 14 and 15 pass through unchanged; 10..13 go through the
  // cospi_16 butterfly helper.
  out[8] = in[8];
  out[9] = in[9];
  highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]);
  highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]);
  out[14] = in[14];
  out[15] = in[15];
}

// Full 16-point inverse DCT, performed in place on io[0..15].
//
// All sixteen inputs are read (stage 2 uses the odd indices, stage 3 uses
// 2/6/10/14, stage 4 uses 0/4/8/12). The result is written back to io[] by
// highbd_idct16_4col_stage7(), which is defined outside this file.
// NOTE(review): the "4col" suffix suggests each __m128i carries four 32-bit
// columns; this matches the epi32 arithmetic used here but the data layout is
// set by the callers.
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
  __m128i step1[16], step2[16];

  // stage 2
  highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
                        &step2[15]);
  highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
                        &step2[14]);
  highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
                        &step2[13]);
  highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
                        &step2[12]);

  // stage 3
  highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
                        &step1[7]);
  highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
                        &step1[6]);
  step1[8] = _mm_add_epi32(step2[8], step2[9]);
  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
  // Elements 10 and 13 are stored with the sign flipped, as the trailing
  // comments state; they are consumed by the second stage-4 butterfly below.
  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
  step1[11] = _mm_add_epi32(step2[10], step2[11]);
  step1[12] = _mm_add_epi32(step2[13], step2[12]);
  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
  step1[15] = _mm_add_epi32(step2[15], step2[14]);

  // stage 4
  highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]);
  highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
                        &step2[3]);
  highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
                        &step2[14]);
  highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
                        &step2[13], &step2[10]);
  // Order matters: each difference (step2[5], step2[6]) must be taken before
  // the matching sum overwrites step1[4] / step1[7] in place. Those in-place
  // sums are the elements 4 and 7 that stage 5 leaves untouched in step1[].
  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
  step1[4] = _mm_add_epi32(step1[4], step1[5]);
  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
  step1[7] = _mm_add_epi32(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  highbd_idct16_4col_stage5(step2, step1);
  highbd_idct16_4col_stage6(step1, step2);
  highbd_idct16_4col_stage7(step2, io);
}
static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { 105 __m128i step1[16], step2[16]; 106 __m128i temp1[2], sign[2]; 107 108 // stage 2 109 highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], 110 &step2[15]); 111 highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9], 112 &step2[14]); 113 highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10], 114 &step2[13]); 115 highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], 116 &step2[12]); 117 118 // stage 3 119 highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], 120 &step1[7]); 121 highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5], 122 &step1[6]); 123 step1[8] = _mm_add_epi32(step2[8], step2[9]); 124 step1[9] = _mm_sub_epi32(step2[8], step2[9]); 125 step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] 126 step1[11] = _mm_add_epi32(step2[10], step2[11]); 127 step1[12] = _mm_add_epi32(step2[13], step2[12]); 128 step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] 129 step1[14] = _mm_sub_epi32(step2[15], step2[14]); 130 step1[15] = _mm_add_epi32(step2[15], step2[14]); 131 132 // stage 4 133 abs_extend_64bit_sse2(io[0], temp1, sign); 134 step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); 135 step2[1] = step2[0]; 136 highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2], 137 &step2[3]); 138 highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], 139 &step2[14]); 140 highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, 141 &step2[13], &step2[10]); 142 step2[5] = _mm_sub_epi32(step1[4], step1[5]); 143 step1[4] = _mm_add_epi32(step1[4], step1[5]); 144 step2[6] = _mm_sub_epi32(step1[7], step1[6]); 145 step1[7] = _mm_add_epi32(step1[7], step1[6]); 146 step2[8] = step1[8]; 147 step2[11] = step1[11]; 148 step2[12] = step1[12]; 149 step2[15] = step1[15]; 150 
151 highbd_idct16_4col_stage5(step2, step1); 152 highbd_idct16_4col_stage6(step1, step2); 153 highbd_idct16_4col_stage7(step2, io); 154 } 155 156 static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { 157 __m128i step1[16], step2[16]; 158 __m128i temp[2], sign[2]; 159 160 // stage 2 161 highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], 162 &step2[15]); 163 highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], 164 &step2[12]); 165 166 // stage 3 167 highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], 168 &step1[7]); 169 step1[8] = step2[8]; 170 step1[9] = step2[8]; 171 step1[10] = 172 _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10] 173 step1[11] = step2[11]; 174 step1[12] = step2[12]; 175 step1[13] = 176 _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13] 177 step1[14] = step2[15]; 178 step1[15] = step2[15]; 179 180 // stage 4 181 abs_extend_64bit_sse2(io[0], temp, sign); 182 step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64); 183 step2[1] = step2[0]; 184 step2[2] = _mm_setzero_si128(); 185 step2[3] = _mm_setzero_si128(); 186 highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], 187 &step2[14]); 188 highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, 189 &step2[13], &step2[10]); 190 step2[5] = step1[4]; 191 step2[6] = step1[7]; 192 step2[8] = step1[8]; 193 step2[11] = step1[11]; 194 step2[12] = step1[12]; 195 step2[15] = step1[15]; 196 197 highbd_idct16_4col_stage5(step2, step1); 198 highbd_idct16_4col_stage6(step1, step2); 199 highbd_idct16_4col_stage7(step2, io); 200 } 201 202 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, 203 int stride, int bd) { 204 int i; 205 __m128i out[16], *in; 206 207 if (bd == 8) { 208 __m128i l[16], r[16]; 209 210 in = l; 211 for (i = 0; i < 2; i++) { 212 highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); 
213 highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); 214 idct16_8col(in, in); 215 in = r; 216 input += 128; 217 } 218 219 for (i = 0; i < 16; i += 8) { 220 int j; 221 transpose_16bit_8x8(l + i, out); 222 transpose_16bit_8x8(r + i, out + 8); 223 idct16_8col(out, out); 224 225 for (j = 0; j < 16; ++j) { 226 highbd_write_buffer_8(dest + j * stride, out[j], bd); 227 } 228 dest += 8; 229 } 230 } else { 231 __m128i all[4][16]; 232 233 for (i = 0; i < 4; i++) { 234 in = all[i]; 235 highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); 236 highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); 237 highbd_idct16_4col(in); 238 input += 4 * 16; 239 } 240 241 for (i = 0; i < 16; i += 4) { 242 int j; 243 transpose_32bit_4x4(all[0] + i, out + 0); 244 transpose_32bit_4x4(all[1] + i, out + 4); 245 transpose_32bit_4x4(all[2] + i, out + 8); 246 transpose_32bit_4x4(all[3] + i, out + 12); 247 highbd_idct16_4col(out); 248 249 for (j = 0; j < 16; ++j) { 250 highbd_write_buffer_4(dest + j * stride, out[j], bd); 251 } 252 dest += 4; 253 } 254 } 255 } 256 257 void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, 258 int stride, int bd) { 259 int i; 260 __m128i out[16]; 261 262 if (bd == 8) { 263 __m128i in[16], temp[16]; 264 265 highbd_load_pack_transpose_32bit_8x8(input, 16, in); 266 for (i = 8; i < 16; i++) { 267 in[i] = _mm_setzero_si128(); 268 } 269 idct16_8col(in, temp); 270 271 for (i = 0; i < 16; i += 8) { 272 int j; 273 transpose_16bit_8x8(temp + i, in); 274 idct16_8col(in, out); 275 276 for (j = 0; j < 16; ++j) { 277 highbd_write_buffer_8(dest + j * stride, out[j], bd); 278 } 279 dest += 8; 280 } 281 } else { 282 __m128i all[2][16], *in; 283 284 for (i = 0; i < 2; i++) { 285 in = all[i]; 286 highbd_load_transpose_32bit_8x4(input, 16, in); 287 highbd_idct16x16_38_4col(in); 288 input += 4 * 16; 289 } 290 291 for (i = 0; i < 16; i += 4) { 292 int j; 293 transpose_32bit_4x4(all[0] + i, out + 0); 294 transpose_32bit_4x4(all[1] + i, out + 4); 295 
highbd_idct16x16_38_4col(out); 296 297 for (j = 0; j < 16; ++j) { 298 highbd_write_buffer_4(dest + j * stride, out[j], bd); 299 } 300 dest += 4; 301 } 302 } 303 } 304 305 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, 306 int stride, int bd) { 307 int i; 308 __m128i out[16]; 309 310 if (bd == 8) { 311 __m128i in[16], l[16]; 312 313 in[0] = load_pack_8_32bit(input + 0 * 16); 314 in[1] = load_pack_8_32bit(input + 1 * 16); 315 in[2] = load_pack_8_32bit(input + 2 * 16); 316 in[3] = load_pack_8_32bit(input + 3 * 16); 317 318 idct16x16_10_pass1(in, l); 319 320 for (i = 0; i < 16; i += 8) { 321 int j; 322 idct16x16_10_pass2(l + i, in); 323 324 for (j = 0; j < 16; ++j) { 325 highbd_write_buffer_8(dest + j * stride, in[j], bd); 326 } 327 dest += 8; 328 } 329 } else { 330 __m128i all[2][16], *in; 331 332 for (i = 0; i < 2; i++) { 333 in = all[i]; 334 highbd_load_transpose_32bit_4x4(input, 16, in); 335 highbd_idct16x16_10_4col(in); 336 input += 4 * 16; 337 } 338 339 for (i = 0; i < 16; i += 4) { 340 int j; 341 transpose_32bit_4x4(&all[0][i], out); 342 highbd_idct16x16_10_4col(out); 343 344 for (j = 0; j < 16; ++j) { 345 highbd_write_buffer_4(dest + j * stride, out[j], bd); 346 } 347 dest += 4; 348 } 349 } 350 } 351 352 void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, 353 int stride, int bd) { 354 highbd_idct_1_add_kernel(input, dest, stride, bd, 16); 355 } 356