/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  __m128i inptr[4];
  __m128i sign_bits[2];
  __m128i temp_mm, min_input, max_input;
  int test;
  int optimised_cols = 0;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i eight = _mm_set1_epi16(8);
  // Coefficients outside [-12043, 12043] could overflow the 16-bit SSE2
  // transform below, so such blocks are routed to the C fallback instead.
  const __m128i max = _mm_set1_epi16(12043);
  const __m128i min = _mm_set1_epi16(-12043);
  // Load input into __m128i
  inptr[0] = _mm_loadu_si128((const __m128i *)input);
  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));

  // Pack to 16 bits
  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);

  // Range-check the packed coefficients: each comparison flags lanes that
  // exceed the bound, and movemask folds the flags into a single integer.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp_mm = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp_mm);

  if (!test) {
    // Do the row transform
    idct4_sse2(inptr);

    // Check the min & max values of the intermediate results
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp_mm = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp_mm);

    if (test) {
      // Intermediate values are out of 16-bit range: sign-extend them back
      // to 32-bit tran_low_t and fall through to the C column transform.
      transpose_16bit_4x4(inptr);
      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct4_c(input, outptr, bd);
      input += 4;
      outptr += 4;
    }
  }

  if (optimised_cols) {
    idct4_sse2(inptr);

    // Final round and shift
    inptr[0] = _mm_add_epi16(inptr[0], eight);
    inptr[1] = _mm_add_epi16(inptr[1], eight);

    inptr[0] = _mm_srai_epi16(inptr[0], 4);
    inptr[1] = _mm_srai_epi16(inptr[1], 4);
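
    // The column transform leaves output rows 0 and 1 packed into the low
    // and high 64 bits of inptr[0], and rows 2 and 3 likewise in inptr[1].
    // The destination rows are gathered into the same layout below, added
    // with saturation, clamped to [0, (1 << bd) - 1] by clamp_high_sse2(),
    // and written back one 64-bit row at a time.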
    // Reconstruction and Store
    {
      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
      d0 = _mm_unpacklo_epi64(
          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
      d2 = _mm_unpacklo_epi64(
          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
      // Store row 0
      _mm_storel_epi64((__m128i *)dest, d0);
      // Store row 1
      d0 = _mm_srli_si128(d0, 8);
      _mm_storel_epi64((__m128i *)(dest + stride), d0);
      // Store row 2
      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
      // Store row 3
      d2 = _mm_srli_si128(d2, 8);
      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[4], temp_out[4];
    // Columns
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
      vpx_highbd_idct4_c(temp_in, temp_out, bd);
      for (j = 0; j < 4; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
      }
    }
  }
}

void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  const __m128i zero = _mm_setzero_si128();
  // Faster than _mm_set1_epi16((1 << bd) - 1).
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  int a1, i;
  tran_low_t out;
  __m128i dc, d;

  out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 4);
  dc = _mm_set1_epi16(a1);

  for (i = 0; i < 4; ++i) {
    d = _mm_loadl_epi64((const __m128i *)dest);
    d = add_dc_clamp(&zero, &max, &dc, &d);
    _mm_storel_epi64((__m128i *)dest, d);
    dest += stride;
  }
}
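
/* Note on the DC-only path above: for a block with just a DC coefficient,
 * every output pixel receives the same offset a1.  cospi_16_64 = 11585 is
 * round(cos(pi/4) * 2^14) as defined elsewhere in vpx_dsp, and
 * dct_const_round_shift() rounds and shifts right by 14, so each
 * dct_const_round_shift(x * cospi_16_64) multiplies by roughly 1/sqrt(2).
 * Applying it once per transform dimension halves the DC, and
 * ROUND_POWER_OF_TWO(out, 4) then applies the final 4x4 inverse-transform
 * scaling, so the value added to each pixel is approximately input[0] / 32.
 */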