1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <emmintrin.h> // SSE2 12 13 #include "./vpx_config.h" 14 #include "./vpx_dsp_rtcd.h" 15 #include "vpx_dsp/vpx_dsp_common.h" 16 #include "vpx_dsp/x86/fwd_txfm_sse2.h" 17 18 void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { 19 __m128i in0, in1; 20 __m128i tmp; 21 const __m128i zero = _mm_setzero_si128(); 22 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 23 in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 24 in1 = _mm_unpacklo_epi64( 25 in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); 26 in0 = _mm_unpacklo_epi64( 27 in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); 28 29 tmp = _mm_add_epi16(in0, in1); 30 in0 = _mm_unpacklo_epi16(zero, tmp); 31 in1 = _mm_unpackhi_epi16(zero, tmp); 32 in0 = _mm_srai_epi32(in0, 16); 33 in1 = _mm_srai_epi32(in1, 16); 34 35 tmp = _mm_add_epi32(in0, in1); 36 in0 = _mm_unpacklo_epi32(tmp, zero); 37 in1 = _mm_unpackhi_epi32(tmp, zero); 38 39 tmp = _mm_add_epi32(in0, in1); 40 in0 = _mm_srli_si128(tmp, 8); 41 42 in1 = _mm_add_epi32(tmp, in0); 43 in0 = _mm_slli_epi32(in1, 1); 44 output[0] = (tran_low_t)_mm_cvtsi128_si32(in0); 45 } 46 47 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { 48 __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); 49 __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); 50 __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); 51 __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); 52 __m128i u0, u1, sum; 53 54 u0 = _mm_add_epi16(in0, in1); 55 u1 = _mm_add_epi16(in2, in3); 56 57 in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); 58 in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); 59 in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); 60 in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); 61 62 sum = _mm_add_epi16(u0, u1); 63 64 in0 = _mm_add_epi16(in0, in1); 65 in2 = _mm_add_epi16(in2, in3); 66 sum = _mm_add_epi16(sum, in0); 67 68 u0 = _mm_setzero_si128(); 69 sum = _mm_add_epi16(sum, in2); 70 71 in0 = _mm_unpacklo_epi16(u0, sum); 72 in1 = _mm_unpackhi_epi16(u0, sum); 73 in0 = _mm_srai_epi32(in0, 16); 74 in1 = _mm_srai_epi32(in1, 16); 75 76 sum = _mm_add_epi32(in0, in1); 77 in0 = _mm_unpacklo_epi32(sum, u0); 78 in1 = _mm_unpackhi_epi32(sum, u0); 79 80 sum = _mm_add_epi32(in0, in1); 81 in0 = _mm_srli_si128(sum, 8); 82 83 in1 = _mm_add_epi32(sum, in0); 84 output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); 85 } 86 87 void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, 88 int stride) { 89 __m128i in0, in1, in2, in3; 90 __m128i u0, u1; 91 __m128i sum = _mm_setzero_si128(); 92 int i; 93 94 for (i = 0; i < 2; ++i) { 95 in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); 96 in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8)); 97 in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); 98 in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8)); 99 100 u0 = _mm_add_epi16(in0, in1); 101 u1 = _mm_add_epi16(in2, in3); 102 sum = _mm_add_epi16(sum, u0); 103 104 in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); 105 in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8)); 106 in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); 107 in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8)); 108 109 sum = _mm_add_epi16(sum, u1); 110 u0 = _mm_add_epi16(in0, in1); 111 u1 = _mm_add_epi16(in2, in3); 112 sum = _mm_add_epi16(sum, u0); 113 114 in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0)); 115 in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8)); 116 in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0)); 117 in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8)); 118 119 sum = _mm_add_epi16(sum, u1); 120 u0 = _mm_add_epi16(in0, in1); 121 u1 = _mm_add_epi16(in2, in3); 122 sum = _mm_add_epi16(sum, u0); 123 124 in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0)); 125 in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8)); 126 in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0)); 127 in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8)); 128 129 sum = _mm_add_epi16(sum, u1); 130 u0 = _mm_add_epi16(in0, in1); 131 u1 = _mm_add_epi16(in2, in3); 132 sum = _mm_add_epi16(sum, u0); 133 134 sum = _mm_add_epi16(sum, u1); 135 input += 8 * stride; 136 } 137 138 u0 = _mm_setzero_si128(); 139 in0 = _mm_unpacklo_epi16(u0, sum); 140 in1 = _mm_unpackhi_epi16(u0, sum); 141 in0 = _mm_srai_epi32(in0, 16); 142 in1 = _mm_srai_epi32(in1, 16); 143 144 sum = _mm_add_epi32(in0, in1); 145 in0 = _mm_unpacklo_epi32(sum, u0); 146 in1 = _mm_unpackhi_epi32(sum, u0); 147 148 sum = _mm_add_epi32(in0, in1); 149 in0 = _mm_srli_si128(sum, 8); 150 151 in1 = _mm_add_epi32(sum, in0); 152 in1 = _mm_srai_epi32(in1, 1); 153 output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); 154 } 155 156 void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, 157 int stride) { 158 __m128i in0, in1, in2, in3; 159 __m128i u0, u1; 160 __m128i sum = _mm_setzero_si128(); 161 int i; 162 163 for (i = 0; i < 8; ++i) { 164 in0 = _mm_load_si128((const __m128i *)(input + 0)); 165 in1 = _mm_load_si128((const __m128i *)(input + 8)); 166 in2 = _mm_load_si128((const __m128i *)(input + 16)); 167 in3 = _mm_load_si128((const __m128i *)(input + 24)); 168 169 input += stride; 170 u0 = _mm_add_epi16(in0, in1); 171 u1 = _mm_add_epi16(in2, in3); 172 sum = _mm_add_epi16(sum, u0); 173 174 in0 = _mm_load_si128((const __m128i *)(input + 0)); 175 in1 = _mm_load_si128((const __m128i *)(input + 8)); 176 in2 = _mm_load_si128((const __m128i *)(input + 16)); 177 in3 = _mm_load_si128((const __m128i *)(input + 24)); 178 179 input += stride; 180 sum = _mm_add_epi16(sum, u1); 181 u0 = _mm_add_epi16(in0, in1); 182 u1 = _mm_add_epi16(in2, in3); 183 sum = _mm_add_epi16(sum, u0); 184 185 in0 = _mm_load_si128((const __m128i *)(input + 0)); 186 in1 = _mm_load_si128((const __m128i *)(input + 8)); 187 in2 = _mm_load_si128((const __m128i *)(input + 16)); 188 in3 = _mm_load_si128((const __m128i *)(input + 24)); 189 190 input += stride; 191 sum = _mm_add_epi16(sum, u1); 192 u0 = _mm_add_epi16(in0, in1); 193 u1 = _mm_add_epi16(in2, in3); 194 sum = _mm_add_epi16(sum, u0); 195 196 in0 = _mm_load_si128((const __m128i *)(input + 0)); 197 in1 = _mm_load_si128((const __m128i *)(input + 8)); 198 in2 = _mm_load_si128((const __m128i *)(input + 16)); 199 in3 = _mm_load_si128((const __m128i *)(input + 24)); 200 201 input += stride; 202 sum = _mm_add_epi16(sum, u1); 203 u0 = _mm_add_epi16(in0, in1); 204 u1 = _mm_add_epi16(in2, in3); 205 sum = _mm_add_epi16(sum, u0); 206 207 sum = _mm_add_epi16(sum, u1); 208 } 209 210 u0 = _mm_setzero_si128(); 211 in0 = _mm_unpacklo_epi16(u0, sum); 212 in1 = _mm_unpackhi_epi16(u0, sum); 213 in0 = _mm_srai_epi32(in0, 16); 214 in1 = _mm_srai_epi32(in1, 16); 215 216 sum = _mm_add_epi32(in0, in1); 217 in0 = _mm_unpacklo_epi32(sum, u0); 218 in1 = _mm_unpackhi_epi32(sum, u0); 219 220 sum = _mm_add_epi32(in0, in1); 221 in0 = _mm_srli_si128(sum, 8); 222 223 in1 = _mm_add_epi32(sum, in0); 224 in1 = _mm_srai_epi32(in1, 3); 225 output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); 226 } 227 228 #define DCT_HIGH_BIT_DEPTH 0 229 #define FDCT4x4_2D vpx_fdct4x4_sse2 230 #define FDCT8x8_2D vpx_fdct8x8_sse2 231 #define FDCT16x16_2D vpx_fdct16x16_sse2 232 #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" 233 #undef FDCT4x4_2D 234 #undef FDCT8x8_2D 235 #undef FDCT16x16_2D 236 237 #define FDCT32x32_2D vpx_fdct32x32_rd_sse2 238 #define FDCT32x32_HIGH_PRECISION 0 239 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" 240 #undef FDCT32x32_2D 241 #undef FDCT32x32_HIGH_PRECISION 242 243 #define FDCT32x32_2D vpx_fdct32x32_sse2 244 #define FDCT32x32_HIGH_PRECISION 1 245 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT 246 #undef FDCT32x32_2D 247 #undef FDCT32x32_HIGH_PRECISION 248 #undef DCT_HIGH_BIT_DEPTH 249 250 #if CONFIG_VP9_HIGHBITDEPTH 251 #define DCT_HIGH_BIT_DEPTH 1 252 #define FDCT4x4_2D vpx_highbd_fdct4x4_sse2 253 #define FDCT8x8_2D vpx_highbd_fdct8x8_sse2 254 #define FDCT16x16_2D vpx_highbd_fdct16x16_sse2 255 #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT 256 #undef FDCT4x4_2D 257 #undef FDCT8x8_2D 258 #undef FDCT16x16_2D 259 260 #define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2 261 #define FDCT32x32_HIGH_PRECISION 0 262 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT 263 #undef FDCT32x32_2D 264 #undef FDCT32x32_HIGH_PRECISION 265 266 #define FDCT32x32_2D vpx_highbd_fdct32x32_sse2 267 #define FDCT32x32_HIGH_PRECISION 1 268 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT 269 #undef FDCT32x32_2D 270 #undef FDCT32x32_HIGH_PRECISION 271 #undef DCT_HIGH_BIT_DEPTH 272 #endif // CONFIG_VP9_HIGHBITDEPTH 273