1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <tmmintrin.h> 12 13 #include "./vpx_dsp_rtcd.h" 14 #include "vpx_dsp/vpx_filter.h" 15 #include "vpx_dsp/x86/convolve.h" 16 #include "vpx_mem/vpx_mem.h" 17 #include "vpx_ports/mem.h" 18 #include "vpx_ports/emmintrin_compat.h" 19 20 // filters only for the 4_h8 convolution 21 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 22 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 23 }; 24 25 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 26 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 27 }; 28 29 // filters for 8_h8 and 16_h8 30 DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { 31 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 32 }; 33 34 DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { 35 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 36 }; 37 38 DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { 39 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 40 }; 41 42 DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { 43 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 44 }; 45 46 // These are reused by the avx2 intrinsics. 47 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; 48 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; 49 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; 50 51 void vpx_filter_block1d4_h8_intrin_ssse3( 52 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 53 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 54 __m128i firstFilters, secondFilters, shuffle1, shuffle2; 55 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; 56 __m128i addFilterReg64, filtersReg, srcReg, minReg; 57 unsigned int i; 58 59 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 60 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 61 filtersReg = _mm_loadu_si128((const __m128i *)filter); 62 // converting the 16 bit (short) to 8 bit (byte) and have the same data 63 // in both lanes of 128 bit register. 64 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 65 66 // duplicate only the first 16 bits in the filter into the first lane 67 firstFilters = _mm_shufflelo_epi16(filtersReg, 0); 68 // duplicate only the third 16 bit in the filter into the first lane 69 secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); 70 // duplicate only the seconds 16 bits in the filter into the second lane 71 // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 72 firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); 73 // duplicate only the forth 16 bits in the filter into the second lane 74 // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 75 secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); 76 77 // loading the local filters 78 shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); 79 shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); 80 81 for (i = 0; i < output_height; i++) { 82 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); 83 84 // filter the source buffer 85 srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1); 86 srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2); 87 88 // multiply 2 adjacent elements with the filter and add the result 89 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 90 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 91 92 // extract the higher half of the lane 93 srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); 94 srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); 95 96 minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); 97 98 // add and saturate all the results together 99 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); 100 srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); 101 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); 102 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); 103 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 104 105 // shift by 7 bit each 16 bits 106 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 107 108 // shrink to 8 bit each 16 bits 109 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 110 src_ptr += src_pixels_per_line; 111 112 // save only 4 bytes 113 *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); 114 115 output_ptr += output_pitch; 116 } 117 } 118 119 void vpx_filter_block1d8_h8_intrin_ssse3( 120 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, 121 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { 122 __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; 123 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; 124 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; 125 __m128i addFilterReg64, filtersReg, minReg; 126 unsigned int i; 127 128 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 129 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 130 filtersReg = _mm_loadu_si128((const __m128i *)filter); 131 // converting the 16 bit (short) to 8 bit (byte) and have the same data 132 // in both lanes of 128 bit register. 133 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 134 135 // duplicate only the first 16 bits (first and second byte) 136 // across 128 bit register 137 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 138 // duplicate only the second 16 bits (third and forth byte) 139 // across 128 bit register 140 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 141 // duplicate only the third 16 bits (fifth and sixth byte) 142 // across 128 bit register 143 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 144 // duplicate only the forth 16 bits (seventh and eighth byte) 145 // across 128 bit register 146 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 147 148 filt1Reg = _mm_load_si128((__m128i const *)filt1_global); 149 filt2Reg = _mm_load_si128((__m128i const *)filt2_global); 150 filt3Reg = _mm_load_si128((__m128i const *)filt3_global); 151 filt4Reg = _mm_load_si128((__m128i const *)filt4_global); 152 153 for (i = 0; i < output_height; i++) { 154 srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); 155 156 // filter the source buffer 157 srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); 158 srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); 159 160 // multiply 2 adjacent elements with the filter and add the result 161 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 162 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); 163 164 // filter the source buffer 165 srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); 166 srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); 167 168 // multiply 2 adjacent elements with the filter and add the result 169 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); 170 srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); 171 172 // add and saturate all the results together 173 minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); 174 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); 175 176 srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); 177 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); 178 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); 179 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 180 181 // shift by 7 bit each 16 bits 182 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 183 184 // shrink to 8 bit each 16 bits 185 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 186 187 src_ptr += src_pixels_per_line; 188 189 // save only 8 bytes 190 _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); 191 192 output_ptr += output_pitch; 193 } 194 } 195 196 void vpx_filter_block1d8_v8_intrin_ssse3( 197 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, 198 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { 199 __m128i addFilterReg64, filtersReg, minReg; 200 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; 201 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; 202 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; 203 __m128i srcReg8; 204 unsigned int i; 205 206 // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 207 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); 208 filtersReg = _mm_loadu_si128((const __m128i *)filter); 209 // converting the 16 bit (short) to 8 bit (byte) and have the same data 210 // in both lanes of 128 bit register. 211 filtersReg = _mm_packs_epi16(filtersReg, filtersReg); 212 213 // duplicate only the first 16 bits in the filter 214 firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); 215 // duplicate only the second 16 bits in the filter 216 secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); 217 // duplicate only the third 16 bits in the filter 218 thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); 219 // duplicate only the forth 16 bits in the filter 220 forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); 221 222 // load the first 7 rows of 8 bytes 223 srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); 224 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); 225 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); 226 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); 227 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); 228 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); 229 srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); 230 231 for (i = 0; i < output_height; i++) { 232 // load the last 8 bytes 233 srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); 234 235 // merge the result together 236 srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); 237 srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); 238 239 // merge the result together 240 srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); 241 srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); 242 243 // multiply 2 adjacent elements with the filter and add the result 244 srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); 245 srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); 246 srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); 247 srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); 248 249 // add and saturate the results together 250 minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); 251 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); 252 srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); 253 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); 254 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); 255 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); 256 257 // shift by 7 bit each 16 bit 258 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 259 260 // shrink to 8 bit each 16 bits 261 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); 262 263 src_ptr += src_pitch; 264 265 // shift down a row 266 srcReg1 = srcReg2; 267 srcReg2 = srcReg3; 268 srcReg3 = srcReg4; 269 srcReg4 = srcReg5; 270 srcReg5 = srcReg6; 271 srcReg6 = srcReg7; 272 srcReg7 = srcReg8; 273 274 // save only 8 bytes convolve result 275 _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); 276 277 output_ptr += out_pitch; 278 } 279 } 280 281 filter8_1dfunction vpx_filter_block1d16_v8_ssse3; 282 filter8_1dfunction vpx_filter_block1d16_h8_ssse3; 283 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; 284 filter8_1dfunction vpx_filter_block1d8_h8_ssse3; 285 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; 286 filter8_1dfunction vpx_filter_block1d4_h8_ssse3; 287 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; 288 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; 289 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; 290 filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; 291 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; 292 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; 293 294 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; 295 filter8_1dfunction vpx_filter_block1d16_h2_ssse3; 296 filter8_1dfunction vpx_filter_block1d8_v2_ssse3; 297 filter8_1dfunction vpx_filter_block1d8_h2_ssse3; 298 filter8_1dfunction vpx_filter_block1d4_v2_ssse3; 299 filter8_1dfunction vpx_filter_block1d4_h2_ssse3; 300 filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; 301 filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; 302 filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; 303 filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; 304 filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; 305 filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; 306 307 // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 308 // uint8_t *dst, ptrdiff_t dst_stride, 309 // const int16_t *filter_x, int x_step_q4, 310 // const int16_t *filter_y, int y_step_q4, 311 // int w, int h); 312 // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 313 // uint8_t *dst, ptrdiff_t dst_stride, 314 // const int16_t *filter_x, int x_step_q4, 315 // const int16_t *filter_y, int y_step_q4, 316 // int w, int h); 317 // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 318 // uint8_t *dst, ptrdiff_t dst_stride, 319 // const int16_t *filter_x, int x_step_q4, 320 // const int16_t *filter_y, int y_step_q4, 321 // int w, int h); 322 // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 323 // uint8_t *dst, ptrdiff_t dst_stride, 324 // const int16_t *filter_x, int x_step_q4, 325 // const int16_t *filter_y, int y_step_q4, 326 // int w, int h); 327 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); 328 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); 329 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); 330 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, 331 ssse3); 332 333 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 334 out2, out3, out4, out5, out6, out7) \ 335 { \ 336 const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ 337 const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ 338 const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ 339 const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ 340 \ 341 const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ 342 const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ 343 const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ 344 const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ 345 \ 346 const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ 347 const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ 348 const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ 349 const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ 350 \ 351 out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ 352 out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ 353 out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ 354 out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ 355 out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ 356 out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ 357 out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ 358 out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ 359 } 360 361 static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, 362 uint8_t *dst, const int16_t *x_filter) { 363 const __m128i k_256 = _mm_set1_epi16(1 << 8); 364 const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); 365 // pack and duplicate the filter values 366 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); 367 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); 368 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); 369 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); 370 const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); 371 const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); 372 const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); 373 const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); 374 const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); 375 const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); 376 const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); 377 const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); 378 // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 379 const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); 380 // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 381 const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); 382 // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 383 const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); 384 // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 385 const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); 386 // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 387 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 388 // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 389 const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); 390 // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 391 const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); 392 // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 393 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 394 // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 395 const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); 396 const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); 397 const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); 398 const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); 399 // multiply 2 adjacent elements with the filter and add the result 400 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); 401 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); 402 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); 403 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); 404 // add and saturate the results together 405 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); 406 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); 407 __m128i temp = _mm_adds_epi16(x0, x3); 408 temp = _mm_adds_epi16(temp, min_x2x1); 409 temp = _mm_adds_epi16(temp, max_x2x1); 410 // round and shift by 7 bit each 16 bit 411 temp = _mm_mulhrs_epi16(temp, k_256); 412 // shrink to 8 bit each 16 bits 413 temp = _mm_packus_epi16(temp, temp); 414 // save only 8 bytes convolve result 415 _mm_storel_epi64((__m128i *)dst, temp); 416 } 417 418 static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, 419 uint8_t *dst, ptrdiff_t dst_stride) { 420 __m128i A, B, C, D, E, F, G, H; 421 422 A = _mm_loadl_epi64((const __m128i *)src); 423 B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); 424 C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); 425 D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); 426 E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); 427 F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); 428 G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); 429 H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); 430 431 TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); 432 433 _mm_storel_epi64((__m128i *)dst, A); 434 _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); 435 _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); 436 _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); 437 _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); 438 _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); 439 _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); 440 _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); 441 } 442 443 static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, 444 uint8_t *dst, ptrdiff_t dst_stride, 445 const InterpKernel *x_filters, int x0_q4, 446 int x_step_q4, int w, int h) { 447 DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); 448 int x, y, z; 449 src -= SUBPEL_TAPS / 2 - 1; 450 451 // This function processes 8x8 areas. The intermediate height is not always 452 // a multiple of 8, so force it to be a multiple of 8 here. 453 y = h + (8 - (h & 0x7)); 454 455 do { 456 int x_q4 = x0_q4; 457 for (x = 0; x < w; x += 8) { 458 // process 8 src_x steps 459 for (z = 0; z < 8; ++z) { 460 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 461 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 462 if (x_q4 & SUBPEL_MASK) { 463 filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); 464 } else { 465 int i; 466 for (i = 0; i < 8; ++i) { 467 temp[z * 8 + i] = src_x[i * src_stride + 3]; 468 } 469 } 470 x_q4 += x_step_q4; 471 } 472 473 // transpose the 8x8 filters values back to dst 474 transpose8x8_to_dst(temp, 8, dst + x, dst_stride); 475 } 476 477 src += src_stride * 8; 478 dst += dst_stride * 8; 479 } while (y -= 8); 480 } 481 482 static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, 483 uint8_t *dst, const int16_t *filter) { 484 const __m128i k_256 = _mm_set1_epi16(1 << 8); 485 const __m128i f_values = _mm_load_si128((const __m128i *)filter); 486 // pack and duplicate the filter values 487 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); 488 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); 489 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); 490 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); 491 const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); 492 const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); 493 const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); 494 const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); 495 // TRANSPOSE... 496 // 00 01 02 03 04 05 06 07 497 // 10 11 12 13 14 15 16 17 498 // 20 21 22 23 24 25 26 27 499 // 30 31 32 33 34 35 36 37 500 // 501 // TO 502 // 503 // 00 10 20 30 504 // 01 11 21 31 505 // 02 12 22 32 506 // 03 13 23 33 507 // 04 14 24 34 508 // 05 15 25 35 509 // 06 16 26 36 510 // 07 17 27 37 511 // 512 // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 513 const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); 514 // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 515 const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); 516 // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 517 const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 518 // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 519 const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); 520 // 02 03 12 13 22 23 32 33 521 const __m128i s3s2 = _mm_srli_si128(s1s0, 8); 522 // 06 07 16 17 26 27 36 37 523 const __m128i s7s6 = _mm_srli_si128(s5s4, 8); 524 // multiply 2 adjacent elements with the filter and add the result 525 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); 526 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); 527 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); 528 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); 529 // add and saturate the results together 530 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); 531 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); 532 __m128i temp = _mm_adds_epi16(x0, x3); 533 temp = _mm_adds_epi16(temp, min_x2x1); 534 temp = _mm_adds_epi16(temp, max_x2x1); 535 // round and shift by 7 bit each 16 bit 536 temp = _mm_mulhrs_epi16(temp, k_256); 537 // shrink to 8 bit each 16 bits 538 temp = _mm_packus_epi16(temp, temp); 539 // save only 4 bytes 540 *(int *)dst = _mm_cvtsi128_si32(temp); 541 } 542 543 static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, 544 uint8_t *dst, ptrdiff_t dst_stride) { 545 __m128i A = _mm_cvtsi32_si128(*(const int *)src); 546 __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); 547 __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); 548 __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); 549 // 00 10 01 11 02 12 03 13 550 const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); 551 // 20 30 21 31 22 32 23 33 552 const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); 553 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 554 A = _mm_unpacklo_epi16(tr0_0, tr0_1); 555 B = _mm_srli_si128(A, 4); 556 C = _mm_srli_si128(A, 8); 557 D = _mm_srli_si128(A, 12); 558 559 *(int *)(dst) = _mm_cvtsi128_si32(A); 560 *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); 561 *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); 562 *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); 563 } 564 565 static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, 566 uint8_t *dst, ptrdiff_t dst_stride, 567 const InterpKernel *x_filters, int x0_q4, 568 int x_step_q4, int w, int h) { 569 DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); 570 int x, y, z; 571 src -= SUBPEL_TAPS / 2 - 1; 572 573 for (y = 0; y < h; y += 4) { 574 int x_q4 = x0_q4; 575 for (x = 0; x < w; x += 4) { 576 // process 4 src_x steps 577 for (z = 0; z < 4; ++z) { 578 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; 579 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; 580 if (x_q4 & SUBPEL_MASK) { 581 filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); 582 } else { 583 int i; 584 for (i = 0; i < 4; ++i) { 585 temp[z * 4 + i] = src_x[i * src_stride + 3]; 586 } 587 } 588 x_q4 += x_step_q4; 589 } 590 591 // transpose the 4x4 filters values back to dst 592 transpose4x4_to_dst(temp, 4, dst + x, dst_stride); 593 } 594 595 src += src_stride * 4; 596 dst += dst_stride * 4; 597 } 598 } 599 600 static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, 601 uint8_t *dst, const int16_t *filter) { 602 const __m128i k_256 = _mm_set1_epi16(1 << 8); 603 const __m128i f_values = _mm_load_si128((const __m128i *)filter); 604 // pack and duplicate the filter values 605 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); 606 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); 607 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); 608 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); 609 const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); 610 const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); 611 const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); 612 const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); 613 const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); 614 const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); 615 const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); 616 const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); 617 const __m128i s1s0 = _mm_unpacklo_epi8(A, B); 618 const __m128i s3s2 = _mm_unpacklo_epi8(C, D); 619 const __m128i s5s4 = _mm_unpacklo_epi8(E, F); 620 const __m128i s7s6 = _mm_unpacklo_epi8(G, H); 621 // multiply 2 adjacent elements with the filter and add the result 622 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); 623 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); 624 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); 625 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); 626 // add and saturate the results together 627 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); 628 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); 629 __m128i temp = _mm_adds_epi16(x0, x3); 630 temp = _mm_adds_epi16(temp, min_x2x1); 631 temp = _mm_adds_epi16(temp, max_x2x1); 632 // round and shift by 7 bit each 16 bit 633 temp = _mm_mulhrs_epi16(temp, k_256); 634 // shrink to 8 bit each 16 bits 635 temp = _mm_packus_epi16(temp, temp); 636 // save only 4 bytes 637 *(int *)dst = _mm_cvtsi128_si32(temp); 638 } 639 640 static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, 641 uint8_t *dst, ptrdiff_t dst_stride, 642 const InterpKernel *y_filters, int y0_q4, 643 int y_step_q4, int w, int h) { 644 int y; 645 int y_q4 = y0_q4; 646 647 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 648 for (y = 0; y < h; ++y) { 649 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 650 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 651 652 if (y_q4 & SUBPEL_MASK) { 653 filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); 654 } else { 655 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); 656 } 657 658 y_q4 += y_step_q4; 659 } 660 } 661 662 static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, 663 uint8_t *dst, const int16_t *filter) { 664 const __m128i k_256 = _mm_set1_epi16(1 << 8); 665 const __m128i f_values = _mm_load_si128((const __m128i *)filter); 666 // pack and duplicate the filter values 667 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); 668 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); 669 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); 670 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); 671 const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); 672 const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); 673 const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); 674 const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); 675 const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); 676 const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); 677 const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); 678 const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); 679 const __m128i s1s0 = _mm_unpacklo_epi8(A, B); 680 const __m128i s3s2 = _mm_unpacklo_epi8(C, D); 681 const __m128i s5s4 = _mm_unpacklo_epi8(E, F); 682 const __m128i s7s6 = _mm_unpacklo_epi8(G, H); 683 // multiply 2 adjacent elements with the filter and add the result 684 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); 685 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); 686 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); 687 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); 688 // add and saturate the results together 689 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); 690 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); 691 __m128i temp = _mm_adds_epi16(x0, x3); 692 temp = _mm_adds_epi16(temp, min_x2x1); 693 temp = _mm_adds_epi16(temp, max_x2x1); 694 // round and shift by 7 bit each 16 bit 695 temp = _mm_mulhrs_epi16(temp, k_256); 696 // shrink to 8 bit each 16 bits 697 temp = _mm_packus_epi16(temp, temp); 698 // save only 8 bytes convolve result 699 _mm_storel_epi64((__m128i *)dst, temp); 700 } 701 702 static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, 703 uint8_t *dst, ptrdiff_t dst_stride, 704 const InterpKernel *y_filters, int y0_q4, 705 int y_step_q4, int w, int h) { 706 int y; 707 int y_q4 = y0_q4; 708 709 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 710 for (y = 0; y < h; ++y) { 711 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 712 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 713 if (y_q4 & SUBPEL_MASK) { 714 filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); 715 } else { 716 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); 717 } 718 y_q4 += y_step_q4; 719 } 720 } 721 722 static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, 723 uint8_t *dst, const int16_t *filter, int w) { 724 const __m128i k_256 = _mm_set1_epi16(1 << 8); 725 const __m128i f_values = _mm_load_si128((const __m128i *)filter); 726 // pack and duplicate the filter values 727 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); 728 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); 729 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); 730 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); 731 int i; 732 733 for (i = 0; i < w; i += 16) { 734 const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); 735 const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); 736 const __m128i C = 737 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); 738 const __m128i D = 739 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); 740 const __m128i E = 741 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); 742 const __m128i F = 743 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); 744 const __m128i G = 745 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); 746 const __m128i H = 747 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); 748 // merge the result together 749 const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); 750 const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); 751 const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); 752 const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); 753 // multiply 2 adjacent elements with the filter and add the result 754 const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); 755 const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); 756 const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); 757 const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); 758 // add and saturate the results together 759 const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); 760 const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); 761 // merge the result together 762 const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); 763 const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); 764 // multiply 2 adjacent elements with the filter and add the result 765 const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); 766 const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); 767 // merge the result together 768 const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); 769 const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); 770 // multiply 2 adjacent elements with the filter and add the result 771 const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); 772 const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); 773 // add and saturate the results together 774 __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); 775 __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); 776 777 // add and saturate the results together 778 temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); 779 temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); 780 // round and shift by 7 bit each 16 bit 781 temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); 782 temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); 783 // shrink to 8 bit each 16 bits, the first lane contain the first 784 // convolve result and the second lane contain the second convolve 785 // result 786 temp_hi = _mm_packus_epi16(temp_lo, temp_hi); 787 src_ptr += 16; 788 // save 16 bytes convolve result 789 _mm_store_si128((__m128i *)&dst[i], temp_hi); 790 } 791 } 792 793 static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, 794 uint8_t *dst, ptrdiff_t dst_stride, 795 const InterpKernel *y_filters, int y0_q4, 796 int y_step_q4, int w, int h) { 797 int y; 798 int y_q4 = y0_q4; 799 800 src -= src_stride * (SUBPEL_TAPS / 2 - 1); 801 for (y = 0; y < h; ++y) { 802 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 803 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; 804 if (y_q4 & SUBPEL_MASK) { 805 filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, 806 w); 807 } else { 808 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); 809 } 810 y_q4 += y_step_q4; 811 } 812 } 813 814 static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, 815 uint8_t *dst, ptrdiff_t dst_stride, 816 const InterpKernel *const x_filters, int x0_q4, 817 int x_step_q4, const InterpKernel *const y_filters, 818 int y0_q4, int y_step_q4, int w, int h) { 819 // Note: Fixed size intermediate buffer, temp, places limits on parameters. 820 // 2d filtering proceeds in 2 steps: 821 // (1) Interpolate horizontally into an intermediate buffer, temp. 822 // (2) Interpolate temp vertically to derive the sub-pixel result. 823 // Deriving the maximum number of rows in the temp buffer (135): 824 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). 825 // --Largest block size is 64x64 pixels. 826 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the 827 // original frame (in 1/16th pixel units). 828 // --Must round-up because block may be located at sub-pixel position. 829 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. 830 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 831 // --Require an additional 8 rows for the horiz_w8 transpose tail. 832 DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); 833 const int intermediate_height = 834 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; 835 836 assert(w <= 64); 837 assert(h <= 64); 838 assert(y_step_q4 <= 32); 839 assert(x_step_q4 <= 32); 840 841 if (w >= 8) { 842 scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), 843 src_stride, temp, 64, x_filters, x0_q4, x_step_q4, 844 w, intermediate_height); 845 } else { 846 scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), 847 src_stride, temp, 64, x_filters, x0_q4, x_step_q4, 848 w, intermediate_height); 849 } 850 851 if (w >= 16) { 852 scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 853 dst_stride, y_filters, y0_q4, y_step_q4, w, h); 854 } else if (w == 8) { 855 scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 856 dst_stride, y_filters, y0_q4, y_step_q4, w, h); 857 } else { 858 scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, 859 dst_stride, y_filters, y0_q4, y_step_q4, w, h); 860 } 861 } 862 863 void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 864 ptrdiff_t dst_stride, const int16_t *filter_x, 865 int x_step_q4, const int16_t *filter_y, int y_step_q4, 866 int w, int h) { 867 const InterpKernel *const filters_x = get_filter_base(filter_x); 868 const int x0_q4 = get_filter_offset(filter_x, filters_x); 869 870 const InterpKernel *const filters_y = get_filter_base(filter_y); 871 const int y0_q4 = get_filter_offset(filter_y, filters_y); 872 873 scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, 874 x_step_q4, filters_y, y0_q4, y_step_q4, w, h); 875 } 876 877 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, 878 // uint8_t *dst, ptrdiff_t dst_stride, 879 // const int16_t *filter_x, int x_step_q4, 880 // const int16_t *filter_y, int y_step_q4, 881 // int w, int h); 882 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, 883 // uint8_t *dst, ptrdiff_t dst_stride, 884 // const int16_t *filter_x, int x_step_q4, 885 // const int16_t *filter_y, int y_step_q4, 886 // int w, int h); 887 FUN_CONV_2D(, ssse3); 888 FUN_CONV_2D(avg_, ssse3); 889