/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"

///////////////////////////////////////////////////////////////////////////
// the mmx functions that do the bilinear filtering and var calculation  //
// in one pass                                                           //
///////////////////////////////////////////////////////////////////////////
DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
  { 128, 128, 128, 128,   0,   0,   0,   0 },
  { 120, 120, 120, 120,   8,   8,   8,   8 },
  { 112, 112, 112, 112,  16,  16,  16,  16 },
  { 104, 104, 104, 104,  24,  24,  24,  24 },
  {  96,  96,  96,  96,  32,  32,  32,  32 },
  {  88,  88,  88,  88,  40,  40,  40,  40 },
  {  80,  80,  80,  80,  48,  48,  48,  48 },
  {  72,  72,  72,  72,  56,  56,  56,  56 },
  {  64,  64,  64,  64,  64,  64,  64,  64 },
  {  56,  56,  56,  56,  72,  72,  72,  72 },
  {  48,  48,  48,  48,  80,  80,  80,  80 },
  {  40,  40,  40,  40,  88,  88,  88,  88 },
  {  32,  32,  32,  32,  96,  96,  96,  96 },
  {  24,  24,  24,  24, 104, 104, 104, 104 },
  {  16,  16,  16,  16, 112, 112, 112, 112 },
  {   8,   8,   8,   8, 120, 120, 120, 120 }
};

typedef void filter8_1dfunction (
  const unsigned char *src_ptr,
  const unsigned int src_pitch,
  unsigned char *output_ptr,
  unsigned int out_pitch,
  unsigned int output_height,
  const short *filter
);

#if HAVE_SSSE3
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
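
/* Each wrapper below tiles the block into 16-, 8- and 4-pixel wide columns
 * handled by the assembly kernels; any remaining width, or an unsupported
 * step/filter combination, falls through to the plain C implementation.
 */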
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  /* Ensure the filter can be compressed to int16_t. */
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_h8_ssse3(src, src_stride,
                                    dst, dst_stride,
                                    h, filter_x);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_h8_ssse3(src, src_stride,
                                   dst, dst_stride,
                                   h, filter_x);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_h8_ssse3(src, src_stride,
                                   dst, dst_stride,
                                   h, filter_x);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  if (w) {
    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4,
                          w, h);
  }
}

void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
                                    dst, dst_stride,
                                    h, filter_y);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
                                   dst, dst_stride,
                                   h, filter_y);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
                                   dst, dst_stride,
                                   h, filter_y);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  if (w) {
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4,
                         w, h);
  }
}

void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
                                        dst, dst_stride,
                                        h, filter_x);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
                                       dst, dst_stride,
                                       h, filter_x);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
                                       dst, dst_stride,
                                       h, filter_x);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  if (w) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
  }
}

void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
                                        dst, dst_stride,
                                        h, filter_y);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
                                       dst, dst_stride,
                                       h, filter_y);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
                                       dst, dst_stride,
                                       h, filter_y);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  if (w) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
  }
}
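
/* Two-pass 8-tap filtering: the horizontal pass writes a w x (h + 7) block
 * into the 64 x 71 intermediate buffer (stride 64), starting three rows above
 * the destination block so the vertical 8-tap kernel has the rows it needs;
 * the vertical pass then reads from row 3 of that buffer.
 */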
void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);

  assert(w <= 64);
  assert(h <= 64);
  if (x_step_q4 == 16 && y_step_q4 == 16) {
    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h + 7);
    vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  } else {
    vp9_convolve8_c(src, src_stride, dst, dst_stride,
                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}

void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);

  assert(w <= 64);
  assert(h <= 64);
  if (x_step_q4 == 16 && y_step_q4 == 16) {
    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h + 7);
    vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
                                 filter_x, x_step_q4, filter_y, y_step_q4,
                                 w, h);
  } else {
    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
#endif
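
/* SSE2 counterparts of the SSSE3 wrappers above; the tiling and fallback
 * logic is identical, only the underlying kernels differ.
 */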
#if HAVE_SSE2
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_sse2;
filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;

void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  /* Ensure the filter can be compressed to int16_t. */
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_h8_sse2(src, src_stride,
                                   dst, dst_stride,
                                   h, filter_x);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_h8_sse2(src, src_stride,
                                  dst, dst_stride,
                                  h, filter_x);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_h8_sse2(src, src_stride,
                                  dst, dst_stride,
                                  h, filter_x);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  if (w) {
    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4,
                          w, h);
  }
}

void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
                                   dst, dst_stride,
                                   h, filter_y);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride,
                                  h, filter_y);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride,
                                  h, filter_y);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  if (w) {
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4,
                         w, h);
  }
}

void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
                                       dst, dst_stride,
                                       h, filter_x);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
                                      dst, dst_stride,
                                      h, filter_x);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
                                      dst, dst_stride,
                                      h, filter_x);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  if (w) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
  }
}

void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h) {
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    while (w >= 16) {
      vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
                                       dst, dst_stride,
                                       h, filter_y);
      src += 16;
      dst += 16;
      w -= 16;
    }
    while (w >= 8) {
      vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride,
                                      h, filter_y);
      src += 8;
      dst += 8;
      w -= 8;
    }
    while (w >= 4) {
      vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride,
                                      h, filter_y);
      src += 4;
      dst += 4;
      w -= 4;
    }
  }
  if (w) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
  }
}
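
/* Two-pass SSE2 path; see the note above vp9_convolve8_ssse3 for the
 * intermediate-buffer layout.
 */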
src_stride, 427 uint8_t *dst, ptrdiff_t dst_stride, 428 const int16_t *filter_x, int x_step_q4, 429 const int16_t *filter_y, int y_step_q4, 430 int w, int h) { 431 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); 432 433 assert(w <= 64); 434 assert(h <= 64); 435 if (x_step_q4 == 16 && y_step_q4 == 16) { 436 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, 437 filter_x, x_step_q4, filter_y, y_step_q4, 438 w, h + 7); 439 vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, 440 filter_x, x_step_q4, filter_y, y_step_q4, w, h); 441 } else { 442 vp9_convolve8_c(src, src_stride, dst, dst_stride, 443 filter_x, x_step_q4, filter_y, y_step_q4, w, h); 444 } 445 } 446 447 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, 448 uint8_t *dst, ptrdiff_t dst_stride, 449 const int16_t *filter_x, int x_step_q4, 450 const int16_t *filter_y, int y_step_q4, 451 int w, int h) { 452 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); 453 454 assert(w <= 64); 455 assert(h <= 64); 456 if (x_step_q4 == 16 && y_step_q4 == 16) { 457 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, 458 filter_x, x_step_q4, filter_y, y_step_q4, 459 w, h + 7); 460 vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, 461 filter_x, x_step_q4, filter_y, y_step_q4, 462 w, h); 463 } else { 464 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, 465 filter_x, x_step_q4, filter_y, y_step_q4, w, h); 466 } 467 } 468 #endif 469