1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "vpx_config.h" 12 #include "vp8_rtcd.h" 13 #include "vpx_ports/mem.h" 14 15 extern const short vp8_six_tap_x86[8][6 * 8]; 16 17 extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr, 18 unsigned short *output_ptr, 19 unsigned int src_pixels_per_line, 20 unsigned int pixel_step, 21 unsigned int output_height, 22 unsigned int output_width, 23 const short *vp8_filter); 24 extern void vp8_filter_block1dc_v6_mmx( 25 unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch, 26 unsigned int pixels_per_line, unsigned int pixel_step, 27 unsigned int output_height, unsigned int output_width, 28 const short *vp8_filter); 29 extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr, 30 unsigned short *output_ptr, 31 unsigned int src_pixels_per_line, 32 unsigned int pixel_step, 33 unsigned int output_height, 34 unsigned int output_width, 35 const short *vp8_filter); 36 extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr, 37 unsigned short *output_ptr, 38 unsigned int src_pixels_per_line, 39 unsigned int pixel_step, 40 unsigned int output_height, 41 unsigned int output_width, 42 const short *vp8_filter); 43 extern void vp8_filter_block1d8_v6_sse2( 44 unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich, 45 unsigned int pixels_per_line, unsigned int pixel_step, 46 unsigned int output_height, unsigned int output_width, 47 const short *vp8_filter); 48 extern void vp8_filter_block1d16_v6_sse2( 49 unsigned short *src_ptr, unsigned char *output_ptr, int dst_ptich, 50 unsigned int pixels_per_line, unsigned int pixel_step, 51 unsigned int output_height, unsigned int output_width, 52 const short *vp8_filter); 53 extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr, 54 unsigned short *output_ptr, 55 unsigned int src_pixels_per_line, 56 unsigned int output_height, 57 unsigned int output_width); 58 extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr, 59 unsigned int src_pixels_per_line, 60 unsigned char *output_ptr, 61 int dst_ptich, 62 unsigned int output_height, 63 const short *vp8_filter); 64 extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr, 65 unsigned int src_pixels_per_line, 66 unsigned char *output_ptr, 67 int dst_ptich, 68 unsigned int output_height, 69 const short *vp8_filter); 70 extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr, 71 unsigned int src_pixels_per_line, 72 unsigned char *output_ptr, 73 int dst_ptich, 74 unsigned int output_height, 75 const short *vp8_filter); 76 77 #if HAVE_MMX 78 void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, 79 int xoffset, int yoffset, unsigned char *dst_ptr, 80 int dst_pitch) { 81 DECLARE_ALIGNED(16, unsigned short, 82 FData2[16 * 16]); /* Temp data bufffer used in filtering */ 83 const short *HFilter, *VFilter; 84 HFilter = vp8_six_tap_x86[xoffset]; 85 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, 86 src_pixels_per_line, 1, 9, 8, HFilter); 87 VFilter = vp8_six_tap_x86[yoffset]; 88 vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4, 89 VFilter); 90 } 91 #endif 92 93 #if HAVE_SSE2 94 void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, 95 int src_pixels_per_line, int xoffset, 96 int yoffset, unsigned char *dst_ptr, 97 int dst_pitch) { 98 DECLARE_ALIGNED(16, unsigned short, 99 FData2[24 * 24]); /* Temp data bufffer used in filtering */ 100 101 const short *HFilter, *VFilter; 102 103 if (xoffset) { 104 if (yoffset) { 105 HFilter = vp8_six_tap_x86[xoffset]; 106 vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, 107 src_pixels_per_line, 1, 21, 32, HFilter); 108 VFilter = vp8_six_tap_x86[yoffset]; 109 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 110 dst_pitch, VFilter); 111 } else { 112 /* First-pass only */ 113 HFilter = vp8_six_tap_x86[xoffset]; 114 vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, 115 dst_pitch, 16, HFilter); 116 } 117 } else { 118 /* Second-pass only */ 119 VFilter = vp8_six_tap_x86[yoffset]; 120 vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, 121 src_pixels_per_line, 21, 32); 122 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 123 dst_pitch, VFilter); 124 } 125 } 126 127 void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, 128 int xoffset, int yoffset, 129 unsigned char *dst_ptr, int dst_pitch) { 130 DECLARE_ALIGNED(16, unsigned short, 131 FData2[256]); /* Temp data bufffer used in filtering */ 132 const short *HFilter, *VFilter; 133 134 if (xoffset) { 135 if (yoffset) { 136 HFilter = vp8_six_tap_x86[xoffset]; 137 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, 138 src_pixels_per_line, 1, 13, 16, HFilter); 139 VFilter = vp8_six_tap_x86[yoffset]; 140 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, 141 dst_pitch, VFilter); 142 } else { 143 /* First-pass only */ 144 HFilter = vp8_six_tap_x86[xoffset]; 145 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, 146 dst_pitch, 8, HFilter); 147 } 148 } else { 149 /* Second-pass only */ 150 VFilter = vp8_six_tap_x86[yoffset]; 151 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), 152 src_pixels_per_line, dst_ptr, dst_pitch, 8, 153 VFilter); 154 } 155 } 156 157 void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, 158 int xoffset, int yoffset, 159 unsigned char *dst_ptr, int dst_pitch) { 160 DECLARE_ALIGNED(16, unsigned short, 161 FData2[256]); /* Temp data bufffer used in filtering */ 162 const short *HFilter, *VFilter; 163 164 if (xoffset) { 165 if (yoffset) { 166 HFilter = vp8_six_tap_x86[xoffset]; 167 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, 168 src_pixels_per_line, 1, 9, 16, HFilter); 169 VFilter = vp8_six_tap_x86[yoffset]; 170 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, 171 dst_pitch, VFilter); 172 } else { 173 /* First-pass only */ 174 HFilter = vp8_six_tap_x86[xoffset]; 175 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, 176 dst_pitch, 4, HFilter); 177 } 178 } else { 179 /* Second-pass only */ 180 VFilter = vp8_six_tap_x86[yoffset]; 181 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), 182 src_pixels_per_line, dst_ptr, dst_pitch, 4, 183 VFilter); 184 } 185 } 186 187 #endif 188 189 #if HAVE_SSSE3 190 191 extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr, 192 unsigned int src_pixels_per_line, 193 unsigned char *output_ptr, 194 unsigned int output_pitch, 195 unsigned int output_height, 196 unsigned int vp8_filter_index); 197 198 extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr, 199 unsigned int src_pixels_per_line, 200 unsigned char *output_ptr, 201 unsigned int output_pitch, 202 unsigned int output_height, 203 unsigned int vp8_filter_index); 204 205 extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr, 206 unsigned int src_pitch, 207 unsigned char *output_ptr, 208 unsigned int out_pitch, 209 unsigned int output_height, 210 unsigned int vp8_filter_index); 211 212 extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr, 213 unsigned int src_pitch, 214 unsigned char *output_ptr, 215 unsigned int out_pitch, 216 unsigned int output_height, 217 unsigned int vp8_filter_index); 218 219 extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr, 220 unsigned int src_pixels_per_line, 221 unsigned char *output_ptr, 222 unsigned int output_pitch, 223 unsigned int output_height, 224 unsigned int vp8_filter_index); 225 226 extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr, 227 unsigned int src_pitch, 228 unsigned char *output_ptr, 229 unsigned int out_pitch, 230 unsigned int output_height, 231 unsigned int vp8_filter_index); 232 233 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, 234 int src_pixels_per_line, int xoffset, 235 int yoffset, unsigned char *dst_ptr, 236 int dst_pitch) { 237 DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); 238 239 if (xoffset) { 240 if (yoffset) { 241 vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 242 src_pixels_per_line, FData2, 16, 21, 243 xoffset); 244 vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16, 245 yoffset); 246 } else { 247 /* First-pass only */ 248 vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, 249 dst_pitch, 16, xoffset); 250 } 251 } else { 252 if (yoffset) { 253 /* Second-pass only */ 254 vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 255 src_pixels_per_line, dst_ptr, dst_pitch, 16, 256 yoffset); 257 } else { 258 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 259 * yoffset==0) case correctly. Add copy function here to guarantee 260 * six-tap function handles all possible offsets. */ 261 vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 262 } 263 } 264 } 265 266 void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, 267 int src_pixels_per_line, int xoffset, 268 int yoffset, unsigned char *dst_ptr, 269 int dst_pitch) { 270 DECLARE_ALIGNED(16, unsigned char, FData2[256]); 271 272 if (xoffset) { 273 if (yoffset) { 274 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 275 src_pixels_per_line, FData2, 8, 13, xoffset); 276 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset); 277 } else { 278 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, 279 dst_pitch, 8, xoffset); 280 } 281 } else { 282 if (yoffset) { 283 /* Second-pass only */ 284 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 285 src_pixels_per_line, dst_ptr, dst_pitch, 8, 286 yoffset); 287 } else { 288 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 289 * yoffset==0) case correctly. Add copy function here to guarantee 290 * six-tap function handles all possible offsets. */ 291 vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 292 } 293 } 294 } 295 296 void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, 297 int src_pixels_per_line, int xoffset, 298 int yoffset, unsigned char *dst_ptr, 299 int dst_pitch) { 300 DECLARE_ALIGNED(16, unsigned char, FData2[256]); 301 302 if (xoffset) { 303 if (yoffset) { 304 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 305 src_pixels_per_line, FData2, 8, 9, xoffset); 306 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset); 307 } else { 308 /* First-pass only */ 309 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, 310 dst_pitch, 4, xoffset); 311 } 312 } else { 313 if (yoffset) { 314 /* Second-pass only */ 315 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 316 src_pixels_per_line, dst_ptr, dst_pitch, 4, 317 yoffset); 318 } else { 319 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 320 * yoffset==0) case correctly. Add copy function here to guarantee 321 * six-tap function handles all possible offsets. */ 322 vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 323 } 324 } 325 } 326 327 void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, 328 int src_pixels_per_line, int xoffset, 329 int yoffset, unsigned char *dst_ptr, 330 int dst_pitch) { 331 DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]); 332 333 if (xoffset) { 334 if (yoffset) { 335 vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 336 src_pixels_per_line, FData2, 4, 9, xoffset); 337 vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset); 338 } else { 339 vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, 340 dst_pitch, 4, xoffset); 341 } 342 } else { 343 if (yoffset) { 344 vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 345 src_pixels_per_line, dst_ptr, dst_pitch, 4, 346 yoffset); 347 } else { 348 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 349 * yoffset==0) case correctly. Add copy function here to guarantee 350 * six-tap function handles all possible offsets. */ 351 int r; 352 353 for (r = 0; r < 4; ++r) { 354 dst_ptr[0] = src_ptr[0]; 355 dst_ptr[1] = src_ptr[1]; 356 dst_ptr[2] = src_ptr[2]; 357 dst_ptr[3] = src_ptr[3]; 358 dst_ptr += dst_pitch; 359 src_ptr += src_pixels_per_line; 360 } 361 } 362 } 363 } 364 365 #endif 366