1 /* 2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "third_party/libyuv/include/libyuv/scale.h" 12 13 #include <assert.h> 14 #include <string.h> 15 16 #include "third_party/libyuv/include/libyuv/cpu_id.h" 17 #include "third_party/libyuv/source/row.h" 18 19 #ifdef __cplusplus 20 namespace libyuv { 21 extern "C" { 22 #endif 23 24 /* 25 * Note: Defining YUV_DISABLE_ASM allows to use c version. 26 */ 27 //#define YUV_DISABLE_ASM 28 29 #if defined(_MSC_VER) 30 #define ALIGN16(var) __declspec(align(16)) var 31 #else 32 #define ALIGN16(var) var __attribute__((aligned(16))) 33 #endif 34 35 // Note: A Neon reference manual 36 // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html 37 // Note: Some SSE2 reference manuals 38 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf 39 40 // Set the following flag to true to revert to only 41 // using the reference implementation ScalePlaneBox(), and 42 // NOT the optimized versions. Useful for debugging and 43 // when comparing the quality of the resulting YUV planes 44 // as produced by the optimized and non-optimized versions. 45 46 static int use_reference_impl_ = 0; 47 48 void SetUseReferenceImpl(int use) { 49 use_reference_impl_ = use; 50 } 51 52 // ScaleRowDown2Int also used by planar functions 53 54 /** 55 * NEON downscalers with interpolation. 56 * 57 * Provided by Fritz Koenig 58 * 59 */ 60 61 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) 62 #define HAS_SCALEROWDOWN2_NEON 63 void ScaleRowDown2_NEON(const uint8* src_ptr, int src_stride, 64 uint8* dst, int dst_width) { 65 asm volatile ( 66 "1: \n" 67 "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 68 "vst1.u8 {q0}, [%1]! \n" // store even pixels 69 "subs %2, %2, #16 \n" // 16 processed per loop 70 "bhi 1b \n" 71 : "+r"(src_ptr), // %0 72 "+r"(dst), // %1 73 "+r"(dst_width) // %2 74 : 75 : "q0", "q1" // Clobber List 76 ); 77 } 78 79 void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, 80 uint8* dst, int dst_width) { 81 asm volatile ( 82 "add %1, %0 \n" // change the stride to row 2 pointer 83 "1: \n" 84 "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment 85 "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment 86 "vpaddl.u8 q0, q0 \n" // row 1 add adjacent 87 "vpaddl.u8 q1, q1 \n" 88 "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2 89 "vpadal.u8 q1, q3 \n" 90 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack 91 "vrshrn.u16 d1, q1, #2 \n" 92 "vst1.u8 {q0}, [%2]! \n" 93 "subs %3, %3, #16 \n" // 16 processed per loop 94 "bhi 1b \n" 95 : "+r"(src_ptr), // %0 96 "+r"(src_stride), // %1 97 "+r"(dst), // %2 98 "+r"(dst_width) // %3 99 : 100 : "q0", "q1", "q2", "q3" // Clobber List 101 ); 102 } 103 104 #define HAS_SCALEROWDOWN4_NEON 105 static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride, 106 uint8* dst_ptr, int dst_width) { 107 asm volatile ( 108 "1: \n" 109 "vld2.u8 {d0, d1}, [%0]! \n" 110 "vtrn.u8 d1, d0 \n" 111 "vshrn.u16 d0, q0, #8 \n" 112 "vst1.u32 {d0[1]}, [%1]! \n" 113 114 "subs %2, #4 \n" 115 "bhi 1b \n" 116 : "+r"(src_ptr), // %0 117 "+r"(dst_ptr), // %1 118 "+r"(dst_width) // %2 119 : 120 : "q0", "q1", "memory", "cc" 121 ); 122 } 123 124 static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, 125 uint8* dst_ptr, int dst_width) { 126 asm volatile ( 127 "add r4, %0, %3 \n" 128 "add r5, r4, %3 \n" 129 "add %3, r5, %3 \n" 130 "1: \n" 131 "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data 132 "vld1.u8 {q1}, [r4]! \n" 133 "vld1.u8 {q2}, [r5]! \n" 134 "vld1.u8 {q3}, [%3]! \n" 135 136 "vpaddl.u8 q0, q0 \n" 137 "vpadal.u8 q0, q1 \n" 138 "vpadal.u8 q0, q2 \n" 139 "vpadal.u8 q0, q3 \n" 140 141 "vpaddl.u16 q0, q0 \n" 142 143 "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding 144 145 "vmovn.u16 d0, q0 \n" 146 "vst1.u32 {d0[0]}, [%1]! \n" 147 148 "subs %2, #4 \n" 149 "bhi 1b \n" 150 151 : "+r"(src_ptr), // %0 152 "+r"(dst_ptr), // %1 153 "+r"(dst_width) // %2 154 : "r"(src_stride) // %3 155 : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" 156 ); 157 } 158 159 #define HAS_SCALEROWDOWN34_NEON 160 // Down scale from 4 to 3 pixels. Use the neon multilane read/write 161 // to load up the every 4th pixel into a 4 different registers. 162 // Point samples 32 pixels to 24 pixels. 163 static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride, 164 uint8* dst_ptr, int dst_width) { 165 asm volatile ( 166 "1: \n" 167 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 168 "vmov d2, d3 \n" // order needs to be d0, d1, d2 169 "vst3.u8 {d0, d1, d2}, [%1]! \n" 170 "subs %2, #24 \n" 171 "bhi 1b \n" 172 : "+r"(src_ptr), // %0 173 "+r"(dst_ptr), // %1 174 "+r"(dst_width) // %2 175 : 176 : "d0", "d1", "d2", "d3", "memory", "cc" 177 ); 178 } 179 180 static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, 181 uint8* dst_ptr, int dst_width) { 182 asm volatile ( 183 "vmov.u8 d24, #3 \n" 184 "add %3, %0 \n" 185 "1: \n" 186 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 187 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 188 189 // filter src line 0 with src line 1 190 // expand chars to shorts to allow for room 191 // when adding lines together 192 "vmovl.u8 q8, d4 \n" 193 "vmovl.u8 q9, d5 \n" 194 "vmovl.u8 q10, d6 \n" 195 "vmovl.u8 q11, d7 \n" 196 197 // 3 * line_0 + line_1 198 "vmlal.u8 q8, d0, d24 \n" 199 "vmlal.u8 q9, d1, d24 \n" 200 "vmlal.u8 q10, d2, d24 \n" 201 "vmlal.u8 q11, d3, d24 \n" 202 203 // (3 * line_0 + line_1) >> 2 204 "vqrshrn.u16 d0, q8, #2 \n" 205 "vqrshrn.u16 d1, q9, #2 \n" 206 "vqrshrn.u16 d2, q10, #2 \n" 207 "vqrshrn.u16 d3, q11, #2 \n" 208 209 // a0 = (src[0] * 3 + s[1] * 1) >> 2 210 "vmovl.u8 q8, d1 \n" 211 "vmlal.u8 q8, d0, d24 \n" 212 "vqrshrn.u16 d0, q8, #2 \n" 213 214 // a1 = (src[1] * 1 + s[2] * 1) >> 1 215 "vrhadd.u8 d1, d1, d2 \n" 216 217 // a2 = (src[2] * 1 + s[3] * 3) >> 2 218 "vmovl.u8 q8, d2 \n" 219 "vmlal.u8 q8, d3, d24 \n" 220 "vqrshrn.u16 d2, q8, #2 \n" 221 222 "vst3.u8 {d0, d1, d2}, [%1]! \n" 223 224 "subs %2, #24 \n" 225 "bhi 1b \n" 226 : "+r"(src_ptr), // %0 227 "+r"(dst_ptr), // %1 228 "+r"(dst_width), // %2 229 "+r"(src_stride) // %3 230 : 231 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" 232 ); 233 } 234 235 static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, 236 uint8* dst_ptr, int dst_width) { 237 asm volatile ( 238 "vmov.u8 d24, #3 \n" 239 "add %3, %0 \n" 240 "1: \n" 241 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 242 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 243 244 // average src line 0 with src line 1 245 "vrhadd.u8 q0, q0, q2 \n" 246 "vrhadd.u8 q1, q1, q3 \n" 247 248 // a0 = (src[0] * 3 + s[1] * 1) >> 2 249 "vmovl.u8 q3, d1 \n" 250 "vmlal.u8 q3, d0, d24 \n" 251 "vqrshrn.u16 d0, q3, #2 \n" 252 253 // a1 = (src[1] * 1 + s[2] * 1) >> 1 254 "vrhadd.u8 d1, d1, d2 \n" 255 256 // a2 = (src[2] * 1 + s[3] * 3) >> 2 257 "vmovl.u8 q3, d2 \n" 258 "vmlal.u8 q3, d3, d24 \n" 259 "vqrshrn.u16 d2, q3, #2 \n" 260 261 "vst3.u8 {d0, d1, d2}, [%1]! \n" 262 263 "subs %2, #24 \n" 264 "bhi 1b \n" 265 : "+r"(src_ptr), // %0 266 "+r"(dst_ptr), // %1 267 "+r"(dst_width), // %2 268 "+r"(src_stride) // %3 269 : 270 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" 271 ); 272 } 273 274 #define HAS_SCALEROWDOWN38_NEON 275 const uint8 shuf38[16] __attribute__ ((aligned(16))) = 276 { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; 277 const uint8 shuf38_2[16] __attribute__ ((aligned(16))) = 278 { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; 279 const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) = 280 { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 281 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; 282 const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) = 283 { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 284 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; 285 286 // 32 -> 12 287 static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride, 288 uint8* dst_ptr, int dst_width) { 289 asm volatile ( 290 "vld1.u8 {q3}, [%3] \n" 291 "1: \n" 292 "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" 293 "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" 294 "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" 295 "vst1.u8 {d4}, [%1]! \n" 296 "vst1.u32 {d5[0]}, [%1]! \n" 297 "subs %2, #12 \n" 298 "bhi 1b \n" 299 : "+r"(src_ptr), // %0 300 "+r"(dst_ptr), // %1 301 "+r"(dst_width) // %2 302 : "r"(shuf38) // %3 303 : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" 304 ); 305 } 306 307 // 32x3 -> 12x1 308 static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, 309 uint8* dst_ptr, int dst_width) { 310 asm volatile ( 311 "vld1.u16 {q13}, [%4] \n" 312 "vld1.u8 {q14}, [%5] \n" 313 "vld1.u8 {q15}, [%6] \n" 314 "add r4, %0, %3, lsl #1 \n" 315 "add %3, %0 \n" 316 "1: \n" 317 318 // d0 = 00 40 01 41 02 42 03 43 319 // d1 = 10 50 11 51 12 52 13 53 320 // d2 = 20 60 21 61 22 62 23 63 321 // d3 = 30 70 31 71 32 72 33 73 322 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" 323 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" 324 "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" 325 326 // Shuffle the input data around to get align the data 327 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 328 // d0 = 00 10 01 11 02 12 03 13 329 // d1 = 40 50 41 51 42 52 43 53 330 "vtrn.u8 d0, d1 \n" 331 "vtrn.u8 d4, d5 \n" 332 "vtrn.u8 d16, d17 \n" 333 334 // d2 = 20 30 21 31 22 32 23 33 335 // d3 = 60 70 61 71 62 72 63 73 336 "vtrn.u8 d2, d3 \n" 337 "vtrn.u8 d6, d7 \n" 338 "vtrn.u8 d18, d19 \n" 339 340 // d0 = 00+10 01+11 02+12 03+13 341 // d2 = 40+50 41+51 42+52 43+53 342 "vpaddl.u8 q0, q0 \n" 343 "vpaddl.u8 q2, q2 \n" 344 "vpaddl.u8 q8, q8 \n" 345 346 // d3 = 60+70 61+71 62+72 63+73 347 "vpaddl.u8 d3, d3 \n" 348 "vpaddl.u8 d7, d7 \n" 349 "vpaddl.u8 d19, d19 \n" 350 351 // combine source lines 352 "vadd.u16 q0, q2 \n" 353 "vadd.u16 q0, q8 \n" 354 "vadd.u16 d4, d3, d7 \n" 355 "vadd.u16 d4, d19 \n" 356 357 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] 358 // + s[6 + st * 1] + s[7 + st * 1] 359 // + s[6 + st * 2] + s[7 + st * 2]) / 6 360 "vqrdmulh.s16 q2, q13 \n" 361 "vmovn.u16 d4, q2 \n" 362 363 // Shuffle 2,3 reg around so that 2 can be added to the 364 // 0,1 reg and 3 can be added to the 4,5 reg. This 365 // requires expanding from u8 to u16 as the 0,1 and 4,5 366 // registers are already expanded. Then do transposes 367 // to get aligned. 368 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 369 "vmovl.u8 q1, d2 \n" 370 "vmovl.u8 q3, d6 \n" 371 "vmovl.u8 q9, d18 \n" 372 373 // combine source lines 374 "vadd.u16 q1, q3 \n" 375 "vadd.u16 q1, q9 \n" 376 377 // d4 = xx 20 xx 30 xx 22 xx 32 378 // d5 = xx 21 xx 31 xx 23 xx 33 379 "vtrn.u32 d2, d3 \n" 380 381 // d4 = xx 20 xx 21 xx 22 xx 23 382 // d5 = xx 30 xx 31 xx 32 xx 33 383 "vtrn.u16 d2, d3 \n" 384 385 // 0+1+2, 3+4+5 386 "vadd.u16 q0, q1 \n" 387 388 // Need to divide, but can't downshift as the the value 389 // isn't a power of 2. So multiply by 65536 / n 390 // and take the upper 16 bits. 391 "vqrdmulh.s16 q0, q15 \n" 392 393 // Align for table lookup, vtbl requires registers to 394 // be adjacent 395 "vmov.u8 d2, d4 \n" 396 397 "vtbl.u8 d3, {d0, d1, d2}, d28 \n" 398 "vtbl.u8 d4, {d0, d1, d2}, d29 \n" 399 400 "vst1.u8 {d3}, [%1]! \n" 401 "vst1.u32 {d4[0]}, [%1]! \n" 402 "subs %2, #12 \n" 403 "bhi 1b \n" 404 : "+r"(src_ptr), // %0 405 "+r"(dst_ptr), // %1 406 "+r"(dst_width), // %2 407 "+r"(src_stride) // %3 408 : "r"(mult38_div6), // %4 409 "r"(shuf38_2), // %5 410 "r"(mult38_div9) // %6 411 : "r4", "q0", "q1", "q2", "q3", "q8", "q9", 412 "q13", "q14", "q15", "memory", "cc" 413 ); 414 } 415 416 // 32x2 -> 12x1 417 static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, 418 uint8* dst_ptr, int dst_width) { 419 asm volatile ( 420 "vld1.u16 {q13}, [%4] \n" 421 "vld1.u8 {q14}, [%5] \n" 422 "add %3, %0 \n" 423 "1: \n" 424 425 // d0 = 00 40 01 41 02 42 03 43 426 // d1 = 10 50 11 51 12 52 13 53 427 // d2 = 20 60 21 61 22 62 23 63 428 // d3 = 30 70 31 71 32 72 33 73 429 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" 430 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" 431 432 // Shuffle the input data around to get align the data 433 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 434 // d0 = 00 10 01 11 02 12 03 13 435 // d1 = 40 50 41 51 42 52 43 53 436 "vtrn.u8 d0, d1 \n" 437 "vtrn.u8 d4, d5 \n" 438 439 // d2 = 20 30 21 31 22 32 23 33 440 // d3 = 60 70 61 71 62 72 63 73 441 "vtrn.u8 d2, d3 \n" 442 "vtrn.u8 d6, d7 \n" 443 444 // d0 = 00+10 01+11 02+12 03+13 445 // d2 = 40+50 41+51 42+52 43+53 446 "vpaddl.u8 q0, q0 \n" 447 "vpaddl.u8 q2, q2 \n" 448 449 // d3 = 60+70 61+71 62+72 63+73 450 "vpaddl.u8 d3, d3 \n" 451 "vpaddl.u8 d7, d7 \n" 452 453 // combine source lines 454 "vadd.u16 q0, q2 \n" 455 "vadd.u16 d4, d3, d7 \n" 456 457 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 458 "vqrshrn.u16 d4, q2, #2 \n" 459 460 // Shuffle 2,3 reg around so that 2 can be added to the 461 // 0,1 reg and 3 can be added to the 4,5 reg. This 462 // requires expanding from u8 to u16 as the 0,1 and 4,5 463 // registers are already expanded. Then do transposes 464 // to get aligned. 465 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 466 "vmovl.u8 q1, d2 \n" 467 "vmovl.u8 q3, d6 \n" 468 469 // combine source lines 470 "vadd.u16 q1, q3 \n" 471 472 // d4 = xx 20 xx 30 xx 22 xx 32 473 // d5 = xx 21 xx 31 xx 23 xx 33 474 "vtrn.u32 d2, d3 \n" 475 476 // d4 = xx 20 xx 21 xx 22 xx 23 477 // d5 = xx 30 xx 31 xx 32 xx 33 478 "vtrn.u16 d2, d3 \n" 479 480 // 0+1+2, 3+4+5 481 "vadd.u16 q0, q1 \n" 482 483 // Need to divide, but can't downshift as the the value 484 // isn't a power of 2. So multiply by 65536 / n 485 // and take the upper 16 bits. 486 "vqrdmulh.s16 q0, q13 \n" 487 488 // Align for table lookup, vtbl requires registers to 489 // be adjacent 490 "vmov.u8 d2, d4 \n" 491 492 "vtbl.u8 d3, {d0, d1, d2}, d28 \n" 493 "vtbl.u8 d4, {d0, d1, d2}, d29 \n" 494 495 "vst1.u8 {d3}, [%1]! \n" 496 "vst1.u32 {d4[0]}, [%1]! \n" 497 "subs %2, #12 \n" 498 "bhi 1b \n" 499 : "+r"(src_ptr), // %0 500 "+r"(dst_ptr), // %1 501 "+r"(dst_width), // %2 502 "+r"(src_stride) // %3 503 : "r"(mult38_div6), // %4 504 "r"(shuf38_2) // %5 505 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" 506 ); 507 } 508 509 /** 510 * SSE2 downscalers with interpolation. 511 * 512 * Provided by Frank Barchard (fbarchard (at) google.com) 513 * 514 */ 515 516 // Constants for SSE2 code 517 #elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \ 518 !defined(YUV_DISABLE_ASM) 519 #if defined(_MSC_VER) 520 #define TALIGN16(t, var) __declspec(align(16)) t _ ## var 521 #elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__) 522 #define TALIGN16(t, var) t var __attribute__((aligned(16))) 523 #else 524 #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) 525 #endif 526 527 #if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \ 528 defined(__i386__) 529 #define DECLARE_FUNCTION(name) \ 530 ".text \n" \ 531 ".globl _" #name " \n" \ 532 "_" #name ": \n" 533 #else 534 #define DECLARE_FUNCTION(name) \ 535 ".text \n" \ 536 ".global " #name " \n" \ 537 #name ": \n" 538 #endif 539 540 541 // Offsets for source bytes 0 to 9 542 //extern "C" 543 TALIGN16(const uint8, shuf0[16]) = 544 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; 545 546 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. 547 //extern "C" 548 TALIGN16(const uint8, shuf1[16]) = 549 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; 550 551 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 552 //extern "C" 553 TALIGN16(const uint8, shuf2[16]) = 554 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; 555 556 // Offsets for source bytes 0 to 10 557 //extern "C" 558 TALIGN16(const uint8, shuf01[16]) = 559 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; 560 561 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. 562 //extern "C" 563 TALIGN16(const uint8, shuf11[16]) = 564 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; 565 566 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 567 //extern "C" 568 TALIGN16(const uint8, shuf21[16]) = 569 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; 570 571 // Coefficients for source bytes 0 to 10 572 //extern "C" 573 TALIGN16(const uint8, madd01[16]) = 574 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; 575 576 // Coefficients for source bytes 10 to 21 577 //extern "C" 578 TALIGN16(const uint8, madd11[16]) = 579 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; 580 581 // Coefficients for source bytes 21 to 31 582 //extern "C" 583 TALIGN16(const uint8, madd21[16]) = 584 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; 585 586 // Coefficients for source bytes 21 to 31 587 //extern "C" 588 TALIGN16(const int16, round34[8]) = 589 { 2, 2, 2, 2, 2, 2, 2, 2 }; 590 591 //extern "C" 592 TALIGN16(const uint8, shuf38a[16]) = 593 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; 594 595 //extern "C" 596 TALIGN16(const uint8, shuf38b[16]) = 597 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; 598 599 // Arrange words 0,3,6 into 0,1,2 600 //extern "C" 601 TALIGN16(const uint8, shufac0[16]) = 602 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; 603 604 // Arrange words 0,3,6 into 3,4,5 605 //extern "C" 606 TALIGN16(const uint8, shufac3[16]) = 607 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; 608 609 // Scaling values for boxes of 3x3 and 2x3 610 //extern "C" 611 TALIGN16(const uint16, scaleac3[8]) = 612 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; 613 614 // Arrange first value for pixels 0,1,2,3,4,5 615 //extern "C" 616 TALIGN16(const uint8, shufab0[16]) = 617 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; 618 619 // Arrange second value for pixels 0,1,2,3,4,5 620 //extern "C" 621 TALIGN16(const uint8, shufab1[16]) = 622 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; 623 624 // Arrange third value for pixels 0,1,2,3,4,5 625 //extern "C" 626 TALIGN16(const uint8, shufab2[16]) = 627 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; 628 629 // Scaling values for boxes of 3x2 and 2x2 630 //extern "C" 631 TALIGN16(const uint16, scaleab2[8]) = 632 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; 633 #endif 634 635 #if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER) 636 637 #define HAS_SCALEROWDOWN2_SSE2 638 // Reads 32 pixels, throws half away and writes 16 pixels. 639 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 640 __declspec(naked) 641 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, 642 uint8* dst_ptr, int dst_width) { 643 __asm { 644 mov eax, [esp + 4] // src_ptr 645 // src_stride ignored 646 mov edx, [esp + 12] // dst_ptr 647 mov ecx, [esp + 16] // dst_width 648 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 649 psrlw xmm5, 8 650 651 wloop: 652 movdqa xmm0, [eax] 653 movdqa xmm1, [eax + 16] 654 lea eax, [eax + 32] 655 pand xmm0, xmm5 656 pand xmm1, xmm5 657 packuswb xmm0, xmm1 658 movdqa [edx], xmm0 659 lea edx, [edx + 16] 660 sub ecx, 16 661 ja wloop 662 663 ret 664 } 665 } 666 // Blends 32x2 rectangle to 16x1. 667 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 668 __declspec(naked) 669 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, 670 uint8* dst_ptr, int dst_width) { 671 __asm { 672 push esi 673 mov eax, [esp + 4 + 4] // src_ptr 674 mov esi, [esp + 4 + 8] // src_stride 675 mov edx, [esp + 4 + 12] // dst_ptr 676 mov ecx, [esp + 4 + 16] // dst_width 677 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 678 psrlw xmm5, 8 679 680 wloop: 681 movdqa xmm0, [eax] 682 movdqa xmm1, [eax + 16] 683 movdqa xmm2, [eax + esi] 684 movdqa xmm3, [eax + esi + 16] 685 lea eax, [eax + 32] 686 pavgb xmm0, xmm2 // average rows 687 pavgb xmm1, xmm3 688 689 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 690 psrlw xmm0, 8 691 movdqa xmm3, xmm1 692 psrlw xmm1, 8 693 pand xmm2, xmm5 694 pand xmm3, xmm5 695 pavgw xmm0, xmm2 696 pavgw xmm1, xmm3 697 packuswb xmm0, xmm1 698 699 movdqa [edx], xmm0 700 lea edx, [edx + 16] 701 sub ecx, 16 702 ja wloop 703 704 pop esi 705 ret 706 } 707 } 708 709 #define HAS_SCALEROWDOWN4_SSE2 710 // Point samples 32 pixels to 8 pixels. 711 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 712 __declspec(naked) 713 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, 714 uint8* dst_ptr, int dst_width) { 715 __asm { 716 pushad 717 mov esi, [esp + 32 + 4] // src_ptr 718 // src_stride ignored 719 mov edi, [esp + 32 + 12] // dst_ptr 720 mov ecx, [esp + 32 + 16] // dst_width 721 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff 722 psrld xmm5, 24 723 724 wloop: 725 movdqa xmm0, [esi] 726 movdqa xmm1, [esi + 16] 727 lea esi, [esi + 32] 728 pand xmm0, xmm5 729 pand xmm1, xmm5 730 packuswb xmm0, xmm1 731 packuswb xmm0, xmm0 732 movq qword ptr [edi], xmm0 733 lea edi, [edi + 8] 734 sub ecx, 8 735 ja wloop 736 737 popad 738 ret 739 } 740 } 741 742 // Blends 32x4 rectangle to 8x1. 743 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 744 __declspec(naked) 745 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, 746 uint8* dst_ptr, int dst_width) { 747 __asm { 748 pushad 749 mov esi, [esp + 32 + 4] // src_ptr 750 mov ebx, [esp + 32 + 8] // src_stride 751 mov edi, [esp + 32 + 12] // dst_ptr 752 mov ecx, [esp + 32 + 16] // dst_width 753 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff 754 psrlw xmm7, 8 755 lea edx, [ebx + ebx * 2] // src_stride * 3 756 757 wloop: 758 movdqa xmm0, [esi] 759 movdqa xmm1, [esi + 16] 760 movdqa xmm2, [esi + ebx] 761 movdqa xmm3, [esi + ebx + 16] 762 pavgb xmm0, xmm2 // average rows 763 pavgb xmm1, xmm3 764 movdqa xmm2, [esi + ebx * 2] 765 movdqa xmm3, [esi + ebx * 2 + 16] 766 movdqa xmm4, [esi + edx] 767 movdqa xmm5, [esi + edx + 16] 768 lea esi, [esi + 32] 769 pavgb xmm2, xmm4 770 pavgb xmm3, xmm5 771 pavgb xmm0, xmm2 772 pavgb xmm1, xmm3 773 774 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 775 psrlw xmm0, 8 776 movdqa xmm3, xmm1 777 psrlw xmm1, 8 778 pand xmm2, xmm7 779 pand xmm3, xmm7 780 pavgw xmm0, xmm2 781 pavgw xmm1, xmm3 782 packuswb xmm0, xmm1 783 784 movdqa xmm2, xmm0 // average columns (16 to 8 pixels) 785 psrlw xmm0, 8 786 pand xmm2, xmm7 787 pavgw xmm0, xmm2 788 packuswb xmm0, xmm0 789 790 movq qword ptr [edi], xmm0 791 lea edi, [edi + 8] 792 sub ecx, 8 793 ja wloop 794 795 popad 796 ret 797 } 798 } 799 800 #define HAS_SCALEROWDOWN8_SSE2 801 // Point samples 32 pixels to 4 pixels. 802 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 803 __declspec(naked) 804 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, 805 uint8* dst_ptr, int dst_width) { 806 __asm { 807 pushad 808 mov esi, [esp + 32 + 4] // src_ptr 809 // src_stride ignored 810 mov edi, [esp + 32 + 12] // dst_ptr 811 mov ecx, [esp + 32 + 16] // dst_width 812 pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes 813 psrlq xmm5, 56 814 815 wloop: 816 movdqa xmm0, [esi] 817 movdqa xmm1, [esi + 16] 818 lea esi, [esi + 32] 819 pand xmm0, xmm5 820 pand xmm1, xmm5 821 packuswb xmm0, xmm1 // 32->16 822 packuswb xmm0, xmm0 // 16->8 823 packuswb xmm0, xmm0 // 8->4 824 movd dword ptr [edi], xmm0 825 lea edi, [edi + 4] 826 sub ecx, 4 827 ja wloop 828 829 popad 830 ret 831 } 832 } 833 834 // Blends 32x8 rectangle to 4x1. 835 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 836 __declspec(naked) 837 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, 838 uint8* dst_ptr, int dst_width) { 839 __asm { 840 pushad 841 mov esi, [esp + 32 + 4] // src_ptr 842 mov ebx, [esp + 32 + 8] // src_stride 843 mov edi, [esp + 32 + 12] // dst_ptr 844 mov ecx, [esp + 32 + 16] // dst_width 845 lea edx, [ebx + ebx * 2] // src_stride * 3 846 pxor xmm7, xmm7 847 848 wloop: 849 movdqa xmm0, [esi] // average 8 rows to 1 850 movdqa xmm1, [esi + 16] 851 movdqa xmm2, [esi + ebx] 852 movdqa xmm3, [esi + ebx + 16] 853 pavgb xmm0, xmm2 854 pavgb xmm1, xmm3 855 movdqa xmm2, [esi + ebx * 2] 856 movdqa xmm3, [esi + ebx * 2 + 16] 857 movdqa xmm4, [esi + edx] 858 movdqa xmm5, [esi + edx + 16] 859 lea ebp, [esi + ebx * 4] 860 lea esi, [esi + 32] 861 pavgb xmm2, xmm4 862 pavgb xmm3, xmm5 863 pavgb xmm0, xmm2 864 pavgb xmm1, xmm3 865 866 movdqa xmm2, [ebp] 867 movdqa xmm3, [ebp + 16] 868 movdqa xmm4, [ebp + ebx] 869 movdqa xmm5, [ebp + ebx + 16] 870 pavgb xmm2, xmm4 871 pavgb xmm3, xmm5 872 movdqa xmm4, [ebp + ebx * 2] 873 movdqa xmm5, [ebp + ebx * 2 + 16] 874 movdqa xmm6, [ebp + edx] 875 pavgb xmm4, xmm6 876 movdqa xmm6, [ebp + edx + 16] 877 pavgb xmm5, xmm6 878 pavgb xmm2, xmm4 879 pavgb xmm3, xmm5 880 pavgb xmm0, xmm2 881 pavgb xmm1, xmm3 882 883 psadbw xmm0, xmm7 // average 32 pixels to 4 884 psadbw xmm1, xmm7 885 pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 886 pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx 887 por xmm0, xmm1 // -> 3201 888 psrlw xmm0, 3 889 packuswb xmm0, xmm0 890 packuswb xmm0, xmm0 891 movd dword ptr [edi], xmm0 892 893 lea edi, [edi + 4] 894 sub ecx, 4 895 ja wloop 896 897 popad 898 ret 899 } 900 } 901 902 #define HAS_SCALEROWDOWN34_SSSE3 903 // Point samples 32 pixels to 24 pixels. 904 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. 905 // Then shuffled to do the scaling. 906 907 // Note that movdqa+palign may be better than movdqu. 908 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 909 __declspec(naked) 910 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, 911 uint8* dst_ptr, int dst_width) { 912 __asm { 913 pushad 914 mov esi, [esp + 32 + 4] // src_ptr 915 // src_stride ignored 916 mov edi, [esp + 32 + 12] // dst_ptr 917 mov ecx, [esp + 32 + 16] // dst_width 918 movdqa xmm3, _shuf0 919 movdqa xmm4, _shuf1 920 movdqa xmm5, _shuf2 921 922 wloop: 923 movdqa xmm0, [esi] 924 movdqa xmm1, [esi + 16] 925 lea esi, [esi + 32] 926 movdqa xmm2, xmm1 927 palignr xmm1, xmm0, 8 928 pshufb xmm0, xmm3 929 pshufb xmm1, xmm4 930 pshufb xmm2, xmm5 931 movq qword ptr [edi], xmm0 932 movq qword ptr [edi + 8], xmm1 933 movq qword ptr [edi + 16], xmm2 934 lea edi, [edi + 24] 935 sub ecx, 24 936 ja wloop 937 938 popad 939 ret 940 } 941 } 942 943 // Blends 32x2 rectangle to 24x1 944 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. 945 // Then shuffled to do the scaling. 946 947 // Register usage: 948 // xmm0 src_row 0 949 // xmm1 src_row 1 950 // xmm2 shuf 0 951 // xmm3 shuf 1 952 // xmm4 shuf 2 953 // xmm5 madd 0 954 // xmm6 madd 1 955 // xmm7 round34 956 957 // Note that movdqa+palign may be better than movdqu. 958 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 959 __declspec(naked) 960 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, 961 uint8* dst_ptr, int dst_width) { 962 __asm { 963 pushad 964 mov esi, [esp + 32 + 4] // src_ptr 965 mov ebx, [esp + 32 + 8] // src_stride 966 mov edi, [esp + 32 + 12] // dst_ptr 967 mov ecx, [esp + 32 + 16] // dst_width 968 movdqa xmm2, _shuf01 969 movdqa xmm3, _shuf11 970 movdqa xmm4, _shuf21 971 movdqa xmm5, _madd01 972 movdqa xmm6, _madd11 973 movdqa xmm7, _round34 974 975 wloop: 976 movdqa xmm0, [esi] // pixels 0..7 977 movdqa xmm1, [esi+ebx] 978 pavgb xmm0, xmm1 979 pshufb xmm0, xmm2 980 pmaddubsw xmm0, xmm5 981 paddsw xmm0, xmm7 982 psrlw xmm0, 2 983 packuswb xmm0, xmm0 984 movq qword ptr [edi], xmm0 985 movdqu xmm0, [esi+8] // pixels 8..15 986 movdqu xmm1, [esi+ebx+8] 987 pavgb xmm0, xmm1 988 pshufb xmm0, xmm3 989 pmaddubsw xmm0, xmm6 990 paddsw xmm0, xmm7 991 psrlw xmm0, 2 992 packuswb xmm0, xmm0 993 movq qword ptr [edi+8], xmm0 994 movdqa xmm0, [esi+16] // pixels 16..23 995 movdqa xmm1, [esi+ebx+16] 996 lea esi, [esi+32] 997 pavgb xmm0, xmm1 998 pshufb xmm0, xmm4 999 movdqa xmm1, _madd21 1000 pmaddubsw xmm0, xmm1 1001 paddsw xmm0, xmm7 1002 psrlw xmm0, 2 1003 packuswb xmm0, xmm0 1004 movq qword ptr [edi+16], xmm0 1005 lea edi, [edi+24] 1006 sub ecx, 24 1007 ja wloop 1008 1009 popad 1010 ret 1011 } 1012 } 1013 1014 // Note that movdqa+palign may be better than movdqu. 1015 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 1016 __declspec(naked) 1017 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, 1018 uint8* dst_ptr, int dst_width) { 1019 __asm { 1020 pushad 1021 mov esi, [esp + 32 + 4] // src_ptr 1022 mov ebx, [esp + 32 + 8] // src_stride 1023 mov edi, [esp + 32 + 12] // dst_ptr 1024 mov ecx, [esp + 32 + 16] // dst_width 1025 movdqa xmm2, _shuf01 1026 movdqa xmm3, _shuf11 1027 movdqa xmm4, _shuf21 1028 movdqa xmm5, _madd01 1029 movdqa xmm6, _madd11 1030 movdqa xmm7, _round34 1031 1032 wloop: 1033 movdqa xmm0, [esi] // pixels 0..7 1034 movdqa xmm1, [esi+ebx] 1035 pavgb xmm1, xmm0 1036 pavgb xmm0, xmm1 1037 pshufb xmm0, xmm2 1038 pmaddubsw xmm0, xmm5 1039 paddsw xmm0, xmm7 1040 psrlw xmm0, 2 1041 packuswb xmm0, xmm0 1042 movq qword ptr [edi], xmm0 1043 movdqu xmm0, [esi+8] // pixels 8..15 1044 movdqu xmm1, [esi+ebx+8] 1045 pavgb xmm1, xmm0 1046 pavgb xmm0, xmm1 1047 pshufb xmm0, xmm3 1048 pmaddubsw xmm0, xmm6 1049 paddsw xmm0, xmm7 1050 psrlw xmm0, 2 1051 packuswb xmm0, xmm0 1052 movq qword ptr [edi+8], xmm0 1053 movdqa xmm0, [esi+16] // pixels 16..23 1054 movdqa xmm1, [esi+ebx+16] 1055 lea esi, [esi+32] 1056 pavgb xmm1, xmm0 1057 pavgb xmm0, xmm1 1058 pshufb xmm0, xmm4 1059 movdqa xmm1, _madd21 1060 pmaddubsw xmm0, xmm1 1061 paddsw xmm0, xmm7 1062 psrlw xmm0, 2 1063 packuswb xmm0, xmm0 1064 movq qword ptr [edi+16], xmm0 1065 lea edi, [edi+24] 1066 sub ecx, 24 1067 ja wloop 1068 1069 popad 1070 ret 1071 } 1072 } 1073 1074 #define HAS_SCALEROWDOWN38_SSSE3 1075 // 3/8 point sampler 1076 1077 // Scale 32 pixels to 12 1078 __declspec(naked) 1079 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, 1080 uint8* dst_ptr, int dst_width) { 1081 __asm { 1082 pushad 1083 mov esi, [esp + 32 + 4] // src_ptr 1084 mov edx, [esp + 32 + 8] // src_stride 1085 mov edi, [esp + 32 + 12] // dst_ptr 1086 mov ecx, [esp + 32 + 16] // dst_width 1087 movdqa xmm4, _shuf38a 1088 movdqa xmm5, _shuf38b 1089 1090 xloop: 1091 movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 1092 movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 1093 lea esi, [esi + 32] 1094 pshufb xmm0, xmm4 1095 pshufb xmm1, xmm5 1096 paddusb xmm0, xmm1 1097 1098 movq qword ptr [edi], xmm0 // write 12 pixels 1099 movhlps xmm1, xmm0 1100 movd [edi + 8], xmm1 1101 lea edi, [edi + 12] 1102 sub ecx, 12 1103 ja xloop 1104 1105 popad 1106 ret 1107 } 1108 } 1109 1110 // Scale 16x3 pixels to 6x1 with interpolation 1111 __declspec(naked) 1112 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, 1113 uint8* dst_ptr, int dst_width) { 1114 __asm { 1115 pushad 1116 mov esi, [esp + 32 + 4] // src_ptr 1117 mov edx, [esp + 32 + 8] // src_stride 1118 mov edi, [esp + 32 + 12] // dst_ptr 1119 mov ecx, [esp + 32 + 16] // dst_width 1120 movdqa xmm4, _shufac0 1121 movdqa xmm5, _shufac3 1122 movdqa xmm6, _scaleac3 1123 pxor xmm7, xmm7 1124 1125 xloop: 1126 movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 1127 movdqa xmm2, [esi + edx] 1128 movhlps xmm1, xmm0 1129 movhlps xmm3, xmm2 1130 punpcklbw xmm0, xmm7 1131 punpcklbw xmm1, xmm7 1132 punpcklbw xmm2, xmm7 1133 punpcklbw xmm3, xmm7 1134 paddusw xmm0, xmm2 1135 paddusw xmm1, xmm3 1136 movdqa xmm2, [esi + edx * 2] 1137 lea esi, [esi + 16] 1138 movhlps xmm3, xmm2 1139 punpcklbw xmm2, xmm7 1140 punpcklbw xmm3, xmm7 1141 paddusw xmm0, xmm2 1142 paddusw xmm1, xmm3 1143 1144 movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 1145 psrldq xmm0, 2 1146 paddusw xmm2, xmm0 1147 psrldq xmm0, 2 1148 paddusw xmm2, xmm0 1149 pshufb xmm2, xmm4 1150 1151 movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 1152 psrldq xmm1, 2 1153 paddusw xmm3, xmm1 1154 psrldq xmm1, 2 1155 paddusw xmm3, xmm1 1156 pshufb xmm3, xmm5 1157 paddusw xmm2, xmm3 1158 1159 pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 1160 packuswb xmm2, xmm2 1161 1162 movd [edi], xmm2 // write 6 pixels 1163 pextrw eax, xmm2, 2 1164 mov [edi + 4], ax 1165 lea edi, [edi + 6] 1166 sub ecx, 6 1167 ja xloop 1168 1169 popad 1170 ret 1171 } 1172 } 1173 1174 // Scale 16x2 pixels to 6x1 with interpolation 1175 __declspec(naked) 1176 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, 1177 uint8* dst_ptr, int dst_width) { 1178 __asm { 1179 pushad 1180 mov esi, [esp + 32 + 4] // src_ptr 1181 mov edx, [esp + 32 + 8] // src_stride 1182 mov edi, [esp + 32 + 12] // dst_ptr 1183 mov ecx, [esp + 32 + 16] // dst_width 1184 movdqa xmm4, _shufab0 1185 movdqa xmm5, _shufab1 1186 movdqa xmm6, _shufab2 1187 movdqa xmm7, _scaleab2 1188 1189 xloop: 1190 movdqa xmm2, [esi] // average 2 rows into xmm2 1191 pavgb xmm2, [esi + edx] 1192 lea esi, [esi + 16] 1193 1194 movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 1195 pshufb xmm0, xmm4 1196 movdqa xmm1, xmm2 1197 pshufb xmm1, xmm5 1198 paddusw xmm0, xmm1 1199 pshufb xmm2, xmm6 1200 paddusw xmm0, xmm2 1201 1202 pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 1203 packuswb xmm0, xmm0 1204 1205 movd [edi], xmm0 // write 6 pixels 1206 pextrw eax, xmm0, 2 1207 mov [edi + 4], ax 1208 lea edi, [edi + 6] 1209 sub ecx, 6 1210 ja xloop 1211 1212 popad 1213 ret 1214 } 1215 } 1216 1217 #define HAS_SCALEADDROWS_SSE2 1218 1219 // Reads 8xN bytes and produces 16 shorts at a time. 1220 __declspec(naked) 1221 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, 1222 uint16* dst_ptr, int src_width, 1223 int src_height) { 1224 __asm { 1225 pushad 1226 mov esi, [esp + 32 + 4] // src_ptr 1227 mov edx, [esp + 32 + 8] // src_stride 1228 mov edi, [esp + 32 + 12] // dst_ptr 1229 mov ecx, [esp + 32 + 16] // dst_width 1230 mov ebx, [esp + 32 + 20] // height 1231 pxor xmm5, xmm5 1232 dec ebx 1233 1234 xloop: 1235 // first row 1236 movdqa xmm2, [esi] 1237 lea eax, [esi + edx] 1238 movhlps xmm3, xmm2 1239 mov ebp, ebx 1240 punpcklbw xmm2, xmm5 1241 punpcklbw xmm3, xmm5 1242 1243 // sum remaining rows 1244 yloop: 1245 movdqa xmm0, [eax] // read 16 pixels 1246 lea eax, [eax + edx] // advance to next row 1247 movhlps xmm1, xmm0 1248 punpcklbw xmm0, xmm5 1249 punpcklbw xmm1, xmm5 1250 paddusw xmm2, xmm0 // sum 16 words 1251 paddusw xmm3, xmm1 1252 sub ebp, 1 1253 ja yloop 1254 1255 movdqa [edi], xmm2 1256 movdqa [edi + 16], xmm3 1257 lea edi, [edi + 32] 1258 lea esi, [esi + 16] 1259 1260 sub ecx, 16 1261 ja xloop 1262 1263 popad 1264 ret 1265 } 1266 } 1267 1268 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. 1269 #define HAS_SCALEFILTERROWS_SSE2 1270 __declspec(naked) 1271 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, 1272 int src_stride, int dst_width, 1273 int source_y_fraction) { 1274 __asm { 1275 push esi 1276 push edi 1277 mov edi, [esp + 8 + 4] // dst_ptr 1278 mov esi, [esp + 8 + 8] // src_ptr 1279 mov edx, [esp + 8 + 12] // src_stride 1280 mov ecx, [esp + 8 + 16] // dst_width 1281 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 1282 cmp eax, 0 1283 je xloop1 1284 cmp eax, 128 1285 je xloop2 1286 1287 movd xmm6, eax // xmm6 = y fraction 1288 punpcklwd xmm6, xmm6 1289 pshufd xmm6, xmm6, 0 1290 neg eax // xmm5 = 256 - y fraction 1291 add eax, 256 1292 movd xmm5, eax 1293 punpcklwd xmm5, xmm5 1294 pshufd xmm5, xmm5, 0 1295 pxor xmm7, xmm7 1296 1297 xloop: 1298 movdqa xmm0, [esi] 1299 movdqa xmm2, [esi + edx] 1300 lea esi, [esi + 16] 1301 movdqa xmm1, xmm0 1302 movdqa xmm3, xmm2 1303 punpcklbw xmm0, xmm7 1304 punpcklbw xmm2, xmm7 1305 punpckhbw xmm1, xmm7 1306 punpckhbw xmm3, xmm7 1307 pmullw xmm0, xmm5 // scale row 0 1308 pmullw xmm1, xmm5 1309 pmullw xmm2, xmm6 // scale row 1 1310 pmullw xmm3, xmm6 1311 paddusw xmm0, xmm2 // sum rows 1312 paddusw xmm1, xmm3 1313 psrlw xmm0, 8 1314 psrlw xmm1, 8 1315 packuswb xmm0, xmm1 1316 movdqa [edi], xmm0 1317 lea edi, [edi + 16] 1318 sub ecx, 16 1319 ja xloop 1320 1321 mov al, [edi - 1] 1322 mov [edi], al 1323 pop edi 1324 pop esi 1325 ret 1326 1327 xloop1: 1328 movdqa xmm0, [esi] 1329 lea esi, [esi + 16] 1330 movdqa [edi], xmm0 1331 lea edi, [edi + 16] 1332 sub ecx, 16 1333 ja xloop1 1334 1335 mov al, [edi - 1] 1336 mov [edi], al 1337 pop edi 1338 pop esi 1339 ret 1340 1341 xloop2: 1342 movdqa xmm0, [esi] 1343 movdqa xmm2, [esi + edx] 1344 lea esi, [esi + 16] 1345 pavgb xmm0, xmm2 1346 movdqa [edi], xmm0 1347 lea edi, [edi + 16] 1348 sub ecx, 16 1349 ja xloop2 1350 1351 mov al, [edi - 1] 1352 mov [edi], al 1353 pop edi 1354 pop esi 1355 ret 1356 } 1357 } 1358 1359 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. 1360 #define HAS_SCALEFILTERROWS_SSSE3 1361 __declspec(naked) 1362 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 1363 int src_stride, int dst_width, 1364 int source_y_fraction) { 1365 __asm { 1366 push esi 1367 push edi 1368 mov edi, [esp + 8 + 4] // dst_ptr 1369 mov esi, [esp + 8 + 8] // src_ptr 1370 mov edx, [esp + 8 + 12] // src_stride 1371 mov ecx, [esp + 8 + 16] // dst_width 1372 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 1373 shr eax, 1 1374 cmp eax, 0 1375 je xloop1 1376 cmp eax, 64 1377 je xloop2 1378 1379 mov ah,al 1380 neg al 1381 add al, 128 1382 movd xmm5, eax 1383 punpcklwd xmm5, xmm5 1384 pshufd xmm5, xmm5, 0 1385 1386 xloop: 1387 movdqa xmm0, [esi] 1388 movdqa xmm2, [esi + edx] 1389 lea esi, [esi + 16] 1390 movdqa xmm1, xmm0 1391 punpcklbw xmm0, xmm2 1392 punpckhbw xmm1, xmm2 1393 pmaddubsw xmm0, xmm5 1394 pmaddubsw xmm1, xmm5 1395 psrlw xmm0, 7 1396 psrlw xmm1, 7 1397 packuswb xmm0, xmm1 1398 movdqa [edi], xmm0 1399 lea edi, [edi + 16] 1400 sub ecx, 16 1401 ja xloop 1402 1403 mov al, [edi - 1] 1404 mov [edi], al 1405 pop edi 1406 pop esi 1407 ret 1408 1409 xloop1: 1410 movdqa xmm0, [esi] 1411 lea esi, [esi + 16] 1412 movdqa [edi], xmm0 1413 lea edi, [edi + 16] 1414 sub ecx, 16 1415 ja xloop1 1416 1417 mov al, [edi - 1] 1418 mov [edi], al 1419 pop edi 1420 pop esi 1421 ret 1422 1423 xloop2: 1424 movdqa xmm0, [esi] 1425 movdqa xmm2, [esi + edx] 1426 lea esi, [esi + 16] 1427 pavgb xmm0, xmm2 1428 movdqa [edi], xmm0 1429 lea edi, [edi + 16] 1430 sub ecx, 16 1431 ja xloop2 1432 1433 mov al, [edi - 1] 1434 mov [edi], al 1435 pop edi 1436 pop esi 1437 ret 1438 1439 } 1440 } 1441 1442 // Note that movdqa+palign may be better than movdqu. 1443 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 1444 __declspec(naked) 1445 static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 1446 int dst_width) { 1447 __asm { 1448 mov edx, [esp + 4] // dst_ptr 1449 mov eax, [esp + 8] // src_ptr 1450 mov ecx, [esp + 12] // dst_width 1451 movdqa xmm1, _round34 1452 movdqa xmm2, _shuf01 1453 movdqa xmm3, _shuf11 1454 movdqa xmm4, _shuf21 1455 movdqa xmm5, _madd01 1456 movdqa xmm6, _madd11 1457 movdqa xmm7, _madd21 1458 1459 wloop: 1460 movdqa xmm0, [eax] // pixels 0..7 1461 pshufb xmm0, xmm2 1462 pmaddubsw xmm0, xmm5 1463 paddsw xmm0, xmm1 1464 psrlw xmm0, 2 1465 packuswb xmm0, xmm0 1466 movq qword ptr [edx], xmm0 1467 movdqu xmm0, [eax+8] // pixels 8..15 1468 pshufb xmm0, xmm3 1469 pmaddubsw xmm0, xmm6 1470 paddsw xmm0, xmm1 1471 psrlw xmm0, 2 1472 packuswb xmm0, xmm0 1473 movq qword ptr [edx+8], xmm0 1474 movdqa xmm0, [eax+16] // pixels 16..23 1475 lea eax, [eax+32] 1476 pshufb xmm0, xmm4 1477 pmaddubsw xmm0, xmm7 1478 paddsw xmm0, xmm1 1479 psrlw xmm0, 2 1480 packuswb xmm0, xmm0 1481 movq qword ptr [edx+16], xmm0 1482 lea edx, [edx+24] 1483 sub ecx, 24 1484 ja wloop 1485 ret 1486 } 1487 } 1488 1489 #elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) 1490 1491 // GCC versions of row functions are verbatim conversions from Visual C. 1492 // Generated using gcc disassembly on Visual C object file: 1493 // objdump -D yuvscaler.obj >yuvscaler.txt 1494 #define HAS_SCALEROWDOWN2_SSE2 1495 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, 1496 uint8* dst_ptr, int dst_width) { 1497 asm volatile ( 1498 "pcmpeqb %%xmm5,%%xmm5 \n" 1499 "psrlw $0x8,%%xmm5 \n" 1500 "1:" 1501 "movdqa (%0),%%xmm0 \n" 1502 "movdqa 0x10(%0),%%xmm1 \n" 1503 "lea 0x20(%0),%0 \n" 1504 "pand %%xmm5,%%xmm0 \n" 1505 "pand %%xmm5,%%xmm1 \n" 1506 "packuswb %%xmm1,%%xmm0 \n" 1507 "movdqa %%xmm0,(%1) \n" 1508 "lea 0x10(%1),%1 \n" 1509 "sub $0x10,%2 \n" 1510 "ja 1b \n" 1511 : "+r"(src_ptr), // %0 1512 "+r"(dst_ptr), // %1 1513 "+r"(dst_width) // %2 1514 : 1515 : "memory", "cc" 1516 ); 1517 } 1518 1519 static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, 1520 uint8* dst_ptr, int dst_width) { 1521 asm volatile ( 1522 "pcmpeqb %%xmm5,%%xmm5 \n" 1523 "psrlw $0x8,%%xmm5 \n" 1524 "1:" 1525 "movdqa (%0),%%xmm0 \n" 1526 "movdqa 0x10(%0),%%xmm1 \n" 1527 "movdqa (%0,%3,1),%%xmm2 \n" 1528 "movdqa 0x10(%0,%3,1),%%xmm3 \n" 1529 "lea 0x20(%0),%0 \n" 1530 "pavgb %%xmm2,%%xmm0 \n" 1531 "pavgb %%xmm3,%%xmm1 \n" 1532 "movdqa %%xmm0,%%xmm2 \n" 1533 "psrlw $0x8,%%xmm0 \n" 1534 "movdqa %%xmm1,%%xmm3 \n" 1535 "psrlw $0x8,%%xmm1 \n" 1536 "pand %%xmm5,%%xmm2 \n" 1537 "pand %%xmm5,%%xmm3 \n" 1538 "pavgw %%xmm2,%%xmm0 \n" 1539 "pavgw %%xmm3,%%xmm1 \n" 1540 "packuswb %%xmm1,%%xmm0 \n" 1541 "movdqa %%xmm0,(%1) \n" 1542 "lea 0x10(%1),%1 \n" 1543 "sub $0x10,%2 \n" 1544 "ja 1b \n" 1545 : "+r"(src_ptr), // %0 1546 "+r"(dst_ptr), // %1 1547 "+r"(dst_width) // %2 1548 : "r"((intptr_t)(src_stride)) // %3 1549 : "memory", "cc" 1550 ); 1551 } 1552 1553 #define HAS_SCALEROWDOWN4_SSE2 1554 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, 1555 uint8* dst_ptr, int dst_width) { 1556 asm volatile ( 1557 "pcmpeqb %%xmm5,%%xmm5 \n" 1558 "psrld $0x18,%%xmm5 \n" 1559 "1:" 1560 "movdqa (%0),%%xmm0 \n" 1561 "movdqa 0x10(%0),%%xmm1 \n" 1562 "lea 0x20(%0),%0 \n" 1563 "pand %%xmm5,%%xmm0 \n" 1564 "pand %%xmm5,%%xmm1 \n" 1565 "packuswb %%xmm1,%%xmm0 \n" 1566 "packuswb %%xmm0,%%xmm0 \n" 1567 "movq %%xmm0,(%1) \n" 1568 "lea 0x8(%1),%1 \n" 1569 "sub $0x8,%2 \n" 1570 "ja 1b \n" 1571 : "+r"(src_ptr), // %0 1572 "+r"(dst_ptr), // %1 1573 "+r"(dst_width) // %2 1574 : 1575 : "memory", "cc" 1576 ); 1577 } 1578 1579 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, 1580 uint8* dst_ptr, int dst_width) { 1581 intptr_t temp = 0; 1582 asm volatile ( 1583 "pcmpeqb %%xmm7,%%xmm7 \n" 1584 "psrlw $0x8,%%xmm7 \n" 1585 "lea (%4,%4,2),%3 \n" 1586 "1:" 1587 "movdqa (%0),%%xmm0 \n" 1588 "movdqa 0x10(%0),%%xmm1 \n" 1589 "movdqa (%0,%4,1),%%xmm2 \n" 1590 "movdqa 0x10(%0,%4,1),%%xmm3 \n" 1591 "pavgb %%xmm2,%%xmm0 \n" 1592 "pavgb %%xmm3,%%xmm1 \n" 1593 "movdqa (%0,%4,2),%%xmm2 \n" 1594 "movdqa 0x10(%0,%4,2),%%xmm3 \n" 1595 "movdqa (%0,%3,1),%%xmm4 \n" 1596 "movdqa 0x10(%0,%3,1),%%xmm5 \n" 1597 "lea 0x20(%0),%0 \n" 1598 "pavgb %%xmm4,%%xmm2 \n" 1599 "pavgb %%xmm2,%%xmm0 \n" 1600 "pavgb %%xmm5,%%xmm3 \n" 1601 "pavgb %%xmm3,%%xmm1 \n" 1602 "movdqa %%xmm0,%%xmm2 \n" 1603 "psrlw $0x8,%%xmm0 \n" 1604 "movdqa %%xmm1,%%xmm3 \n" 1605 "psrlw $0x8,%%xmm1 \n" 1606 "pand %%xmm7,%%xmm2 \n" 1607 "pand %%xmm7,%%xmm3 \n" 1608 "pavgw %%xmm2,%%xmm0 \n" 1609 "pavgw %%xmm3,%%xmm1 \n" 1610 "packuswb %%xmm1,%%xmm0 \n" 1611 "movdqa %%xmm0,%%xmm2 \n" 1612 "psrlw $0x8,%%xmm0 \n" 1613 "pand %%xmm7,%%xmm2 \n" 1614 "pavgw %%xmm2,%%xmm0 \n" 1615 "packuswb %%xmm0,%%xmm0 \n" 1616 "movq %%xmm0,(%1) \n" 1617 "lea 0x8(%1),%1 \n" 1618 "sub $0x8,%2 \n" 1619 "ja 1b \n" 1620 : "+r"(src_ptr), // %0 1621 "+r"(dst_ptr), // %1 1622 "+r"(dst_width), // %2 1623 "+r"(temp) // %3 1624 : "r"((intptr_t)(src_stride)) // %4 1625 : "memory", "cc" 1626 #if defined(__x86_64__) 1627 , "xmm6", "xmm7" 1628 #endif 1629 ); 1630 } 1631 1632 #define HAS_SCALEROWDOWN8_SSE2 1633 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, 1634 uint8* dst_ptr, int dst_width) { 1635 asm volatile ( 1636 "pcmpeqb %%xmm5,%%xmm5 \n" 1637 "psrlq $0x38,%%xmm5 \n" 1638 "1:" 1639 "movdqa (%0),%%xmm0 \n" 1640 "movdqa 0x10(%0),%%xmm1 \n" 1641 "lea 0x20(%0),%0 \n" 1642 "pand %%xmm5,%%xmm0 \n" 1643 "pand %%xmm5,%%xmm1 \n" 1644 "packuswb %%xmm1,%%xmm0 \n" 1645 "packuswb %%xmm0,%%xmm0 \n" 1646 "packuswb %%xmm0,%%xmm0 \n" 1647 "movd %%xmm0,(%1) \n" 1648 "lea 0x4(%1),%1 \n" 1649 "sub $0x4,%2 \n" 1650 "ja 1b \n" 1651 : "+r"(src_ptr), // %0 1652 "+r"(dst_ptr), // %1 1653 "+r"(dst_width) // %2 1654 : 1655 : "memory", "cc" 1656 ); 1657 } 1658 1659 #if defined(__i386__) 1660 void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, 1661 uint8* dst_ptr, int dst_width); 1662 asm( 1663 DECLARE_FUNCTION(ScaleRowDown8Int_SSE2) 1664 "pusha \n" 1665 "mov 0x24(%esp),%esi \n" 1666 "mov 0x28(%esp),%ebx \n" 1667 "mov 0x2c(%esp),%edi \n" 1668 "mov 0x30(%esp),%ecx \n" 1669 "lea (%ebx,%ebx,2),%edx \n" 1670 "pxor %xmm7,%xmm7 \n" 1671 1672 "1:" 1673 "movdqa (%esi),%xmm0 \n" 1674 "movdqa 0x10(%esi),%xmm1 \n" 1675 "movdqa (%esi,%ebx,1),%xmm2 \n" 1676 "movdqa 0x10(%esi,%ebx,1),%xmm3 \n" 1677 "pavgb %xmm2,%xmm0 \n" 1678 "pavgb %xmm3,%xmm1 \n" 1679 "movdqa (%esi,%ebx,2),%xmm2 \n" 1680 "movdqa 0x10(%esi,%ebx,2),%xmm3 \n" 1681 "movdqa (%esi,%edx,1),%xmm4 \n" 1682 "movdqa 0x10(%esi,%edx,1),%xmm5 \n" 1683 "lea (%esi,%ebx,4),%ebp \n" 1684 "lea 0x20(%esi),%esi \n" 1685 "pavgb %xmm4,%xmm2 \n" 1686 "pavgb %xmm5,%xmm3 \n" 1687 "pavgb %xmm2,%xmm0 \n" 1688 "pavgb %xmm3,%xmm1 \n" 1689 "movdqa 0x0(%ebp),%xmm2 \n" 1690 "movdqa 0x10(%ebp),%xmm3 \n" 1691 "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n" 1692 "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n" 1693 "pavgb %xmm4,%xmm2 \n" 1694 "pavgb %xmm5,%xmm3 \n" 1695 "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n" 1696 "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n" 1697 "movdqa 0x0(%ebp,%edx,1),%xmm6 \n" 1698 "pavgb %xmm6,%xmm4 \n" 1699 "movdqa 0x10(%ebp,%edx,1),%xmm6 \n" 1700 "pavgb %xmm6,%xmm5 \n" 1701 "pavgb %xmm4,%xmm2 \n" 1702 "pavgb %xmm5,%xmm3 \n" 1703 "pavgb %xmm2,%xmm0 \n" 1704 "pavgb %xmm3,%xmm1 \n" 1705 "psadbw %xmm7,%xmm0 \n" 1706 "psadbw %xmm7,%xmm1 \n" 1707 "pshufd $0xd8,%xmm0,%xmm0 \n" 1708 "pshufd $0x8d,%xmm1,%xmm1 \n" 1709 "por %xmm1,%xmm0 \n" 1710 "psrlw $0x3,%xmm0 \n" 1711 "packuswb %xmm0,%xmm0 \n" 1712 "packuswb %xmm0,%xmm0 \n" 1713 "movd %xmm0,(%edi) \n" 1714 "lea 0x4(%edi),%edi \n" 1715 "sub $0x4,%ecx \n" 1716 "ja 1b \n" 1717 "popa \n" 1718 "ret \n" 1719 ); 1720 1721 // fpic is used for magiccam plugin 1722 #if !defined(__PIC__) 1723 #define HAS_SCALEROWDOWN34_SSSE3 1724 void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, 1725 uint8* dst_ptr, int dst_width); 1726 asm( 1727 DECLARE_FUNCTION(ScaleRowDown34_SSSE3) 1728 "pusha \n" 1729 "mov 0x24(%esp),%esi \n" 1730 "mov 0x2c(%esp),%edi \n" 1731 "mov 0x30(%esp),%ecx \n" 1732 "movdqa _shuf0,%xmm3 \n" 1733 "movdqa _shuf1,%xmm4 \n" 1734 "movdqa _shuf2,%xmm5 \n" 1735 1736 "1:" 1737 "movdqa (%esi),%xmm0 \n" 1738 "movdqa 0x10(%esi),%xmm2 \n" 1739 "lea 0x20(%esi),%esi \n" 1740 "movdqa %xmm2,%xmm1 \n" 1741 "palignr $0x8,%xmm0,%xmm1 \n" 1742 "pshufb %xmm3,%xmm0 \n" 1743 "pshufb %xmm4,%xmm1 \n" 1744 "pshufb %xmm5,%xmm2 \n" 1745 "movq %xmm0,(%edi) \n" 1746 "movq %xmm1,0x8(%edi) \n" 1747 "movq %xmm2,0x10(%edi) \n" 1748 "lea 0x18(%edi),%edi \n" 1749 "sub $0x18,%ecx \n" 1750 "ja 1b \n" 1751 "popa \n" 1752 "ret \n" 1753 ); 1754 1755 void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, 1756 uint8* dst_ptr, int dst_width); 1757 asm( 1758 DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3) 1759 "pusha \n" 1760 "mov 0x24(%esp),%esi \n" 1761 "mov 0x28(%esp),%ebp \n" 1762 "mov 0x2c(%esp),%edi \n" 1763 "mov 0x30(%esp),%ecx \n" 1764 "movdqa _shuf01,%xmm2 \n" 1765 "movdqa _shuf11,%xmm3 \n" 1766 "movdqa _shuf21,%xmm4 \n" 1767 "movdqa _madd01,%xmm5 \n" 1768 "movdqa _madd11,%xmm6 \n" 1769 "movdqa _round34,%xmm7 \n" 1770 1771 "1:" 1772 "movdqa (%esi),%xmm0 \n" 1773 "movdqa (%esi,%ebp),%xmm1 \n" 1774 "pavgb %xmm1,%xmm0 \n" 1775 "pshufb %xmm2,%xmm0 \n" 1776 "pmaddubsw %xmm5,%xmm0 \n" 1777 "paddsw %xmm7,%xmm0 \n" 1778 "psrlw $0x2,%xmm0 \n" 1779 "packuswb %xmm0,%xmm0 \n" 1780 "movq %xmm0,(%edi) \n" 1781 "movdqu 0x8(%esi),%xmm0 \n" 1782 "movdqu 0x8(%esi,%ebp),%xmm1 \n" 1783 "pavgb %xmm1,%xmm0 \n" 1784 "pshufb %xmm3,%xmm0 \n" 1785 "pmaddubsw %xmm6,%xmm0 \n" 1786 "paddsw %xmm7,%xmm0 \n" 1787 "psrlw $0x2,%xmm0 \n" 1788 "packuswb %xmm0,%xmm0 \n" 1789 "movq %xmm0,0x8(%edi) \n" 1790 "movdqa 0x10(%esi),%xmm0 \n" 1791 "movdqa 0x10(%esi,%ebp),%xmm1 \n" 1792 "lea 0x20(%esi),%esi \n" 1793 "pavgb %xmm1,%xmm0 \n" 1794 "pshufb %xmm4,%xmm0 \n" 1795 "movdqa _madd21,%xmm1 \n" 1796 "pmaddubsw %xmm1,%xmm0 \n" 1797 "paddsw %xmm7,%xmm0 \n" 1798 "psrlw $0x2,%xmm0 \n" 1799 "packuswb %xmm0,%xmm0 \n" 1800 "movq %xmm0,0x10(%edi) \n" 1801 "lea 0x18(%edi),%edi \n" 1802 "sub $0x18,%ecx \n" 1803 "ja 1b \n" 1804 1805 "popa \n" 1806 "ret \n" 1807 ); 1808 1809 void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, 1810 uint8* dst_ptr, int dst_width); 1811 asm( 1812 DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3) 1813 "pusha \n" 1814 "mov 0x24(%esp),%esi \n" 1815 "mov 0x28(%esp),%ebp \n" 1816 "mov 0x2c(%esp),%edi \n" 1817 "mov 0x30(%esp),%ecx \n" 1818 "movdqa _shuf01,%xmm2 \n" 1819 "movdqa _shuf11,%xmm3 \n" 1820 "movdqa _shuf21,%xmm4 \n" 1821 "movdqa _madd01,%xmm5 \n" 1822 "movdqa _madd11,%xmm6 \n" 1823 "movdqa _round34,%xmm7 \n" 1824 1825 "1:" 1826 "movdqa (%esi),%xmm0 \n" 1827 "movdqa (%esi,%ebp,1),%xmm1 \n" 1828 "pavgb %xmm0,%xmm1 \n" 1829 "pavgb %xmm1,%xmm0 \n" 1830 "pshufb %xmm2,%xmm0 \n" 1831 "pmaddubsw %xmm5,%xmm0 \n" 1832 "paddsw %xmm7,%xmm0 \n" 1833 "psrlw $0x2,%xmm0 \n" 1834 "packuswb %xmm0,%xmm0 \n" 1835 "movq %xmm0,(%edi) \n" 1836 "movdqu 0x8(%esi),%xmm0 \n" 1837 "movdqu 0x8(%esi,%ebp,1),%xmm1 \n" 1838 "pavgb %xmm0,%xmm1 \n" 1839 "pavgb %xmm1,%xmm0 \n" 1840 "pshufb %xmm3,%xmm0 \n" 1841 "pmaddubsw %xmm6,%xmm0 \n" 1842 "paddsw %xmm7,%xmm0 \n" 1843 "psrlw $0x2,%xmm0 \n" 1844 "packuswb %xmm0,%xmm0 \n" 1845 "movq %xmm0,0x8(%edi) \n" 1846 "movdqa 0x10(%esi),%xmm0 \n" 1847 "movdqa 0x10(%esi,%ebp,1),%xmm1 \n" 1848 "lea 0x20(%esi),%esi \n" 1849 "pavgb %xmm0,%xmm1 \n" 1850 "pavgb %xmm1,%xmm0 \n" 1851 "pshufb %xmm4,%xmm0 \n" 1852 "movdqa _madd21,%xmm1 \n" 1853 "pmaddubsw %xmm1,%xmm0 \n" 1854 "paddsw %xmm7,%xmm0 \n" 1855 "psrlw $0x2,%xmm0 \n" 1856 "packuswb %xmm0,%xmm0 \n" 1857 "movq %xmm0,0x10(%edi) \n" 1858 "lea 0x18(%edi),%edi \n" 1859 "sub $0x18,%ecx \n" 1860 "ja 1b \n" 1861 "popa \n" 1862 "ret \n" 1863 ); 1864 1865 #define HAS_SCALEROWDOWN38_SSSE3 1866 void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, 1867 uint8* dst_ptr, int dst_width); 1868 asm( 1869 DECLARE_FUNCTION(ScaleRowDown38_SSSE3) 1870 "pusha \n" 1871 "mov 0x24(%esp),%esi \n" 1872 "mov 0x28(%esp),%edx \n" 1873 "mov 0x2c(%esp),%edi \n" 1874 "mov 0x30(%esp),%ecx \n" 1875 "movdqa _shuf38a ,%xmm4 \n" 1876 "movdqa _shuf38b ,%xmm5 \n" 1877 1878 "1:" 1879 "movdqa (%esi),%xmm0 \n" 1880 "movdqa 0x10(%esi),%xmm1 \n" 1881 "lea 0x20(%esi),%esi \n" 1882 "pshufb %xmm4,%xmm0 \n" 1883 "pshufb %xmm5,%xmm1 \n" 1884 "paddusb %xmm1,%xmm0 \n" 1885 "movq %xmm0,(%edi) \n" 1886 "movhlps %xmm0,%xmm1 \n" 1887 "movd %xmm1,0x8(%edi) \n" 1888 "lea 0xc(%edi),%edi \n" 1889 "sub $0xc,%ecx \n" 1890 "ja 1b \n" 1891 "popa \n" 1892 "ret \n" 1893 ); 1894 1895 void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, 1896 uint8* dst_ptr, int dst_width); 1897 asm( 1898 DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3) 1899 "pusha \n" 1900 "mov 0x24(%esp),%esi \n" 1901 "mov 0x28(%esp),%edx \n" 1902 "mov 0x2c(%esp),%edi \n" 1903 "mov 0x30(%esp),%ecx \n" 1904 "movdqa _shufac0,%xmm4 \n" 1905 "movdqa _shufac3,%xmm5 \n" 1906 "movdqa _scaleac3,%xmm6 \n" 1907 "pxor %xmm7,%xmm7 \n" 1908 1909 "1:" 1910 "movdqa (%esi),%xmm0 \n" 1911 "movdqa (%esi,%edx,1),%xmm2 \n" 1912 "movhlps %xmm0,%xmm1 \n" 1913 "movhlps %xmm2,%xmm3 \n" 1914 "punpcklbw %xmm7,%xmm0 \n" 1915 "punpcklbw %xmm7,%xmm1 \n" 1916 "punpcklbw %xmm7,%xmm2 \n" 1917 "punpcklbw %xmm7,%xmm3 \n" 1918 "paddusw %xmm2,%xmm0 \n" 1919 "paddusw %xmm3,%xmm1 \n" 1920 "movdqa (%esi,%edx,2),%xmm2 \n" 1921 "lea 0x10(%esi),%esi \n" 1922 "movhlps %xmm2,%xmm3 \n" 1923 "punpcklbw %xmm7,%xmm2 \n" 1924 "punpcklbw %xmm7,%xmm3 \n" 1925 "paddusw %xmm2,%xmm0 \n" 1926 "paddusw %xmm3,%xmm1 \n" 1927 "movdqa %xmm0,%xmm2 \n" 1928 "psrldq $0x2,%xmm0 \n" 1929 "paddusw %xmm0,%xmm2 \n" 1930 "psrldq $0x2,%xmm0 \n" 1931 "paddusw %xmm0,%xmm2 \n" 1932 "pshufb %xmm4,%xmm2 \n" 1933 "movdqa %xmm1,%xmm3 \n" 1934 "psrldq $0x2,%xmm1 \n" 1935 "paddusw %xmm1,%xmm3 \n" 1936 "psrldq $0x2,%xmm1 \n" 1937 "paddusw %xmm1,%xmm3 \n" 1938 "pshufb %xmm5,%xmm3 \n" 1939 "paddusw %xmm3,%xmm2 \n" 1940 "pmulhuw %xmm6,%xmm2 \n" 1941 "packuswb %xmm2,%xmm2 \n" 1942 "movd %xmm2,(%edi) \n" 1943 "pextrw $0x2,%xmm2,%eax \n" 1944 "mov %ax,0x4(%edi) \n" 1945 "lea 0x6(%edi),%edi \n" 1946 "sub $0x6,%ecx \n" 1947 "ja 1b \n" 1948 "popa \n" 1949 "ret \n" 1950 ); 1951 1952 void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, 1953 uint8* dst_ptr, int dst_width); 1954 asm( 1955 DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3) 1956 "pusha \n" 1957 "mov 0x24(%esp),%esi \n" 1958 "mov 0x28(%esp),%edx \n" 1959 "mov 0x2c(%esp),%edi \n" 1960 "mov 0x30(%esp),%ecx \n" 1961 "movdqa _shufab0,%xmm4 \n" 1962 "movdqa _shufab1,%xmm5 \n" 1963 "movdqa _shufab2,%xmm6 \n" 1964 "movdqa _scaleab2,%xmm7 \n" 1965 1966 "1:" 1967 "movdqa (%esi),%xmm2 \n" 1968 "pavgb (%esi,%edx,1),%xmm2 \n" 1969 "lea 0x10(%esi),%esi \n" 1970 "movdqa %xmm2,%xmm0 \n" 1971 "pshufb %xmm4,%xmm0 \n" 1972 "movdqa %xmm2,%xmm1 \n" 1973 "pshufb %xmm5,%xmm1 \n" 1974 "paddusw %xmm1,%xmm0 \n" 1975 "pshufb %xmm6,%xmm2 \n" 1976 "paddusw %xmm2,%xmm0 \n" 1977 "pmulhuw %xmm7,%xmm0 \n" 1978 "packuswb %xmm0,%xmm0 \n" 1979 "movd %xmm0,(%edi) \n" 1980 "pextrw $0x2,%xmm0,%eax \n" 1981 "mov %ax,0x4(%edi) \n" 1982 "lea 0x6(%edi),%edi \n" 1983 "sub $0x6,%ecx \n" 1984 "ja 1b \n" 1985 "popa \n" 1986 "ret \n" 1987 ); 1988 #endif // __PIC__ 1989 1990 #define HAS_SCALEADDROWS_SSE2 1991 void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, 1992 uint16* dst_ptr, int src_width, 1993 int src_height); 1994 asm( 1995 DECLARE_FUNCTION(ScaleAddRows_SSE2) 1996 "pusha \n" 1997 "mov 0x24(%esp),%esi \n" 1998 "mov 0x28(%esp),%edx \n" 1999 "mov 0x2c(%esp),%edi \n" 2000 "mov 0x30(%esp),%ecx \n" 2001 "mov 0x34(%esp),%ebx \n" 2002 "pxor %xmm5,%xmm5 \n" 2003 2004 "1:" 2005 "movdqa (%esi),%xmm2 \n" 2006 "lea (%esi,%edx,1),%eax \n" 2007 "movhlps %xmm2,%xmm3 \n" 2008 "lea -0x1(%ebx),%ebp \n" 2009 "punpcklbw %xmm5,%xmm2 \n" 2010 "punpcklbw %xmm5,%xmm3 \n" 2011 2012 "2:" 2013 "movdqa (%eax),%xmm0 \n" 2014 "lea (%eax,%edx,1),%eax \n" 2015 "movhlps %xmm0,%xmm1 \n" 2016 "punpcklbw %xmm5,%xmm0 \n" 2017 "punpcklbw %xmm5,%xmm1 \n" 2018 "paddusw %xmm0,%xmm2 \n" 2019 "paddusw %xmm1,%xmm3 \n" 2020 "sub $0x1,%ebp \n" 2021 "ja 2b \n" 2022 2023 "movdqa %xmm2,(%edi) \n" 2024 "movdqa %xmm3,0x10(%edi) \n" 2025 "lea 0x20(%edi),%edi \n" 2026 "lea 0x10(%esi),%esi \n" 2027 "sub $0x10,%ecx \n" 2028 "ja 1b \n" 2029 "popa \n" 2030 "ret \n" 2031 ); 2032 2033 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version 2034 #define HAS_SCALEFILTERROWS_SSE2 2035 void ScaleFilterRows_SSE2(uint8* dst_ptr, 2036 const uint8* src_ptr, int src_stride, 2037 int dst_width, int source_y_fraction); 2038 asm( 2039 DECLARE_FUNCTION(ScaleFilterRows_SSE2) 2040 "push %esi \n" 2041 "push %edi \n" 2042 "mov 0xc(%esp),%edi \n" 2043 "mov 0x10(%esp),%esi \n" 2044 "mov 0x14(%esp),%edx \n" 2045 "mov 0x18(%esp),%ecx \n" 2046 "mov 0x1c(%esp),%eax \n" 2047 "cmp $0x0,%eax \n" 2048 "je 2f \n" 2049 "cmp $0x80,%eax \n" 2050 "je 3f \n" 2051 "movd %eax,%xmm6 \n" 2052 "punpcklwd %xmm6,%xmm6 \n" 2053 "pshufd $0x0,%xmm6,%xmm6 \n" 2054 "neg %eax \n" 2055 "add $0x100,%eax \n" 2056 "movd %eax,%xmm5 \n" 2057 "punpcklwd %xmm5,%xmm5 \n" 2058 "pshufd $0x0,%xmm5,%xmm5 \n" 2059 "pxor %xmm7,%xmm7 \n" 2060 2061 "1:" 2062 "movdqa (%esi),%xmm0 \n" 2063 "movdqa (%esi,%edx,1),%xmm2 \n" 2064 "lea 0x10(%esi),%esi \n" 2065 "movdqa %xmm0,%xmm1 \n" 2066 "movdqa %xmm2,%xmm3 \n" 2067 "punpcklbw %xmm7,%xmm0 \n" 2068 "punpcklbw %xmm7,%xmm2 \n" 2069 "punpckhbw %xmm7,%xmm1 \n" 2070 "punpckhbw %xmm7,%xmm3 \n" 2071 "pmullw %xmm5,%xmm0 \n" 2072 "pmullw %xmm5,%xmm1 \n" 2073 "pmullw %xmm6,%xmm2 \n" 2074 "pmullw %xmm6,%xmm3 \n" 2075 "paddusw %xmm2,%xmm0 \n" 2076 "paddusw %xmm3,%xmm1 \n" 2077 "psrlw $0x8,%xmm0 \n" 2078 "psrlw $0x8,%xmm1 \n" 2079 "packuswb %xmm1,%xmm0 \n" 2080 "movdqa %xmm0,(%edi) \n" 2081 "lea 0x10(%edi),%edi \n" 2082 "sub $0x10,%ecx \n" 2083 "ja 1b \n" 2084 "mov -0x1(%edi),%al \n" 2085 "mov %al,(%edi) \n" 2086 "pop %edi \n" 2087 "pop %esi \n" 2088 "ret \n" 2089 2090 "2:" 2091 "movdqa (%esi),%xmm0 \n" 2092 "lea 0x10(%esi),%esi \n" 2093 "movdqa %xmm0,(%edi) \n" 2094 "lea 0x10(%edi),%edi \n" 2095 "sub $0x10,%ecx \n" 2096 "ja 2b \n" 2097 2098 "mov -0x1(%edi),%al \n" 2099 "mov %al,(%edi) \n" 2100 "pop %edi \n" 2101 "pop %esi \n" 2102 "ret \n" 2103 2104 "3:" 2105 "movdqa (%esi),%xmm0 \n" 2106 "movdqa (%esi,%edx,1),%xmm2 \n" 2107 "lea 0x10(%esi),%esi \n" 2108 "pavgb %xmm2,%xmm0 \n" 2109 "movdqa %xmm0,(%edi) \n" 2110 "lea 0x10(%edi),%edi \n" 2111 "sub $0x10,%ecx \n" 2112 "ja 3b \n" 2113 2114 "mov -0x1(%edi),%al \n" 2115 "mov %al,(%edi) \n" 2116 "pop %edi \n" 2117 "pop %esi \n" 2118 "ret \n" 2119 ); 2120 2121 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version 2122 #define HAS_SCALEFILTERROWS_SSSE3 2123 void ScaleFilterRows_SSSE3(uint8* dst_ptr, 2124 const uint8* src_ptr, int src_stride, 2125 int dst_width, int source_y_fraction); 2126 asm( 2127 DECLARE_FUNCTION(ScaleFilterRows_SSSE3) 2128 "push %esi \n" 2129 "push %edi \n" 2130 "mov 0xc(%esp),%edi \n" 2131 "mov 0x10(%esp),%esi \n" 2132 "mov 0x14(%esp),%edx \n" 2133 "mov 0x18(%esp),%ecx \n" 2134 "mov 0x1c(%esp),%eax \n" 2135 "shr %eax \n" 2136 "cmp $0x0,%eax \n" 2137 "je 2f \n" 2138 "cmp $0x40,%eax \n" 2139 "je 3f \n" 2140 "mov %al,%ah \n" 2141 "neg %al \n" 2142 "add $0x80,%al \n" 2143 "movd %eax,%xmm5 \n" 2144 "punpcklwd %xmm5,%xmm5 \n" 2145 "pshufd $0x0,%xmm5,%xmm5 \n" 2146 2147 "1:" 2148 "movdqa (%esi),%xmm0 \n" 2149 "movdqa (%esi,%edx,1),%xmm2 \n" 2150 "lea 0x10(%esi),%esi \n" 2151 "movdqa %xmm0,%xmm1 \n" 2152 "punpcklbw %xmm2,%xmm0 \n" 2153 "punpckhbw %xmm2,%xmm1 \n" 2154 "pmaddubsw %xmm5,%xmm0 \n" 2155 "pmaddubsw %xmm5,%xmm1 \n" 2156 "psrlw $0x7,%xmm0 \n" 2157 "psrlw $0x7,%xmm1 \n" 2158 "packuswb %xmm1,%xmm0 \n" 2159 "movdqa %xmm0,(%edi) \n" 2160 "lea 0x10(%edi),%edi \n" 2161 "sub $0x10,%ecx \n" 2162 "ja 1b \n" 2163 "mov -0x1(%edi),%al \n" 2164 "mov %al,(%edi) \n" 2165 "pop %edi \n" 2166 "pop %esi \n" 2167 "ret \n" 2168 2169 "2:" 2170 "movdqa (%esi),%xmm0 \n" 2171 "lea 0x10(%esi),%esi \n" 2172 "movdqa %xmm0,(%edi) \n" 2173 "lea 0x10(%edi),%edi \n" 2174 "sub $0x10,%ecx \n" 2175 "ja 2b \n" 2176 "mov -0x1(%edi),%al \n" 2177 "mov %al,(%edi) \n" 2178 "pop %edi \n" 2179 "pop %esi \n" 2180 "ret \n" 2181 2182 "3:" 2183 "movdqa (%esi),%xmm0 \n" 2184 "movdqa (%esi,%edx,1),%xmm2 \n" 2185 "lea 0x10(%esi),%esi \n" 2186 "pavgb %xmm2,%xmm0 \n" 2187 "movdqa %xmm0,(%edi) \n" 2188 "lea 0x10(%edi),%edi \n" 2189 "sub $0x10,%ecx \n" 2190 "ja 3b \n" 2191 "mov -0x1(%edi),%al \n" 2192 "mov %al,(%edi) \n" 2193 "pop %edi \n" 2194 "pop %esi \n" 2195 "ret \n" 2196 ); 2197 2198 #elif defined(__x86_64__) 2199 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, 2200 uint8* dst_ptr, int dst_width) { 2201 asm volatile ( 2202 "lea (%3,%3,2),%%r10 \n" 2203 "pxor %%xmm7,%%xmm7 \n" 2204 "1:" 2205 "movdqa (%0),%%xmm0 \n" 2206 "movdqa 0x10(%0),%%xmm1 \n" 2207 "movdqa (%0,%3,1),%%xmm2 \n" 2208 "movdqa 0x10(%0,%3,1),%%xmm3 \n" 2209 "pavgb %%xmm2,%%xmm0 \n" 2210 "pavgb %%xmm3,%%xmm1 \n" 2211 "movdqa (%0,%3,2),%%xmm2 \n" 2212 "movdqa 0x10(%0,%3,2),%%xmm3 \n" 2213 "movdqa (%0,%%r10,1),%%xmm4 \n" 2214 "movdqa 0x10(%0,%%r10,1),%%xmm5 \n" 2215 "lea (%0,%3,4),%%r11 \n" 2216 "lea 0x20(%0),%0 \n" 2217 "pavgb %%xmm4,%%xmm2 \n" 2218 "pavgb %%xmm5,%%xmm3 \n" 2219 "pavgb %%xmm2,%%xmm0 \n" 2220 "pavgb %%xmm3,%%xmm1 \n" 2221 "movdqa 0x0(%%r11),%%xmm2 \n" 2222 "movdqa 0x10(%%r11),%%xmm3 \n" 2223 "movdqa 0x0(%%r11,%3,1),%%xmm4 \n" 2224 "movdqa 0x10(%%r11,%3,1),%%xmm5 \n" 2225 "pavgb %%xmm4,%%xmm2 \n" 2226 "pavgb %%xmm5,%%xmm3 \n" 2227 "movdqa 0x0(%%r11,%3,2),%%xmm4 \n" 2228 "movdqa 0x10(%%r11,%3,2),%%xmm5 \n" 2229 "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n" 2230 "pavgb %%xmm6,%%xmm4 \n" 2231 "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n" 2232 "pavgb %%xmm6,%%xmm5 \n" 2233 "pavgb %%xmm4,%%xmm2 \n" 2234 "pavgb %%xmm5,%%xmm3 \n" 2235 "pavgb %%xmm2,%%xmm0 \n" 2236 "pavgb %%xmm3,%%xmm1 \n" 2237 "psadbw %%xmm7,%%xmm0 \n" 2238 "psadbw %%xmm7,%%xmm1 \n" 2239 "pshufd $0xd8,%%xmm0,%%xmm0 \n" 2240 "pshufd $0x8d,%%xmm1,%%xmm1 \n" 2241 "por %%xmm1,%%xmm0 \n" 2242 "psrlw $0x3,%%xmm0 \n" 2243 "packuswb %%xmm0,%%xmm0 \n" 2244 "packuswb %%xmm0,%%xmm0 \n" 2245 "movd %%xmm0,(%1) \n" 2246 "lea 0x4(%1),%1 \n" 2247 "sub $0x4,%2 \n" 2248 "ja 1b \n" 2249 : "+r"(src_ptr), // %0 2250 "+r"(dst_ptr), // %1 2251 "+r"(dst_width) // %2 2252 : "r"((intptr_t)(src_stride)) // %3 2253 : "memory", "cc", "r10", "r11", "xmm6", "xmm7" 2254 ); 2255 } 2256 2257 #define HAS_SCALEROWDOWN34_SSSE3 2258 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, 2259 uint8* dst_ptr, int dst_width) { 2260 asm volatile ( 2261 "movdqa (%3),%%xmm3 \n" 2262 "movdqa (%4),%%xmm4 \n" 2263 "movdqa (%5),%%xmm5 \n" 2264 "1:" 2265 "movdqa (%0),%%xmm0 \n" 2266 "movdqa 0x10(%0),%%xmm2 \n" 2267 "lea 0x20(%0),%0 \n" 2268 "movdqa %%xmm2,%%xmm1 \n" 2269 "palignr $0x8,%%xmm0,%%xmm1 \n" 2270 "pshufb %%xmm3,%%xmm0 \n" 2271 "pshufb %%xmm4,%%xmm1 \n" 2272 "pshufb %%xmm5,%%xmm2 \n" 2273 "movq %%xmm0,(%1) \n" 2274 "movq %%xmm1,0x8(%1) \n" 2275 "movq %%xmm2,0x10(%1) \n" 2276 "lea 0x18(%1),%1 \n" 2277 "sub $0x18,%2 \n" 2278 "ja 1b \n" 2279 : "+r"(src_ptr), // %0 2280 "+r"(dst_ptr), // %1 2281 "+r"(dst_width) // %2 2282 : "r"(_shuf0), // %3 2283 "r"(_shuf1), // %4 2284 "r"(_shuf2) // %5 2285 : "memory", "cc" 2286 ); 2287 } 2288 2289 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, 2290 uint8* dst_ptr, int dst_width) { 2291 asm volatile ( 2292 "movdqa (%4),%%xmm2 \n" // _shuf01 2293 "movdqa (%5),%%xmm3 \n" // _shuf11 2294 "movdqa (%6),%%xmm4 \n" // _shuf21 2295 "movdqa (%7),%%xmm5 \n" // _madd01 2296 "movdqa (%8),%%xmm6 \n" // _madd11 2297 "movdqa (%9),%%xmm7 \n" // _round34 2298 "movdqa (%10),%%xmm8 \n" // _madd21 2299 "1:" 2300 "movdqa (%0),%%xmm0 \n" 2301 "movdqa (%0,%3),%%xmm1 \n" 2302 "pavgb %%xmm1,%%xmm0 \n" 2303 "pshufb %%xmm2,%%xmm0 \n" 2304 "pmaddubsw %%xmm5,%%xmm0 \n" 2305 "paddsw %%xmm7,%%xmm0 \n" 2306 "psrlw $0x2,%%xmm0 \n" 2307 "packuswb %%xmm0,%%xmm0 \n" 2308 "movq %%xmm0,(%1) \n" 2309 "movdqu 0x8(%0),%%xmm0 \n" 2310 "movdqu 0x8(%0,%3),%%xmm1 \n" 2311 "pavgb %%xmm1,%%xmm0 \n" 2312 "pshufb %%xmm3,%%xmm0 \n" 2313 "pmaddubsw %%xmm6,%%xmm0 \n" 2314 "paddsw %%xmm7,%%xmm0 \n" 2315 "psrlw $0x2,%%xmm0 \n" 2316 "packuswb %%xmm0,%%xmm0 \n" 2317 "movq %%xmm0,0x8(%1) \n" 2318 "movdqa 0x10(%0),%%xmm0 \n" 2319 "movdqa 0x10(%0,%3),%%xmm1 \n" 2320 "lea 0x20(%0),%0 \n" 2321 "pavgb %%xmm1,%%xmm0 \n" 2322 "pshufb %%xmm4,%%xmm0 \n" 2323 "pmaddubsw %%xmm8,%%xmm0 \n" 2324 "paddsw %%xmm7,%%xmm0 \n" 2325 "psrlw $0x2,%%xmm0 \n" 2326 "packuswb %%xmm0,%%xmm0 \n" 2327 "movq %%xmm0,0x10(%1) \n" 2328 "lea 0x18(%1),%1 \n" 2329 "sub $0x18,%2 \n" 2330 "ja 1b \n" 2331 : "+r"(src_ptr), // %0 2332 "+r"(dst_ptr), // %1 2333 "+r"(dst_width) // %2 2334 : "r"((intptr_t)(src_stride)), // %3 2335 "r"(_shuf01), // %4 2336 "r"(_shuf11), // %5 2337 "r"(_shuf21), // %6 2338 "r"(_madd01), // %7 2339 "r"(_madd11), // %8 2340 "r"(_round34), // %9 2341 "r"(_madd21) // %10 2342 : "memory", "cc", "xmm6", "xmm7", "xmm8" 2343 ); 2344 } 2345 2346 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, 2347 uint8* dst_ptr, int dst_width) { 2348 asm volatile ( 2349 "movdqa (%4),%%xmm2 \n" // _shuf01 2350 "movdqa (%5),%%xmm3 \n" // _shuf11 2351 "movdqa (%6),%%xmm4 \n" // _shuf21 2352 "movdqa (%7),%%xmm5 \n" // _madd01 2353 "movdqa (%8),%%xmm6 \n" // _madd11 2354 "movdqa (%9),%%xmm7 \n" // _round34 2355 "movdqa (%10),%%xmm8 \n" // _madd21 2356 "1:" 2357 "movdqa (%0),%%xmm0 \n" 2358 "movdqa (%0,%3,1),%%xmm1 \n" 2359 "pavgb %%xmm0,%%xmm1 \n" 2360 "pavgb %%xmm1,%%xmm0 \n" 2361 "pshufb %%xmm2,%%xmm0 \n" 2362 "pmaddubsw %%xmm5,%%xmm0 \n" 2363 "paddsw %%xmm7,%%xmm0 \n" 2364 "psrlw $0x2,%%xmm0 \n" 2365 "packuswb %%xmm0,%%xmm0 \n" 2366 "movq %%xmm0,(%1) \n" 2367 "movdqu 0x8(%0),%%xmm0 \n" 2368 "movdqu 0x8(%0,%3,1),%%xmm1 \n" 2369 "pavgb %%xmm0,%%xmm1 \n" 2370 "pavgb %%xmm1,%%xmm0 \n" 2371 "pshufb %%xmm3,%%xmm0 \n" 2372 "pmaddubsw %%xmm6,%%xmm0 \n" 2373 "paddsw %%xmm7,%%xmm0 \n" 2374 "psrlw $0x2,%%xmm0 \n" 2375 "packuswb %%xmm0,%%xmm0 \n" 2376 "movq %%xmm0,0x8(%1) \n" 2377 "movdqa 0x10(%0),%%xmm0 \n" 2378 "movdqa 0x10(%0,%3,1),%%xmm1 \n" 2379 "lea 0x20(%0),%0 \n" 2380 "pavgb %%xmm0,%%xmm1 \n" 2381 "pavgb %%xmm1,%%xmm0 \n" 2382 "pshufb %%xmm4,%%xmm0 \n" 2383 "pmaddubsw %%xmm8,%%xmm0 \n" 2384 "paddsw %%xmm7,%%xmm0 \n" 2385 "psrlw $0x2,%%xmm0 \n" 2386 "packuswb %%xmm0,%%xmm0 \n" 2387 "movq %%xmm0,0x10(%1) \n" 2388 "lea 0x18(%1),%1 \n" 2389 "sub $0x18,%2 \n" 2390 "ja 1b \n" 2391 : "+r"(src_ptr), // %0 2392 "+r"(dst_ptr), // %1 2393 "+r"(dst_width) // %2 2394 : "r"((intptr_t)(src_stride)), // %3 2395 "r"(_shuf01), // %4 2396 "r"(_shuf11), // %5 2397 "r"(_shuf21), // %6 2398 "r"(_madd01), // %7 2399 "r"(_madd11), // %8 2400 "r"(_round34), // %9 2401 "r"(_madd21) // %10 2402 : "memory", "cc", "xmm6", "xmm7", "xmm8" 2403 ); 2404 } 2405 2406 #define HAS_SCALEROWDOWN38_SSSE3 2407 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, 2408 uint8* dst_ptr, int dst_width) { 2409 asm volatile ( 2410 "movdqa (%3),%%xmm4 \n" 2411 "movdqa (%4),%%xmm5 \n" 2412 "1:" 2413 "movdqa (%0),%%xmm0 \n" 2414 "movdqa 0x10(%0),%%xmm1 \n" 2415 "lea 0x20(%0),%0 \n" 2416 "pshufb %%xmm4,%%xmm0 \n" 2417 "pshufb %%xmm5,%%xmm1 \n" 2418 "paddusb %%xmm1,%%xmm0 \n" 2419 "movq %%xmm0,(%1) \n" 2420 "movhlps %%xmm0,%%xmm1 \n" 2421 "movd %%xmm1,0x8(%1) \n" 2422 "lea 0xc(%1),%1 \n" 2423 "sub $0xc,%2 \n" 2424 "ja 1b \n" 2425 : "+r"(src_ptr), // %0 2426 "+r"(dst_ptr), // %1 2427 "+r"(dst_width) // %2 2428 : "r"(_shuf38a), // %3 2429 "r"(_shuf38b) // %4 2430 : "memory", "cc" 2431 ); 2432 } 2433 2434 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, 2435 uint8* dst_ptr, int dst_width) { 2436 asm volatile ( 2437 "movdqa (%4),%%xmm4 \n" 2438 "movdqa (%5),%%xmm5 \n" 2439 "movdqa (%6),%%xmm6 \n" 2440 "pxor %%xmm7,%%xmm7 \n" 2441 "1:" 2442 "movdqa (%0),%%xmm0 \n" 2443 "movdqa (%0,%3,1),%%xmm2 \n" 2444 "movhlps %%xmm0,%%xmm1 \n" 2445 "movhlps %%xmm2,%%xmm3 \n" 2446 "punpcklbw %%xmm7,%%xmm0 \n" 2447 "punpcklbw %%xmm7,%%xmm1 \n" 2448 "punpcklbw %%xmm7,%%xmm2 \n" 2449 "punpcklbw %%xmm7,%%xmm3 \n" 2450 "paddusw %%xmm2,%%xmm0 \n" 2451 "paddusw %%xmm3,%%xmm1 \n" 2452 "movdqa (%0,%3,2),%%xmm2 \n" 2453 "lea 0x10(%0),%0 \n" 2454 "movhlps %%xmm2,%%xmm3 \n" 2455 "punpcklbw %%xmm7,%%xmm2 \n" 2456 "punpcklbw %%xmm7,%%xmm3 \n" 2457 "paddusw %%xmm2,%%xmm0 \n" 2458 "paddusw %%xmm3,%%xmm1 \n" 2459 "movdqa %%xmm0,%%xmm2 \n" 2460 "psrldq $0x2,%%xmm0 \n" 2461 "paddusw %%xmm0,%%xmm2 \n" 2462 "psrldq $0x2,%%xmm0 \n" 2463 "paddusw %%xmm0,%%xmm2 \n" 2464 "pshufb %%xmm4,%%xmm2 \n" 2465 "movdqa %%xmm1,%%xmm3 \n" 2466 "psrldq $0x2,%%xmm1 \n" 2467 "paddusw %%xmm1,%%xmm3 \n" 2468 "psrldq $0x2,%%xmm1 \n" 2469 "paddusw %%xmm1,%%xmm3 \n" 2470 "pshufb %%xmm5,%%xmm3 \n" 2471 "paddusw %%xmm3,%%xmm2 \n" 2472 "pmulhuw %%xmm6,%%xmm2 \n" 2473 "packuswb %%xmm2,%%xmm2 \n" 2474 "movd %%xmm2,(%1) \n" 2475 "pextrw $0x2,%%xmm2,%%eax \n" 2476 "mov %%ax,0x4(%1) \n" 2477 "lea 0x6(%1),%1 \n" 2478 "sub $0x6,%2 \n" 2479 "ja 1b \n" 2480 : "+r"(src_ptr), // %0 2481 "+r"(dst_ptr), // %1 2482 "+r"(dst_width) // %2 2483 : "r"((intptr_t)(src_stride)), // %3 2484 "r"(_shufac0), // %4 2485 "r"(_shufac3), // %5 2486 "r"(_scaleac3) // %6 2487 : "memory", "cc", "rax", "xmm6", "xmm7" 2488 ); 2489 } 2490 2491 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, 2492 uint8* dst_ptr, int dst_width) { 2493 asm volatile ( 2494 "movdqa (%4),%%xmm4 \n" 2495 "movdqa (%5),%%xmm5 \n" 2496 "movdqa (%6),%%xmm6 \n" 2497 "movdqa (%7),%%xmm7 \n" 2498 "1:" 2499 "movdqa (%0),%%xmm2 \n" 2500 "pavgb (%0,%3,1),%%xmm2 \n" 2501 "lea 0x10(%0),%0 \n" 2502 "movdqa %%xmm2,%%xmm0 \n" 2503 "pshufb %%xmm4,%%xmm0 \n" 2504 "movdqa %%xmm2,%%xmm1 \n" 2505 "pshufb %%xmm5,%%xmm1 \n" 2506 "paddusw %%xmm1,%%xmm0 \n" 2507 "pshufb %%xmm6,%%xmm2 \n" 2508 "paddusw %%xmm2,%%xmm0 \n" 2509 "pmulhuw %%xmm7,%%xmm0 \n" 2510 "packuswb %%xmm0,%%xmm0 \n" 2511 "movd %%xmm0,(%1) \n" 2512 "pextrw $0x2,%%xmm0,%%eax \n" 2513 "mov %%ax,0x4(%1) \n" 2514 "lea 0x6(%1),%1 \n" 2515 "sub $0x6,%2 \n" 2516 "ja 1b \n" 2517 : "+r"(src_ptr), // %0 2518 "+r"(dst_ptr), // %1 2519 "+r"(dst_width) // %2 2520 : "r"((intptr_t)(src_stride)), // %3 2521 "r"(_shufab0), // %4 2522 "r"(_shufab1), // %5 2523 "r"(_shufab2), // %6 2524 "r"(_scaleab2) // %7 2525 : "memory", "cc", "rax", "xmm6", "xmm7" 2526 ); 2527 } 2528 2529 #define HAS_SCALEADDROWS_SSE2 2530 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, 2531 uint16* dst_ptr, int src_width, 2532 int src_height) { 2533 asm volatile ( 2534 "pxor %%xmm5,%%xmm5 \n" 2535 "1:" 2536 "movdqa (%0),%%xmm2 \n" 2537 "lea (%0,%4,1),%%r10 \n" 2538 "movhlps %%xmm2,%%xmm3 \n" 2539 "lea -0x1(%3),%%r11 \n" 2540 "punpcklbw %%xmm5,%%xmm2 \n" 2541 "punpcklbw %%xmm5,%%xmm3 \n" 2542 2543 "2:" 2544 "movdqa (%%r10),%%xmm0 \n" 2545 "lea (%%r10,%4,1),%%r10 \n" 2546 "movhlps %%xmm0,%%xmm1 \n" 2547 "punpcklbw %%xmm5,%%xmm0 \n" 2548 "punpcklbw %%xmm5,%%xmm1 \n" 2549 "paddusw %%xmm0,%%xmm2 \n" 2550 "paddusw %%xmm1,%%xmm3 \n" 2551 "sub $0x1,%%r11 \n" 2552 "ja 2b \n" 2553 2554 "movdqa %%xmm2,(%1) \n" 2555 "movdqa %%xmm3,0x10(%1) \n" 2556 "lea 0x20(%1),%1 \n" 2557 "lea 0x10(%0),%0 \n" 2558 "sub $0x10,%2 \n" 2559 "ja 1b \n" 2560 : "+r"(src_ptr), // %0 2561 "+r"(dst_ptr), // %1 2562 "+r"(src_width), // %2 2563 "+r"(src_height) // %3 2564 : "r"((intptr_t)(src_stride)) // %4 2565 : "memory", "cc", "r10", "r11" 2566 ); 2567 } 2568 2569 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version 2570 #define HAS_SCALEFILTERROWS_SSE2 2571 static void ScaleFilterRows_SSE2(uint8* dst_ptr, 2572 const uint8* src_ptr, int src_stride, 2573 int dst_width, int source_y_fraction) { 2574 if (source_y_fraction == 0) { 2575 asm volatile ( 2576 "1:" 2577 "movdqa (%1),%%xmm0 \n" 2578 "lea 0x10(%1),%1 \n" 2579 "movdqa %%xmm0,(%0) \n" 2580 "lea 0x10(%0),%0 \n" 2581 "sub $0x10,%2 \n" 2582 "ja 1b \n" 2583 "mov -0x1(%0),%%al \n" 2584 "mov %%al,(%0) \n" 2585 : "+r"(dst_ptr), // %0 2586 "+r"(src_ptr), // %1 2587 "+r"(dst_width) // %2 2588 : 2589 : "memory", "cc", "rax" 2590 ); 2591 return; 2592 } else if (source_y_fraction == 128) { 2593 asm volatile ( 2594 "1:" 2595 "movdqa (%1),%%xmm0 \n" 2596 "movdqa (%1,%3,1),%%xmm2 \n" 2597 "lea 0x10(%1),%1 \n" 2598 "pavgb %%xmm2,%%xmm0 \n" 2599 "movdqa %%xmm0,(%0) \n" 2600 "lea 0x10(%0),%0 \n" 2601 "sub $0x10,%2 \n" 2602 "ja 1b \n" 2603 "mov -0x1(%0),%%al \n" 2604 "mov %%al,(%0) \n" 2605 : "+r"(dst_ptr), // %0 2606 "+r"(src_ptr), // %1 2607 "+r"(dst_width) // %2 2608 : "r"((intptr_t)(src_stride)) // %3 2609 : "memory", "cc", "rax" 2610 ); 2611 return; 2612 } else { 2613 asm volatile ( 2614 "mov %3,%%eax \n" 2615 "movd %%eax,%%xmm6 \n" 2616 "punpcklwd %%xmm6,%%xmm6 \n" 2617 "pshufd $0x0,%%xmm6,%%xmm6 \n" 2618 "neg %%eax \n" 2619 "add $0x100,%%eax \n" 2620 "movd %%eax,%%xmm5 \n" 2621 "punpcklwd %%xmm5,%%xmm5 \n" 2622 "pshufd $0x0,%%xmm5,%%xmm5 \n" 2623 "pxor %%xmm7,%%xmm7 \n" 2624 "1:" 2625 "movdqa (%1),%%xmm0 \n" 2626 "movdqa (%1,%4,1),%%xmm2 \n" 2627 "lea 0x10(%1),%1 \n" 2628 "movdqa %%xmm0,%%xmm1 \n" 2629 "movdqa %%xmm2,%%xmm3 \n" 2630 "punpcklbw %%xmm7,%%xmm0 \n" 2631 "punpcklbw %%xmm7,%%xmm2 \n" 2632 "punpckhbw %%xmm7,%%xmm1 \n" 2633 "punpckhbw %%xmm7,%%xmm3 \n" 2634 "pmullw %%xmm5,%%xmm0 \n" 2635 "pmullw %%xmm5,%%xmm1 \n" 2636 "pmullw %%xmm6,%%xmm2 \n" 2637 "pmullw %%xmm6,%%xmm3 \n" 2638 "paddusw %%xmm2,%%xmm0 \n" 2639 "paddusw %%xmm3,%%xmm1 \n" 2640 "psrlw $0x8,%%xmm0 \n" 2641 "psrlw $0x8,%%xmm1 \n" 2642 "packuswb %%xmm1,%%xmm0 \n" 2643 "movdqa %%xmm0,(%0) \n" 2644 "lea 0x10(%0),%0 \n" 2645 "sub $0x10,%2 \n" 2646 "ja 1b \n" 2647 "mov -0x1(%0),%%al \n" 2648 "mov %%al,(%0) \n" 2649 : "+r"(dst_ptr), // %0 2650 "+r"(src_ptr), // %1 2651 "+r"(dst_width), // %2 2652 "+r"(source_y_fraction) // %3 2653 : "r"((intptr_t)(src_stride)) // %4 2654 : "memory", "cc", "rax", "xmm6", "xmm7" 2655 ); 2656 } 2657 return; 2658 } 2659 2660 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version 2661 #define HAS_SCALEFILTERROWS_SSSE3 2662 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, 2663 const uint8* src_ptr, int src_stride, 2664 int dst_width, int source_y_fraction) { 2665 source_y_fraction >>= 1; 2666 if (source_y_fraction == 0) { 2667 asm volatile ( 2668 "1:" 2669 "movdqa (%1),%%xmm0 \n" 2670 "lea 0x10(%1),%1 \n" 2671 "movdqa %%xmm0,(%0) \n" 2672 "lea 0x10(%0),%0 \n" 2673 "sub $0x10,%2 \n" 2674 "ja 1b \n" 2675 "mov -0x1(%0),%%al \n" 2676 "mov %%al,(%0) \n" 2677 : "+r"(dst_ptr), // %0 2678 "+r"(src_ptr), // %1 2679 "+r"(dst_width) // %2 2680 : 2681 : "memory", "cc", "rax" 2682 ); 2683 return; 2684 } else if (source_y_fraction == 64) { 2685 asm volatile ( 2686 "1:" 2687 "movdqa (%1),%%xmm0 \n" 2688 "movdqa (%1,%3,1),%%xmm2 \n" 2689 "lea 0x10(%1),%1 \n" 2690 "pavgb %%xmm2,%%xmm0 \n" 2691 "movdqa %%xmm0,(%0) \n" 2692 "lea 0x10(%0),%0 \n" 2693 "sub $0x10,%2 \n" 2694 "ja 1b \n" 2695 "mov -0x1(%0),%%al \n" 2696 "mov %%al,(%0) \n" 2697 : "+r"(dst_ptr), // %0 2698 "+r"(src_ptr), // %1 2699 "+r"(dst_width) // %2 2700 : "r"((intptr_t)(src_stride)) // %3 2701 : "memory", "cc", "rax" 2702 ); 2703 return; 2704 } else { 2705 asm volatile ( 2706 "mov %3,%%eax \n" 2707 "mov %%al,%%ah \n" 2708 "neg %%al \n" 2709 "add $0x80,%%al \n" 2710 "movd %%eax,%%xmm5 \n" 2711 "punpcklwd %%xmm5,%%xmm5 \n" 2712 "pshufd $0x0,%%xmm5,%%xmm5 \n" 2713 "1:" 2714 "movdqa (%1),%%xmm0 \n" 2715 "movdqa (%1,%4,1),%%xmm2 \n" 2716 "lea 0x10(%1),%1 \n" 2717 "movdqa %%xmm0,%%xmm1 \n" 2718 "punpcklbw %%xmm2,%%xmm0 \n" 2719 "punpckhbw %%xmm2,%%xmm1 \n" 2720 "pmaddubsw %%xmm5,%%xmm0 \n" 2721 "pmaddubsw %%xmm5,%%xmm1 \n" 2722 "psrlw $0x7,%%xmm0 \n" 2723 "psrlw $0x7,%%xmm1 \n" 2724 "packuswb %%xmm1,%%xmm0 \n" 2725 "movdqa %%xmm0,(%0) \n" 2726 "lea 0x10(%0),%0 \n" 2727 "sub $0x10,%2 \n" 2728 "ja 1b \n" 2729 "mov -0x1(%0),%%al \n" 2730 "mov %%al,(%0) \n" 2731 : "+r"(dst_ptr), // %0 2732 "+r"(src_ptr), // %1 2733 "+r"(dst_width), // %2 2734 "+r"(source_y_fraction) // %3 2735 : "r"((intptr_t)(src_stride)) // %4 2736 : "memory", "cc", "rax" 2737 ); 2738 } 2739 return; 2740 } 2741 #endif 2742 #endif 2743 2744 // CPU agnostic row functions 2745 static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride, 2746 uint8* dst, int dst_width) { 2747 int x; 2748 for (x = 0; x < dst_width; ++x) { 2749 *dst++ = *src_ptr; 2750 src_ptr += 2; 2751 } 2752 } 2753 2754 static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, 2755 uint8* dst, int dst_width) { 2756 int x; 2757 for (x = 0; x < dst_width; ++x) { 2758 *dst++ = (src_ptr[0] + src_ptr[1] + 2759 src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; 2760 src_ptr += 2; 2761 } 2762 } 2763 2764 static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride, 2765 uint8* dst, int dst_width) { 2766 int x; 2767 for (x = 0; x < dst_width; ++x) { 2768 *dst++ = *src_ptr; 2769 src_ptr += 4; 2770 } 2771 } 2772 2773 static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, 2774 uint8* dst, int dst_width) { 2775 int x; 2776 for (x = 0; x < dst_width; ++x) { 2777 *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + 2778 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + 2779 src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + 2780 src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + 2781 src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + 2782 src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + 2783 src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 2784 8) >> 4; 2785 src_ptr += 4; 2786 } 2787 } 2788 2789 // 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. 2790 // Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. 2791 // The following 2 lines cause error on Windows. 2792 //static const int kMaxOutputWidth = 640; 2793 //static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2; 2794 #define kMaxOutputWidth 640 2795 #define kMaxRow12 1280 2796 2797 static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride, 2798 uint8* dst, int dst_width) { 2799 int x; 2800 for (x = 0; x < dst_width; ++x) { 2801 *dst++ = *src_ptr; 2802 src_ptr += 8; 2803 } 2804 } 2805 2806 // Note calling code checks width is less than max and if not 2807 // uses ScaleRowDown8_C instead. 2808 static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, 2809 uint8* dst, int dst_width) { 2810 ALIGN16(uint8 src_row[kMaxRow12 * 2]); 2811 assert(dst_width <= kMaxOutputWidth); 2812 ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); 2813 ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, 2814 src_row + kMaxOutputWidth, 2815 dst_width * 2); 2816 ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); 2817 } 2818 2819 static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride, 2820 uint8* dst, int dst_width) { 2821 uint8* dend; 2822 assert((dst_width % 3 == 0) && (dst_width > 0)); 2823 dend = dst + dst_width; 2824 do { 2825 dst[0] = src_ptr[0]; 2826 dst[1] = src_ptr[1]; 2827 dst[2] = src_ptr[3]; 2828 dst += 3; 2829 src_ptr += 4; 2830 } while (dst < dend); 2831 } 2832 2833 // Filter rows 0 and 1 together, 3 : 1 2834 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, 2835 uint8* d, int dst_width) { 2836 uint8* dend; 2837 const uint8* s; 2838 const uint8* t; 2839 assert((dst_width % 3 == 0) && (dst_width > 0)); 2840 dend = d + dst_width; 2841 s = src_ptr; 2842 t = src_ptr + src_stride; 2843 do { 2844 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; 2845 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; 2846 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; 2847 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; 2848 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; 2849 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; 2850 d[0] = (a0 * 3 + b0 + 2) >> 2; 2851 d[1] = (a1 * 3 + b1 + 2) >> 2; 2852 d[2] = (a2 * 3 + b2 + 2) >> 2; 2853 d += 3; 2854 s += 4; 2855 t += 4; 2856 } while (d < dend); 2857 } 2858 2859 // Filter rows 1 and 2 together, 1 : 1 2860 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, 2861 uint8* d, int dst_width) { 2862 uint8* dend; 2863 const uint8* s; 2864 const uint8* t; 2865 assert((dst_width % 3 == 0) && (dst_width > 0)); 2866 dend = d + dst_width; 2867 s = src_ptr; 2868 t = src_ptr + src_stride; 2869 do { 2870 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; 2871 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; 2872 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; 2873 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; 2874 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; 2875 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; 2876 d[0] = (a0 + b0 + 1) >> 1; 2877 d[1] = (a1 + b1 + 1) >> 1; 2878 d[2] = (a2 + b2 + 1) >> 1; 2879 d += 3; 2880 s += 4; 2881 t += 4; 2882 } while (d < dend); 2883 } 2884 2885 #if defined(HAS_SCALEFILTERROWS_SSE2) 2886 // Filter row to 3/4 2887 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, 2888 int dst_width) { 2889 uint8* dend; 2890 const uint8* s; 2891 assert((dst_width % 3 == 0) && (dst_width > 0)); 2892 dend = dst_ptr + dst_width; 2893 s = src_ptr; 2894 do { 2895 dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; 2896 dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; 2897 dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; 2898 dst_ptr += 3; 2899 s += 4; 2900 } while (dst_ptr < dend); 2901 } 2902 #endif 2903 2904 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, 2905 int dst_width, int dx) { 2906 int x = 0; 2907 int j; 2908 for (j = 0; j < dst_width; ++j) { 2909 int xi = x >> 16; 2910 int xf1 = x & 0xffff; 2911 int xf0 = 65536 - xf1; 2912 2913 *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; 2914 x += dx; 2915 } 2916 } 2917 2918 //Not work on Windows 2919 //static const int kMaxInputWidth = 2560; 2920 #define kMaxInputWidth 2560 2921 #if defined(HAS_SCALEFILTERROWS_SSE2) 2922 #define HAS_SCALEROWDOWN34_SSE2 2923 // Filter rows 0 and 1 together, 3 : 1 2924 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, 2925 uint8* dst_ptr, int dst_width) { 2926 ALIGN16(uint8 row[kMaxInputWidth]); 2927 assert((dst_width % 3 == 0) && (dst_width > 0)); 2928 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); 2929 ScaleFilterCols34_C(dst_ptr, row, dst_width); 2930 } 2931 2932 // Filter rows 1 and 2 together, 1 : 1 2933 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, 2934 uint8* dst_ptr, int dst_width) { 2935 ALIGN16(uint8 row[kMaxInputWidth]); 2936 assert((dst_width % 3 == 0) && (dst_width > 0)); 2937 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); 2938 ScaleFilterCols34_C(dst_ptr, row, dst_width); 2939 } 2940 #endif 2941 2942 static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride, 2943 uint8* dst, int dst_width) { 2944 int x; 2945 assert(dst_width % 3 == 0); 2946 for (x = 0; x < dst_width; x += 3) { 2947 dst[0] = src_ptr[0]; 2948 dst[1] = src_ptr[3]; 2949 dst[2] = src_ptr[6]; 2950 dst += 3; 2951 src_ptr += 8; 2952 } 2953 } 2954 2955 // 8x3 -> 3x1 2956 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, 2957 uint8* dst_ptr, int dst_width) { 2958 int i; 2959 assert((dst_width % 3 == 0) && (dst_width > 0)); 2960 for (i = 0; i < dst_width; i+=3) { 2961 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + 2962 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + 2963 src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + 2964 src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * 2965 (65536 / 9) >> 16; 2966 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + 2967 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + 2968 src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + 2969 src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * 2970 (65536 / 9) >> 16; 2971 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + 2972 src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + 2973 src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * 2974 (65536 / 6) >> 16; 2975 src_ptr += 8; 2976 dst_ptr += 3; 2977 } 2978 } 2979 2980 // 8x2 -> 3x1 2981 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, 2982 uint8* dst_ptr, int dst_width) { 2983 int i; 2984 assert((dst_width % 3 == 0) && (dst_width > 0)); 2985 for (i = 0; i < dst_width; i+=3) { 2986 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + 2987 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + 2988 src_ptr[src_stride + 2]) * (65536 / 6) >> 16; 2989 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + 2990 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + 2991 src_ptr[src_stride + 5]) * (65536 / 6) >> 16; 2992 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + 2993 src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * 2994 (65536 / 4) >> 16; 2995 src_ptr += 8; 2996 dst_ptr += 3; 2997 } 2998 } 2999 3000 // C version 8x2 -> 8x1 3001 static void ScaleFilterRows_C(uint8* dst_ptr, 3002 const uint8* src_ptr, int src_stride, 3003 int dst_width, int source_y_fraction) { 3004 int y1_fraction; 3005 int y0_fraction; 3006 const uint8* src_ptr1; 3007 uint8* end; 3008 assert(dst_width > 0); 3009 y1_fraction = source_y_fraction; 3010 y0_fraction = 256 - y1_fraction; 3011 src_ptr1 = src_ptr + src_stride; 3012 end = dst_ptr + dst_width; 3013 do { 3014 dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; 3015 dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; 3016 dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; 3017 dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; 3018 dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; 3019 dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; 3020 dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; 3021 dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; 3022 src_ptr += 8; 3023 src_ptr1 += 8; 3024 dst_ptr += 8; 3025 } while (dst_ptr < end); 3026 dst_ptr[0] = dst_ptr[-1]; 3027 } 3028 3029 void ScaleAddRows_C(const uint8* src_ptr, int src_stride, 3030 uint16* dst_ptr, int src_width, int src_height) { 3031 int x,y; 3032 assert(src_width > 0); 3033 assert(src_height > 0); 3034 for (x = 0; x < src_width; ++x) { 3035 const uint8* s = src_ptr + x; 3036 int sum = 0; 3037 for (y = 0; y < src_height; ++y) { 3038 sum += s[0]; 3039 s += src_stride; 3040 } 3041 dst_ptr[x] = sum; 3042 } 3043 } 3044 3045 /** 3046 * Scale plane, 1/2 3047 * 3048 * This is an optimized version for scaling down a plane to 1/2 of 3049 * its original size. 3050 * 3051 */ 3052 static void ScalePlaneDown2(int src_width, int src_height, 3053 int dst_width, int dst_height, 3054 int src_stride, int dst_stride, 3055 const uint8* src_ptr, uint8* dst_ptr, 3056 FilterMode filtering) { 3057 void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, 3058 uint8* dst_ptr, int dst_width); 3059 assert(IS_ALIGNED(src_width, 2)); 3060 assert(IS_ALIGNED(src_height, 2)); 3061 3062 #if defined(HAS_SCALEROWDOWN2_NEON) 3063 if (TestCpuFlag(kCpuHasNEON) && 3064 IS_ALIGNED(dst_width, 16)) { 3065 ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; 3066 } else 3067 #endif 3068 #if defined(HAS_SCALEROWDOWN2_SSE2) 3069 if (TestCpuFlag(kCpuHasSSE2) && 3070 IS_ALIGNED(dst_width, 16) && 3071 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && 3072 IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { 3073 ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; 3074 } else 3075 #endif 3076 { 3077 ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; 3078 } 3079 3080 { 3081 int y; 3082 for (y = 0; y < dst_height; ++y) { 3083 ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); 3084 src_ptr += (src_stride << 1); 3085 dst_ptr += dst_stride; 3086 } 3087 } 3088 } 3089 3090 /** 3091 * Scale plane, 1/4 3092 * 3093 * This is an optimized version for scaling down a plane to 1/4 of 3094 * its original size. 3095 */ 3096 static void ScalePlaneDown4(int src_width, int src_height, 3097 int dst_width, int dst_height, 3098 int src_stride, int dst_stride, 3099 const uint8* src_ptr, uint8* dst_ptr, 3100 FilterMode filtering) { 3101 void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, 3102 uint8* dst_ptr, int dst_width); 3103 assert(IS_ALIGNED(src_width, 4)); 3104 assert(IS_ALIGNED(src_height, 4)); 3105 3106 #if defined(HAS_SCALEROWDOWN4_NEON) 3107 if (TestCpuFlag(kCpuHasNEON) && 3108 IS_ALIGNED(dst_width, 4)) { 3109 ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; 3110 } else 3111 #endif 3112 #if defined(HAS_SCALEROWDOWN4_SSE2) 3113 if (TestCpuFlag(kCpuHasSSE2) && 3114 IS_ALIGNED(dst_width, 8) && 3115 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && 3116 IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { 3117 ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; 3118 } else 3119 #endif 3120 { 3121 ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; 3122 } 3123 3124 { 3125 int y; 3126 for (y = 0; y < dst_height; ++y) { 3127 ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); 3128 src_ptr += (src_stride << 2); 3129 dst_ptr += dst_stride; 3130 } 3131 } 3132 } 3133 3134 /** 3135 * Scale plane, 1/8 3136 * 3137 * This is an optimized version for scaling down a plane to 1/8 3138 * of its original size. 3139 * 3140 */ 3141 static void ScalePlaneDown8(int src_width, int src_height, 3142 int dst_width, int dst_height, 3143 int src_stride, int dst_stride, 3144 const uint8* src_ptr, uint8* dst_ptr, 3145 FilterMode filtering) { 3146 void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, 3147 uint8* dst_ptr, int dst_width); 3148 assert(IS_ALIGNED(src_width, 8)); 3149 assert(IS_ALIGNED(src_height, 8)); 3150 3151 #if defined(HAS_SCALEROWDOWN8_SSE2) 3152 if (TestCpuFlag(kCpuHasSSE2) && 3153 IS_ALIGNED(dst_width, 4) && 3154 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && 3155 IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { 3156 ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; 3157 } else 3158 #endif 3159 { 3160 ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? 3161 ScaleRowDown8Int_C : ScaleRowDown8_C; 3162 } 3163 3164 { 3165 int y; 3166 for (y = 0; y < dst_height; ++y) { 3167 ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); 3168 src_ptr += (src_stride << 3); 3169 dst_ptr += dst_stride; 3170 } 3171 } 3172 } 3173 3174 /** 3175 * Scale plane down, 3/4 3176 * 3177 * Provided by Frank Barchard (fbarchard (at) google.com) 3178 * 3179 */ 3180 static void ScalePlaneDown34(int src_width, int src_height, 3181 int dst_width, int dst_height, 3182 int src_stride, int dst_stride, 3183 const uint8* src_ptr, uint8* dst_ptr, 3184 FilterMode filtering) { 3185 void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, 3186 uint8* dst_ptr, int dst_width); 3187 void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, 3188 uint8* dst_ptr, int dst_width); 3189 assert(dst_width % 3 == 0); 3190 #if defined(HAS_SCALEROWDOWN34_NEON) 3191 if (TestCpuFlag(kCpuHasNEON) && 3192 (dst_width % 24 == 0)) { 3193 if (!filtering) { 3194 ScaleRowDown34_0 = ScaleRowDown34_NEON; 3195 ScaleRowDown34_1 = ScaleRowDown34_NEON; 3196 } else { 3197 ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; 3198 ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; 3199 } 3200 } else 3201 #endif 3202 3203 #if defined(HAS_SCALEROWDOWN34_SSSE3) 3204 if (TestCpuFlag(kCpuHasSSSE3) && 3205 (dst_width % 24 == 0) && 3206 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && 3207 IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { 3208 if (!filtering) { 3209 ScaleRowDown34_0 = ScaleRowDown34_SSSE3; 3210 ScaleRowDown34_1 = ScaleRowDown34_SSSE3; 3211 } else { 3212 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; 3213 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; 3214 } 3215 } else 3216 #endif 3217 #if defined(HAS_SCALEROWDOWN34_SSE2) 3218 if (TestCpuFlag(kCpuHasSSE2) && 3219 (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && 3220 IS_ALIGNED(dst_stride, 8) && 3221 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && 3222 filtering) { 3223 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; 3224 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; 3225 } else 3226 #endif 3227 { 3228 if (!filtering) { 3229 ScaleRowDown34_0 = ScaleRowDown34_C; 3230 ScaleRowDown34_1 = ScaleRowDown34_C; 3231 } else { 3232 ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; 3233 ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; 3234 } 3235 } 3236 { 3237 int src_row = 0; 3238 int y; 3239 for (y = 0; y < dst_height; ++y) { 3240 switch (src_row) { 3241 case 0: 3242 ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); 3243 break; 3244 3245 case 1: 3246 ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); 3247 break; 3248 3249 case 2: 3250 ScaleRowDown34_0(src_ptr + src_stride, -src_stride, 3251 dst_ptr, dst_width); 3252 break; 3253 } 3254 ++src_row; 3255 src_ptr += src_stride; 3256 dst_ptr += dst_stride; 3257 if (src_row >= 3) { 3258 src_ptr += src_stride; 3259 src_row = 0; 3260 } 3261 } 3262 } 3263 } 3264 3265 /** 3266 * Scale plane, 3/8 3267 * 3268 * This is an optimized version for scaling down a plane to 3/8 3269 * of its original size. 3270 * 3271 * Reduces 16x3 to 6x1 3272 */ 3273 static void ScalePlaneDown38(int src_width, int src_height, 3274 int dst_width, int dst_height, 3275 int src_stride, int dst_stride, 3276 const uint8* src_ptr, uint8* dst_ptr, 3277 FilterMode filtering) { 3278 void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, 3279 uint8* dst_ptr, int dst_width); 3280 void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, 3281 uint8* dst_ptr, int dst_width); 3282 assert(dst_width % 3 == 0); 3283 #if defined(HAS_SCALEROWDOWN38_NEON) 3284 if (TestCpuFlag(kCpuHasNEON) && 3285 (dst_width % 12 == 0)) { 3286 if (!filtering) { 3287 ScaleRowDown38_3 = ScaleRowDown38_NEON; 3288 ScaleRowDown38_2 = ScaleRowDown38_NEON; 3289 } else { 3290 ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; 3291 ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; 3292 } 3293 } else 3294 #endif 3295 3296 #if defined(HAS_SCALEROWDOWN38_SSSE3) 3297 if (TestCpuFlag(kCpuHasSSSE3) && 3298 (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && 3299 IS_ALIGNED(dst_stride, 8) && 3300 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { 3301 if (!filtering) { 3302 ScaleRowDown38_3 = ScaleRowDown38_SSSE3; 3303 ScaleRowDown38_2 = ScaleRowDown38_SSSE3; 3304 } else { 3305 ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; 3306 ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; 3307 } 3308 } else 3309 #endif 3310 { 3311 if (!filtering) { 3312 ScaleRowDown38_3 = ScaleRowDown38_C; 3313 ScaleRowDown38_2 = ScaleRowDown38_C; 3314 } else { 3315 ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; 3316 ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; 3317 } 3318 } 3319 { 3320 int src_row = 0; 3321 int y; 3322 for (y = 0; y < dst_height; ++y) { 3323 switch (src_row) { 3324 case 0: 3325 case 1: 3326 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); 3327 src_ptr += src_stride * 3; 3328 ++src_row; 3329 break; 3330 3331 case 2: 3332 ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); 3333 src_ptr += src_stride * 2; 3334 src_row = 0; 3335 break; 3336 } 3337 dst_ptr += dst_stride; 3338 } 3339 } 3340 } 3341 3342 __inline static uint32 SumBox(int iboxwidth, int iboxheight, 3343 int src_stride, const uint8* src_ptr) { 3344 int x, y; 3345 uint32 sum; 3346 assert(iboxwidth > 0); 3347 assert(iboxheight > 0); 3348 sum = 0u; 3349 for (y = 0; y < iboxheight; ++y) { 3350 for (x = 0; x < iboxwidth; ++x) { 3351 sum += src_ptr[x]; 3352 } 3353 src_ptr += src_stride; 3354 } 3355 return sum; 3356 } 3357 3358 static void ScalePlaneBoxRow(int dst_width, int boxheight, 3359 int dx, int src_stride, 3360 const uint8* src_ptr, uint8* dst_ptr) { 3361 int x = 0; 3362 int i; 3363 for (i = 0; i < dst_width; ++i) { 3364 int ix = x >> 16; 3365 int boxwidth; 3366 x += dx; 3367 boxwidth = (x >> 16) - ix; 3368 *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / 3369 (boxwidth * boxheight); 3370 } 3371 } 3372 3373 __inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { 3374 uint32 sum; 3375 int x; 3376 assert(iboxwidth > 0); 3377 sum = 0u; 3378 for (x = 0; x < iboxwidth; ++x) { 3379 sum += src_ptr[x]; 3380 } 3381 return sum; 3382 } 3383 3384 static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, 3385 const uint16* src_ptr, uint8* dst_ptr) { 3386 int scaletbl[2]; 3387 int minboxwidth = (dx >> 16); 3388 scaletbl[0] = 65536 / (minboxwidth * boxheight); 3389 scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); 3390 { 3391 int *scaleptr = scaletbl - minboxwidth; 3392 int x = 0; 3393 int i; 3394 for (i = 0; i < dst_width; ++i) { 3395 int ix = x >> 16; 3396 int boxwidth; 3397 x += dx; 3398 boxwidth = (x >> 16) - ix; 3399 *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; 3400 } 3401 } 3402 } 3403 3404 static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, 3405 const uint16* src_ptr, uint8* dst_ptr) { 3406 int boxwidth = (dx >> 16); 3407 int scaleval = 65536 / (boxwidth * boxheight); 3408 int x = 0; 3409 int i; 3410 for (i = 0; i < dst_width; ++i) { 3411 *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; 3412 x += boxwidth; 3413 } 3414 } 3415 3416 /** 3417 * Scale plane down to any dimensions, with interpolation. 3418 * (boxfilter). 3419 * 3420 * Same method as SimpleScale, which is fixed point, outputting 3421 * one pixel of destination using fixed point (16.16) to step 3422 * through source, sampling a box of pixel with simple 3423 * averaging. 3424 */ 3425 static void ScalePlaneBox(int src_width, int src_height, 3426 int dst_width, int dst_height, 3427 int src_stride, int dst_stride, 3428 const uint8* src_ptr, uint8* dst_ptr) { 3429 int dx, dy; 3430 assert(dst_width > 0); 3431 assert(dst_height > 0); 3432 dy = (src_height << 16) / dst_height; 3433 dx = (src_width << 16) / dst_width; 3434 if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) || 3435 dst_height * 2 > src_height) { 3436 uint8* dst = dst_ptr; 3437 int dy = (src_height << 16) / dst_height; 3438 int dx = (src_width << 16) / dst_width; 3439 int y = 0; 3440 int j; 3441 for (j = 0; j < dst_height; ++j) { 3442 int iy = y >> 16; 3443 const uint8* const src = src_ptr + iy * src_stride; 3444 int boxheight; 3445 y += dy; 3446 if (y > (src_height << 16)) { 3447 y = (src_height << 16); 3448 } 3449 boxheight = (y >> 16) - iy; 3450 ScalePlaneBoxRow(dst_width, boxheight, 3451 dx, src_stride, 3452 src, dst); 3453 3454 dst += dst_stride; 3455 } 3456 } else { 3457 ALIGN16(uint16 row[kMaxInputWidth]); 3458 void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, 3459 uint16* dst_ptr, int src_width, int src_height); 3460 void (*ScaleAddCols)(int dst_width, int boxheight, int dx, 3461 const uint16* src_ptr, uint8* dst_ptr); 3462 #if defined(HAS_SCALEADDROWS_SSE2) 3463 if (TestCpuFlag(kCpuHasSSE2) && 3464 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && 3465 IS_ALIGNED(src_width, 16)) { 3466 ScaleAddRows = ScaleAddRows_SSE2; 3467 } else 3468 #endif 3469 { 3470 ScaleAddRows = ScaleAddRows_C; 3471 } 3472 if (dx & 0xffff) { 3473 ScaleAddCols = ScaleAddCols2_C; 3474 } else { 3475 ScaleAddCols = ScaleAddCols1_C; 3476 } 3477 3478 { 3479 int y = 0; 3480 int j; 3481 for (j = 0; j < dst_height; ++j) { 3482 int iy = y >> 16; 3483 const uint8* const src = src_ptr + iy * src_stride; 3484 int boxheight; 3485 y += dy; 3486 if (y > (src_height << 16)) { 3487 y = (src_height << 16); 3488 } 3489 boxheight = (y >> 16) - iy; 3490 ScaleAddRows(src, src_stride, row, src_width, boxheight); 3491 ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); 3492 dst_ptr += dst_stride; 3493 } 3494 } 3495 } 3496 } 3497 3498 /** 3499 * Scale plane to/from any dimensions, with interpolation. 3500 */ 3501 static void ScalePlaneBilinearSimple(int src_width, int src_height, 3502 int dst_width, int dst_height, 3503 int src_stride, int dst_stride, 3504 const uint8* src_ptr, uint8* dst_ptr) { 3505 int i, j; 3506 uint8* dst = dst_ptr; 3507 int dx = (src_width << 16) / dst_width; 3508 int dy = (src_height << 16) / dst_height; 3509 int maxx = ((src_width - 1) << 16) - 1; 3510 int maxy = ((src_height - 1) << 16) - 1; 3511 int y = (dst_height < src_height) ? 32768 : 3512 (src_height << 16) / dst_height - 32768; 3513 for (i = 0; i < dst_height; ++i) { 3514 int cy = (y < 0) ? 0 : y; 3515 int yi = cy >> 16; 3516 int yf = cy & 0xffff; 3517 const uint8* const src = src_ptr + yi * src_stride; 3518 int x = (dst_width < src_width) ? 32768 : 3519 (src_width << 16) / dst_width - 32768; 3520 for (j = 0; j < dst_width; ++j) { 3521 int cx = (x < 0) ? 0 : x; 3522 int xi = cx >> 16; 3523 int xf = cx & 0xffff; 3524 int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; 3525 int r1 = (src[xi + src_stride] * (65536 - xf) + 3526 src[xi + src_stride + 1] * xf) >> 16; 3527 *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; 3528 x += dx; 3529 if (x > maxx) 3530 x = maxx; 3531 } 3532 dst += dst_stride - dst_width; 3533 y += dy; 3534 if (y > maxy) 3535 y = maxy; 3536 } 3537 } 3538 3539 /** 3540 * Scale plane to/from any dimensions, with bilinear 3541 * interpolation. 3542 */ 3543 static void ScalePlaneBilinear(int src_width, int src_height, 3544 int dst_width, int dst_height, 3545 int src_stride, int dst_stride, 3546 const uint8* src_ptr, uint8* dst_ptr) { 3547 int dy; 3548 int dx; 3549 assert(dst_width > 0); 3550 assert(dst_height > 0); 3551 dy = (src_height << 16) / dst_height; 3552 dx = (src_width << 16) / dst_width; 3553 if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { 3554 ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, 3555 src_stride, dst_stride, src_ptr, dst_ptr); 3556 3557 } else { 3558 ALIGN16(uint8 row[kMaxInputWidth + 1]); 3559 void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, 3560 int src_stride, 3561 int dst_width, int source_y_fraction); 3562 void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, 3563 int dst_width, int dx); 3564 #if defined(HAS_SCALEFILTERROWS_SSSE3) 3565 if (TestCpuFlag(kCpuHasSSSE3) && 3566 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && 3567 IS_ALIGNED(src_width, 16)) { 3568 ScaleFilterRows = ScaleFilterRows_SSSE3; 3569 } else 3570 #endif 3571 #if defined(HAS_SCALEFILTERROWS_SSE2) 3572 if (TestCpuFlag(kCpuHasSSE2) && 3573 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && 3574 IS_ALIGNED(src_width, 16)) { 3575 ScaleFilterRows = ScaleFilterRows_SSE2; 3576 } else 3577 #endif 3578 { 3579 ScaleFilterRows = ScaleFilterRows_C; 3580 } 3581 ScaleFilterCols = ScaleFilterCols_C; 3582 3583 { 3584 int y = 0; 3585 int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. 3586 int j; 3587 for (j = 0; j < dst_height; ++j) { 3588 int iy = y >> 16; 3589 int fy = (y >> 8) & 255; 3590 const uint8* const src = src_ptr + iy * src_stride; 3591 ScaleFilterRows(row, src, src_stride, src_width, fy); 3592 ScaleFilterCols(dst_ptr, row, dst_width, dx); 3593 dst_ptr += dst_stride; 3594 y += dy; 3595 if (y > maxy) { 3596 y = maxy; 3597 } 3598 } 3599 } 3600 } 3601 } 3602 3603 /** 3604 * Scale plane to/from any dimensions, without interpolation. 3605 * Fixed point math is used for performance: The upper 16 bits 3606 * of x and dx is the integer part of the source position and 3607 * the lower 16 bits are the fixed decimal part. 3608 */ 3609 static void ScalePlaneSimple(int src_width, int src_height, 3610 int dst_width, int dst_height, 3611 int src_stride, int dst_stride, 3612 const uint8* src_ptr, uint8* dst_ptr) { 3613 uint8* dst = dst_ptr; 3614 int dx = (src_width << 16) / dst_width; 3615 int y; 3616 for (y = 0; y < dst_height; ++y) { 3617 const uint8* const src = src_ptr + (y * src_height / dst_height) * 3618 src_stride; 3619 // TODO(fbarchard): Round X coordinate by setting x=0x8000. 3620 int x = 0; 3621 int i; 3622 for (i = 0; i < dst_width; ++i) { 3623 *dst++ = src[x >> 16]; 3624 x += dx; 3625 } 3626 dst += dst_stride - dst_width; 3627 } 3628 } 3629 3630 /** 3631 * Scale plane to/from any dimensions. 3632 */ 3633 static void ScalePlaneAnySize(int src_width, int src_height, 3634 int dst_width, int dst_height, 3635 int src_stride, int dst_stride, 3636 const uint8* src_ptr, uint8* dst_ptr, 3637 FilterMode filtering) { 3638 if (!filtering) { 3639 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, 3640 src_stride, dst_stride, src_ptr, dst_ptr); 3641 } else { 3642 // fall back to non-optimized version 3643 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, 3644 src_stride, dst_stride, src_ptr, dst_ptr); 3645 } 3646 } 3647 3648 /** 3649 * Scale plane down, any size 3650 * 3651 * This is an optimized version for scaling down a plane to any size. 3652 * The current implementation is ~10 times faster compared to the 3653 * reference implementation for e.g. XGA->LowResPAL 3654 * 3655 */ 3656 static void ScalePlaneDown(int src_width, int src_height, 3657 int dst_width, int dst_height, 3658 int src_stride, int dst_stride, 3659 const uint8* src_ptr, uint8* dst_ptr, 3660 FilterMode filtering) { 3661 if (!filtering) { 3662 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, 3663 src_stride, dst_stride, src_ptr, dst_ptr); 3664 } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { 3665 // between 1/2x and 1x use bilinear 3666 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, 3667 src_stride, dst_stride, src_ptr, dst_ptr); 3668 } else { 3669 ScalePlaneBox(src_width, src_height, dst_width, dst_height, 3670 src_stride, dst_stride, src_ptr, dst_ptr); 3671 } 3672 } 3673 3674 /** 3675 * Copy plane, no scaling 3676 * 3677 * This simply copies the given plane without scaling. 3678 * The current implementation is ~115 times faster 3679 * compared to the reference implementation. 3680 * 3681 */ 3682 static void CopyPlane(int src_width, int src_height, 3683 int dst_width, int dst_height, 3684 int src_stride, int dst_stride, 3685 const uint8* src_ptr, uint8* dst_ptr) { 3686 if (src_stride == src_width && dst_stride == dst_width) { 3687 // All contiguous, so can use REALLY fast path. 3688 memcpy(dst_ptr, src_ptr, src_width * src_height); 3689 } else { 3690 // Not all contiguous; must copy scanlines individually 3691 const uint8* src = src_ptr; 3692 uint8* dst = dst_ptr; 3693 int i; 3694 for (i = 0; i < src_height; ++i) { 3695 memcpy(dst, src, src_width); 3696 dst += dst_stride; 3697 src += src_stride; 3698 } 3699 } 3700 } 3701 3702 static void ScalePlane(const uint8* src, int src_stride, 3703 int src_width, int src_height, 3704 uint8* dst, int dst_stride, 3705 int dst_width, int dst_height, 3706 FilterMode filtering, int use_ref) { 3707 // Use specialized scales to improve performance for common resolutions. 3708 // For example, all the 1/2 scalings will use ScalePlaneDown2() 3709 if (dst_width == src_width && dst_height == src_height) { 3710 // Straight copy. 3711 CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, 3712 dst_stride, src, dst); 3713 } else if (dst_width <= src_width && dst_height <= src_height) { 3714 // Scale down. 3715 if (use_ref) { 3716 // For testing, allow the optimized versions to be disabled. 3717 ScalePlaneDown(src_width, src_height, dst_width, dst_height, 3718 src_stride, dst_stride, src, dst, filtering); 3719 } else if (4 * dst_width == 3 * src_width && 3720 4 * dst_height == 3 * src_height) { 3721 // optimized, 3/4 3722 ScalePlaneDown34(src_width, src_height, dst_width, dst_height, 3723 src_stride, dst_stride, src, dst, filtering); 3724 } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { 3725 // optimized, 1/2 3726 ScalePlaneDown2(src_width, src_height, dst_width, dst_height, 3727 src_stride, dst_stride, src, dst, filtering); 3728 // 3/8 rounded up for odd sized chroma height. 3729 } else if (8 * dst_width == 3 * src_width && 3730 dst_height == ((src_height * 3 + 7) / 8)) { 3731 // optimized, 3/8 3732 ScalePlaneDown38(src_width, src_height, dst_width, dst_height, 3733 src_stride, dst_stride, src, dst, filtering); 3734 } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { 3735 // optimized, 1/4 3736 ScalePlaneDown4(src_width, src_height, dst_width, dst_height, 3737 src_stride, dst_stride, src, dst, filtering); 3738 } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { 3739 // optimized, 1/8 3740 ScalePlaneDown8(src_width, src_height, dst_width, dst_height, 3741 src_stride, dst_stride, src, dst, filtering); 3742 } else { 3743 // Arbitrary downsample 3744 ScalePlaneDown(src_width, src_height, dst_width, dst_height, 3745 src_stride, dst_stride, src, dst, filtering); 3746 } 3747 } else { 3748 // Arbitrary scale up and/or down. 3749 ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, 3750 src_stride, dst_stride, src, dst, filtering); 3751 } 3752 } 3753 3754 /** 3755 * Scale a plane. 3756 * 3757 * This function in turn calls a scaling function 3758 * suitable for handling the desired resolutions. 3759 * 3760 */ 3761 3762 int I420Scale(const uint8* src_y, int src_stride_y, 3763 const uint8* src_u, int src_stride_u, 3764 const uint8* src_v, int src_stride_v, 3765 int src_width, int src_height, 3766 uint8* dst_y, int dst_stride_y, 3767 uint8* dst_u, int dst_stride_u, 3768 uint8* dst_v, int dst_stride_v, 3769 int dst_width, int dst_height, 3770 FilterMode filtering) { 3771 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || 3772 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { 3773 return -1; 3774 } 3775 // Negative height means invert the image. 3776 if (src_height < 0) { 3777 int halfheight; 3778 src_height = -src_height; 3779 halfheight = (src_height + 1) >> 1; 3780 src_y = src_y + (src_height - 1) * src_stride_y; 3781 src_u = src_u + (halfheight - 1) * src_stride_u; 3782 src_v = src_v + (halfheight - 1) * src_stride_v; 3783 src_stride_y = -src_stride_y; 3784 src_stride_u = -src_stride_u; 3785 src_stride_v = -src_stride_v; 3786 } 3787 { 3788 int src_halfwidth = (src_width + 1) >> 1; 3789 int src_halfheight = (src_height + 1) >> 1; 3790 int dst_halfwidth = (dst_width + 1) >> 1; 3791 int dst_halfheight = (dst_height + 1) >> 1; 3792 3793 ScalePlane(src_y, src_stride_y, src_width, src_height, 3794 dst_y, dst_stride_y, dst_width, dst_height, 3795 filtering, use_reference_impl_); 3796 ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, 3797 dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, 3798 filtering, use_reference_impl_); 3799 ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, 3800 dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, 3801 filtering, use_reference_impl_); 3802 } 3803 return 0; 3804 } 3805 3806 // Deprecated api 3807 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, 3808 int src_stride_y, int src_stride_u, int src_stride_v, 3809 int src_width, int src_height, 3810 uint8* dst_y, uint8* dst_u, uint8* dst_v, 3811 int dst_stride_y, int dst_stride_u, int dst_stride_v, 3812 int dst_width, int dst_height, 3813 int interpolate) { 3814 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || 3815 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { 3816 return -1; 3817 } 3818 // Negative height means invert the image. 3819 if (src_height < 0) { 3820 int halfheight; 3821 src_height = -src_height; 3822 halfheight = (src_height + 1) >> 1; 3823 src_y = src_y + (src_height - 1) * src_stride_y; 3824 src_u = src_u + (halfheight - 1) * src_stride_u; 3825 src_v = src_v + (halfheight - 1) * src_stride_v; 3826 src_stride_y = -src_stride_y; 3827 src_stride_u = -src_stride_u; 3828 src_stride_v = -src_stride_v; 3829 } 3830 { 3831 int src_halfwidth = (src_width + 1) >> 1; 3832 int src_halfheight = (src_height + 1) >> 1; 3833 int dst_halfwidth = (dst_width + 1) >> 1; 3834 int dst_halfheight = (dst_height + 1) >> 1; 3835 FilterMode filtering = interpolate ? kFilterBox : kFilterNone; 3836 3837 ScalePlane(src_y, src_stride_y, src_width, src_height, 3838 dst_y, dst_stride_y, dst_width, dst_height, 3839 filtering, use_reference_impl_); 3840 ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, 3841 dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, 3842 filtering, use_reference_impl_); 3843 ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, 3844 dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, 3845 filtering, use_reference_impl_); 3846 } 3847 return 0; 3848 } 3849 3850 // Deprecated api 3851 int ScaleOffset(const uint8* src, int src_width, int src_height, 3852 uint8* dst, int dst_width, int dst_height, int dst_yoffset, 3853 int interpolate) { 3854 if (!src || src_width <= 0 || src_height <= 0 || 3855 !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 || 3856 dst_yoffset >= dst_height) { 3857 return -1; 3858 } 3859 dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2. 3860 { 3861 int src_halfwidth = (src_width + 1) >> 1; 3862 int src_halfheight = (src_height + 1) >> 1; 3863 int dst_halfwidth = (dst_width + 1) >> 1; 3864 int dst_halfheight = (dst_height + 1) >> 1; 3865 int aheight = dst_height - dst_yoffset * 2; // actual output height 3866 const uint8* const src_y = src; 3867 const uint8* const src_u = src + src_width * src_height; 3868 const uint8* const src_v = src + src_width * src_height + 3869 src_halfwidth * src_halfheight; 3870 uint8* dst_y = dst + dst_yoffset * dst_width; 3871 uint8* dst_u = dst + dst_width * dst_height + 3872 (dst_yoffset >> 1) * dst_halfwidth; 3873 uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + 3874 (dst_yoffset >> 1) * dst_halfwidth; 3875 return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth, 3876 src_width, src_height, dst_y, dst_u, dst_v, dst_width, 3877 dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate); 3878 } 3879 } 3880 3881 #ifdef __cplusplus 3882 } // extern "C" 3883 } // namespace libyuv 3884 #endif 3885