/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdint.h>
#include <x86intrin.h>

/* Unsigned-extend packed 8-bit integers (in the LSBs) into packed 32-bit integers */
static inline __m128i cvtepu8_epi32(__m128i x) {
#if defined(__SSE4_1__)
    return _mm_cvtepu8_epi32(x);
#elif defined(__SSSE3__)
    const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
    x = _mm_shuffle_epi8(x, M8to32);
    return x;
#else
# error "Require at least SSSE3"
#endif
}

static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
#if defined(__SSE4_1__)
    return _mm_packus_epi32(lo, hi);
#elif defined(__SSSE3__)
    const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
    const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
    const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
    const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
    return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
                        _mm_shuffle_epi8(hi, M32to16H));
#else
# error "Require at least SSSE3"
#endif
}

static inline __m128i mullo_epi32(__m128i x, __m128i y) {
#if defined(__SSE4_1__)
    return _mm_mullo_epi32(x, y);
#elif defined(__SSSE3__)
    const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
    __m128i even = _mm_mul_epu32(x, y);
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
                                _mm_srli_si128(y, 4));
    even = _mm_and_si128(even, Meven);
    odd = _mm_and_si128(odd, Meven);
    return _mm_or_si128(even, _mm_slli_si128(odd, 4));
#else
# error "Require at least SSSE3"
#endif
}

/* 'mask' must be packed 8-bit values of 0x00 or 0xff */
static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
#if defined(__SSE4_1__)
    return _mm_blendv_epi8(x, y, mask);
#elif defined(__SSSE3__)
    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
#else
# error "Require at least SSSE3"
#endif
}

extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8;
    __m128i r0, r1, r2;
    __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
    __m128i o0, o1;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shuffle_epi32(x, 0x00);
    c2 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4 = _mm_shuffle_epi32(x, 0x00);
    c6 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8 = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {
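        /*
         * Each iteration loads four 32-bit pixels from each of the three
         * input rows, widens them to 16-bit lanes, and produces two output
         * pixels.  Every _mm_madd_epi16 below applies one pair of 16-bit
         * coefficients to two neighbouring pixels, so the nine taps of the
         * 3x3 kernel are evaluated as four pairs plus a final tap paired
         * with zero; the sums (the coefficients are expected in 8.8 fixed
         * point) are shifted right by 8 and packed back to unsigned bytes.
         */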
        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());

        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));

        o0 = _mm_srai_epi32(o0, 8);
        o1 = _mm_srai_epi32(o1, 8);

        o0 = packus_epi32(o0, o1);
        o0 = _mm_packus_epi16(o0, o0);
        _mm_storel_epi64((__m128i *)dst, o0);

        y0 = (const char *)y0 + 8;
        y1 = (const char *)y1 + 8;
        y2 = (const char *)y2 + 8;
        dst = (char *)dst + 8;
    }
}

void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);

    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

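    /*
     * Per group of four pixels: Mxy/Mzw de-interleave the RGBA bytes into
     * 16-bit (ch0,ch1) and (ch2,ch3) pairs; output channel j is then the dot
     * product sum_i coef[4*i+j] * ch_i, computed with two madds per channel.
     * The results are shifted right by 8 (8.8 fixed-point coefficients are
     * expected), packed, and T4x4 re-interleaves them back into RGBA order.
     */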
    for (i = 0; i < count; ++i) {
        i4 = _mm_load_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
        w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
        w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srai_epi32(w2, 8);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);

    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);

    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shufflelo_epi16(c0, 0);
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c1 = _mm_shufflelo_epi16(c1, 0);
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c2 = _mm_shufflelo_epi16(c2, 0);
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c3 = _mm_shufflelo_epi16(c3, 0);
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);

        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, c0);
        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = x2;
        z2 = x2;
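        /* r, g and b all receive the same dot product; the original alpha is
         * recovered below from the high 16 bits of each zw lane. */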
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}

void rsdIntrinsicBlurVFU4_K(void *dst,
                            const void *pin, int stride, const void *gptr,
                            int rct, int x1, int x2) {
    const char *pi;
    __m128i pi0, pi1;
    __m128 pf0, pf1;
    __m128 bp0, bp1;
    __m128 x;
    int r;

    for (; x1 < x2; x1 += 2) {
        pi = (const char *)pin + (x1 << 2);
        bp0 = _mm_setzero_ps();
        bp1 = _mm_setzero_ps();

        for (r = 0; r < rct; ++r) {
            x = _mm_load_ss((const float *)gptr + r);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));

            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));

            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));

            pi += stride;
        }

        _mm_storeu_ps((float *)dst, bp0);
        _mm_storeu_ps((float *)dst + 4, bp1);
        dst = (char *)dst + 32;
    }
}

void rsdIntrinsicBlurHFU4_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, x, y;
    __m128i o;
    int r;

    for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller */
        x = _mm_load_ss((const float *)gptr);
        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + (x1 << 2);
        pf = _mm_mul_ps(x, _mm_load_ps(pi));

        for (r = 1; r < rct; r += 2) {
            x = _mm_load_ss((const float *)gptr + r);
            y = _mm_load_ss((const float *)gptr + r + 1);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));

            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}

void rsdIntrinsicBlurHFU1_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, g0, g1, g2, g3, gx, p0, p1;
    __m128i o;
    int r;

    for (; x1 < x2; x1 += 4) {
        g0 = _mm_load_ss((const float *)gptr);
        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + x1;
        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));

        for (r = 1; r < rct; r += 4) {
            gx = _mm_loadu_ps((const float *)gptr + r);
            p0 = _mm_loadu_ps(pi + r);
            p1 = _mm_loadu_ps(pi + r + 4);

            /* _mm_alignr_epi8 works on integer vectors, so reinterpret the
             * float data (bit-for-bit) around the byte-wise shift. */
            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
            pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
        }

        o = _mm_cvtps_epi32(pf);
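        /* Mu8 gathers the low byte of each 32-bit lane, so the four rounded
         * results are stored as four packed 8-bit values in one 32-bit write. */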
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}

void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        U = _mm_shuffle_epi32(UV, 0xf5);
        V = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}

void rsdIntrinsicYuvR_K(void *dst,
                        const unsigned char *pY, const unsigned char *pUV,
                        uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        V = _mm_shuffle_epi32(UV, 0xf5);
        U = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}

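/*
 * Planar variant: Y, U and V are read from separate planes.  The arithmetic
 * is the same fixed-point conversion used by the two kernels above; with the
 * typical parameter values noted in the comments (298, 409, -100, 516, -208,
 * biases 16 and 128) it corresponds roughly to:
 *   R = (298*(Y-16) + 409*(V-128)               + 128) >> 8
 *   G = (298*(Y-16) - 100*(U-128) - 208*(V-128) + 128) >> 8
 *   B = (298*(Y-16)               + 516*(U-128) + 128) >> 8
 */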
void rsdIntrinsicYuv2_K(void *dst,
                        const unsigned char *pY, const unsigned char *pU,
                        const unsigned char *pV, uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
        V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));

        Y = _mm_sub_epi32(Y, biasY);
        U = _mm_sub_epi32(U, biasUV);
        V = _mm_sub_epi32(V, biasUV);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13, 9, 5, 1,
                                          12, 8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pU += 4;
        pV += 4;
        dst = (__m128i *)dst + 1;
    }
}

extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8, c10, c12;
    __m128i c14, c16, c18, c20, c22, c24;
    __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
    __m128i p0, p1, p2, p3, p4, p5, p6, p7;
    __m128i p8, p9, p10, p11, p12, p13, p14, p15;
    __m128i p16, p17, p18, p19, p20, p21, p22, p23;
    __m128i p24, p25, p26, p27, p28, p29, p30, p31;
    __m128i p32, p33, p34, p35, p36, p37, p38, p39;
    __m128i o0, o1, o2, o3;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shuffle_epi32(x, 0x00);
    c2 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4 = _mm_shuffle_epi32(x, 0x00);
    c6 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8 = _mm_shuffle_epi32(x, 0x00);
    c10 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+12));
    c12 = _mm_shuffle_epi32(x, 0x00);
    c14 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+16));
    c16 = _mm_shuffle_epi32(x, 0x00);
    c18 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+20));
    c20 = _mm_shuffle_epi32(x, 0x00);
    c22 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+24));
    c24 = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {
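        /*
         * Load eight pixels from each of the five input rows (p0..p7 from
         * y0 through p32..p39 from y4), widened to 16-bit lanes; each
         * iteration produces four output pixels.
         */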
        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());

        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
        p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
        p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
        p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());

        p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
        p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
        p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
        p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
        p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());

        p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
        p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
        p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
        p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
        p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
        p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
        p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
        p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());

        p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
        p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
        p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
        p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
        p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
        p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
        p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
        p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());

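        /*
         * o0..o3 accumulate the 25 taps for four adjacent output pixels.
         * Each _mm_madd_epi16 handles two taps (two pixels against one pair
         * of 16-bit coefficients); the odd 25th tap is paired with zero.
         * The 8.8 fixed-point sums are shifted right by 8 and packed below.
         */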
        o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0);
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9, p10), c6));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25, p26), c16));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
        o0 = _mm_srai_epi32(o0, 8);

        o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0);
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3, p4), c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10, p11), c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17, p18), c10));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c12));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p25), c14));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
        o1 = _mm_srai_epi32(o1, 8);

        o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c0);
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5), c2));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
        o2 = _mm_srai_epi32(o2, 8);

        o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3, p4), c0);
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37, p38), c22));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
        o3 = _mm_srai_epi32(o3, 8);

        o0 = packus_epi32(o0, o1);
        o2 = packus_epi32(o2, o3);
        o0 = _mm_packus_epi16(o0, o2);
        _mm_storeu_si128((__m128i *)dst, o0);

        y0 = (const char *)y0 + 16;
        y1 = (const char *)y1 + 16;
        y2 = (const char *)y2 + 16;
        y3 = (const char *)y3 + 16;
        y4 = (const char *)y4 + 16;
        dst = (char *)dst + 16;
    }
}

void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina, ins;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, ins);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, ins);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, ins);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, ins);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

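/*
 * DstOver is the mirror image of SrcOver above: the destination stays on top
 * and the source shows only where the destination is transparent.  Per
 * channel this is roughly out = dst + ((src * (255 - dst.a)) >> 8), the usual
 * premultiplied-alpha formulation.
 */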
void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, outs);

        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, outs);

        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, outs);

        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, outs);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, outa);
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, outa);
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, outa);
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, outa);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

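/*
 * DstIn keeps the destination scaled by the source alpha,
 * out = (dst * src.a) >> 8 per channel; SrcIn above is the mirror image,
 * out = (src * dst.a) >> 8.
 */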
void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, ina);
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, ina);
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, ina);
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, ina);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, outa, ins, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, ina);
        t0 = _mm_mullo_epi16(t0, outs);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, ina);
        t1 = _mm_mullo_epi16(t1, outs);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, ina);
        t2 = _mm_mullo_epi16(t2, outs);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, ina);
        t3 = _mm_mullo_epi16(t3, outs);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, out0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, out1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, ins, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, outa);
        t0 = _mm_mullo_epi16(t0, ins);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, outa);
        t1 = _mm_mullo_epi16(t1, ins);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, outa);
        t2 = _mm_mullo_epi16(t2, ins);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, outa);
        t3 = _mm_mullo_epi16(t3, ins);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, in0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, in1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

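/*
 * Note: this kernel is a bitwise XOR of the source and destination pixel
 * values, not the Porter-Duff XOR compositing operator.
 */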
void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_xor_si128(out0, in0);
        out1 = _mm_xor_si128(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
        t0 = _mm_srli_epi16(t0, 8);

        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
        t1 = _mm_srli_epi16(t1, 8);

        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
        t2 = _mm_srli_epi16(t2, 8);

        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_adds_epu8(out0, in0);
        out1 = _mm_adds_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}

void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_subs_epu8(out0, in0);
        out1 = _mm_subs_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}