Home | History | Annotate | Download | only in dsp

Lines Matching refs:__m128i

33     const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
34 const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
35 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
36 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
37 const __m128i out = _mm_sub_epi8(in, C);
38 _mm_storeu_si128((__m128i*)&argb_data[i], out);
51 const __m128i mults_rb = _mm_set_epi16(
56 const __m128i mults_b2 = _mm_set_epi16(
59 const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
60 const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
63 const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
64 const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
65 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
66 const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
67 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
68 const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0
69 const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0
70 const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2
71 const __m128i H = _mm_add_epi8(G, D); // x dr x db
72 const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db
73 const __m128i out = _mm_sub_epi8(in, I);
74 _mm_storeu_si128((__m128i*)&argb_data[i], out);
88 const __m128i mults_r = _mm_set_epi16(
91 const __m128i mults_g = _mm_set_epi16(
94 const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
95 const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask
102 const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
103 const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
104 const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0
105 const __m128i A1 = _mm_slli_epi16(in1, 8);
106 const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
107 const __m128i B1 = _mm_and_si128(in1, mask_g);
108 const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0
109 const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
110 const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db
111 const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
112 const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b'
113 const __m128i E1 = _mm_sub_epi8(in1, D1);
114 const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db
115 const __m128i F1 = _mm_srli_epi32(C1, 16);
116 const __m128i G0 = _mm_sub_epi8(E0, F0); // x x | x b'
117 const __m128i G1 = _mm_sub_epi8(E1, F1);
118 const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b
119 const __m128i H1 = _mm_and_si128(G1, mask_b);
120 const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b'
121 _mm_storeu_si128((__m128i*)values, I);
138 const __m128i mults_g = _mm_set_epi16(
141 const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
142 const __m128i mask = _mm_set1_epi32(0xff);
150 const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
151 const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
152 const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
153 const __m128i A1 = _mm_and_si128(in1, mask_g);
154 const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
155 const __m128i B1 = _mm_srli_epi32(in1, 16);
156 const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr
157 const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
158 const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r'
159 const __m128i E1 = _mm_sub_epi8(B1, C1);
160 const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r'
161 const __m128i F1 = _mm_and_si128(E1, mask);
162 const __m128i I = _mm_packs_epi32(F0, F1);
163 _mm_storeu_si128((__m128i*)values, I);
186 const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
187 const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
189 const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
190 const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
192 const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
193 const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
195 const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
196 const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
198 _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
199 _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
201 _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
202 _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
211 const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
212 const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
214 const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
215 const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
217 const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
218 const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
220 const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
221 const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
223 _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
224 _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
226 _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
227 _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
285 __m128i zero = _mm_setzero_si128();
287 __m128i sumXY_128 = zero;
288 __m128i sumX_128 = zero;
291 const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
292 const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
297 const __m128i xy_128 = _mm_add_epi32(x, y);
303 _mm_storeu_si128((__m128i*)tmp, xy_128);
321 _mm_storeu_si128((__m128i*)tmp, sumX_128);
325 _mm_storeu_si128((__m128i*)tmp, sumXY_128);
341 __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
342 __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
347 const __m128i cmpA = _mm_cmpeq_epi32(A0, A1);
348 const __m128i B0 =
349 _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
350 const __m128i B1 =
351 _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
356 const __m128i cmpB = _mm_cmpeq_epi32(B0, B1);
357 A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
358 A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
368 _mm_loadu_si128((const __m128i*)&array1[0]),
369 _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
373 _mm_loadu_si128((const __m128i*)&array1[4]),
374 _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
394 const __m128i ff = _mm_set1_epi16(0xff00);
395 const __m128i zero = _mm_setzero_si128();
398 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
399 const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
400 const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
401 const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
402 const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
403 const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
404 const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
405 _mm_storeu_si128((__m128i*)&dst[0], dst0);
406 _mm_storeu_si128((__m128i*)&dst[4], dst1);
407 _mm_storeu_si128((__m128i*)&dst[8], dst2);
408 _mm_storeu_si128((__m128i*)&dst[12], dst3);
413 const __m128i ff = _mm_set1_epi16(0xff00);
414 const __m128i mul = _mm_set1_epi16(0x110);
417 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
418 const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0
419 const __m128i pack = _mm_and_si128(tmp, ff); // ab00
420 const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
421 const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
422 _mm_storeu_si128((__m128i*)&dst[0], dst0);
423 _mm_storeu_si128((__m128i*)&dst[4], dst1);
428 const __m128i mask_or = _mm_set1_epi32(0xff000000);
429 const __m128i mul_cst = _mm_set1_epi16(0x0104);
430 const __m128i mask_mul = _mm_set1_epi16(0x0f00);
433 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
434 const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0
435 const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000
436 const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000
437 const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000
439 const __m128i res = _mm_or_si128(pack, mask_or);
440 _mm_storeu_si128((__m128i*)dst, res);
448 const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
449 const __m128i shift = _mm_slli_epi64(in, 7);
465 static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
466 const __m128i* const a1,
467 __m128i* const avg) {
469 const __m128i ones = _mm_set1_epi8(1);
470 const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
471 const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
479 const __m128i black = _mm_set1_epi32(ARGB_BLACK);
481 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
482 const __m128i res = _mm_sub_epi8(src, black);
483 _mm_storeu_si128((__m128i*)&out[i], res);
495 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
496 const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
497 const __m128i res = _mm_sub_epi8(src, pred); \
498 _mm_storeu_si128((__m128i*)&out[i], res); \
516 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
517 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
518 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
519 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
520 __m128i avg, pred, res;
524 _mm_storeu_si128((__m128i*)&out[i], res);
536 const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
537 const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \
538 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
539 __m128i pred, res; \
542 _mm_storeu_si128((__m128i*)&out[i], res); \
560 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
561 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
562 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
563 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
564 const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
565 __m128i avgTTR, avgLTL, avg, res;
570 _mm_storeu_si128((__m128i*)&out[i], res);
578 static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
579 __m128i* const out) {
582 const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
583 const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
584 const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
585 const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
586 const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
587 const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
595 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
596 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
597 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
598 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
599 __m128i pa, pb;
603 const __m128i mask = _mm_cmpgt_epi32(pb, pa);
604 const __m128i A = _mm_and_si128(mask, L);
605 const __m128i B = _mm_andnot_si128(mask, T);
606 const __m128i pred = _mm_or_si128(A, B); // pred = (pb > pa) ? L : T
607 const __m128i res = _mm_sub_epi8(src, pred);
608 _mm_storeu_si128((__m128i*)&out[i], res);
620 const __m128i zero = _mm_setzero_si128();
622 const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
623 const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
624 const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
625 const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
626 const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
627 const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
628 const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
629 const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
630 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
631 const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
632 const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
633 const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
634 const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);
635 const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);
636 const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
637 const __m128i res = _mm_sub_epi8(src, pred);
638 _mm_storeu_si128((__m128i*)&out[i], res);
649 const __m128i zero = _mm_setzero_si128();
652 const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
653 const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
654 const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
655 const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
656 const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
657 const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
658 const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
659 const __m128i sum = _mm_add_epi16(T_lo, L_lo);
660 const __m128i avg = _mm_srli_epi16(sum, 1);
661 const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
662 const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
663 const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
664 const __m128i A3 = _mm_srai_epi16(A2, 1);
665 const __m128i A4 = _mm_add_epi16(avg, A3);
666 const __m128i pred = _mm_packus_epi16(A4, A4);
667 const __m128i res = _mm_sub_epi8(src, pred);
668 _mm_storel_epi64((__m128i*)&out[i], res);