Home | History | Annotate | Download | only in dsp

Lines Matching refs:__m128i

29 static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
30 const __m128i* const U0,
31 const __m128i* const V0,
32 __m128i* const R,
33 __m128i* const G,
34 __m128i* const B) {
35 const __m128i k19077 = _mm_set1_epi16(19077);
36 const __m128i k26149 = _mm_set1_epi16(26149);
37 const __m128i k14234 = _mm_set1_epi16(14234);
39 const __m128i k33050 = _mm_set1_epi16((short)33050);
40 const __m128i k17685 = _mm_set1_epi16(17685);
41 const __m128i k6419 = _mm_set1_epi16(6419);
42 const __m128i k13320 = _mm_set1_epi16(13320);
43 const __m128i k8708 = _mm_set1_epi16(8708);
45 const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
47 const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
48 const __m128i R1 = _mm_sub_epi16(Y1, k14234);
49 const __m128i R2 = _mm_add_epi16(R1, R0);
51 const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
52 const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
53 const __m128i G2 = _mm_add_epi16(Y1, k8708);
54 const __m128i G3 = _mm_add_epi16(G0, G1);
55 const __m128i G4 = _mm_sub_epi16(G2, G3);
58 const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
59 const __m128i B1 = _mm_adds_epu16(B0, Y1);
60 const __m128i B2 = _mm_subs_epu16(B1, k17685);
69 static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
70 const __m128i zero = _mm_setzero_si128();
71 return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
75 static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
76 const __m128i zero = _mm_setzero_si128();
77 const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
78 const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
86 __m128i* const R, __m128i* const G,
87 __m128i* const B) {
88 const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
97 __m128i* const R, __m128i* const G,
98 __m128i* const B) {
99 const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
105 static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
106 const __m128i* const G,
107 const __m128i* const B,
108 const __m128i* const A,
110 const __m128i rb = _mm_packus_epi16(*R, *B);
111 const __m128i ga = _mm_packus_epi16(*G, *A);
112 const __m128i rg = _mm_unpacklo_epi8(rb, ga);
113 const __m128i ba = _mm_unpackhi_epi8(rb, ga);
114 const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
115 const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
116 _mm_storeu_si128((__m128i*)(dst + 0), RGBA_lo);
117 _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi);
121 static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
122 const __m128i* const G,
123 const __m128i* const B,
124 const __m128i* const A,
127 const __m128i rg0 = _mm_packus_epi16(*R, *G);
128 const __m128i ba0 = _mm_packus_epi16(*B, *A);
130 const __m128i rg0 = _mm_packus_epi16(*B, *A);
131 const __m128i ba0 = _mm_packus_epi16(*R, *G);
133 const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
134 const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb...
135 const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga...
136 const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
137 const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4);
138 const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
139 _mm_storeu_si128((__m128i*)dst, rgba4444);
143 static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
144 const __m128i* const G,
145 const __m128i* const B,
147 const __m128i r0 = _mm_packus_epi16(*R, *R);
148 const __m128i g0 = _mm_packus_epi16(*G, *G);
149 const __m128i b0 = _mm_packus_epi16(*B, *B);
150 const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8));
151 const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
152 const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5);
153 const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
154 const __m128i rg = _mm_or_si128(r1, g1);
155 const __m128i gb = _mm_or_si128(g2, b1);
157 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
159 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
161 _mm_storeu_si128((__m128i*)dst, rgb565);
167 static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
168 __m128i* const in2, __m128i* const in3,
169 __m128i* const in4, __m128i* const in5,
185 _mm_storeu_si128((__m128i*)(rgb + 0), *in0);
186 _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
187 _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
188 _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
189 _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
190 _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
195 const __m128i kAlpha = _mm_set1_epi16(255);
198 __m128i R, G, B;
206 const __m128i kAlpha = _mm_set1_epi16(255);
209 __m128i R, G, B;
217 const __m128i kAlpha = _mm_set1_epi16(255);
220 __m128i R, G, B;
228 const __m128i kAlpha = _mm_set1_epi16(255);
231 __m128i R, G, B;
241 __m128i R, G, B;
249 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
250 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
271 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
272 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
297 const __m128i kAlpha = _mm_set1_epi16(255);
300 __m128i R, G, B;
319 const __m128i kAlpha = _mm_set1_epi16(255);
322 __m128i R, G, B;
341 const __m128i kAlpha = _mm_set1_epi16(255);
344 __m128i R, G, B;
365 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
366 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
402 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
403 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
451 #define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
453 #define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
458 const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
471 const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
472 __m128i tmp[6];
473 tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
474 tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
475 tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32));
476 tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48));
477 tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
478 tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
489 __m128i* const rgb /*in[6]*/) {
490 const __m128i zero = _mm_setzero_si128();
491 __m128i a0 = LOAD_16(argb + 0);
492 __m128i a1 = LOAD_16(argb + 4);
493 __m128i a2 = LOAD_16(argb + 8);
494 __m128i a3 = LOAD_16(argb + 12);
509 const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
510 const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
511 const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
512 const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
513 const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \
514 const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \
515 const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
516 const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
517 const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
518 const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
523 static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
524 const __m128i* const G,
525 const __m128i* const B,
526 __m128i* const Y) {
527 const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
528 const __m128i kGB_y = MK_CST_16(16384, 6420);
529 const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
531 const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
532 const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
533 const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
534 const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
538 static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
539 const __m128i* const G,
540 const __m128i* const B,
541 __m128i* const U,
542 __m128i* const V) {
543 const __m128i kRG_u = MK_CST_16(-9719, -19081);
544 const __m128i kGB_u = MK_CST_16(0, 28800);
545 const __m128i kRG_v = MK_CST_16(28800, 0);
546 const __m128i kGB_v = MK_CST_16(-24116, -4684);
547 const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
549 const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
550 const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
551 const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
552 const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
566 __m128i rgb_plane[6];
572 const __m128i zero = _mm_setzero_si128();
573 __m128i r, g, b, Y0, Y1;
600 __m128i bgr_plane[6];
606 const __m128i zero = _mm_setzero_si128();
607 __m128i r, g, b, Y0, Y1;
634 __m128i Y0, Y1, rgb[6];
649 static void HorizontalAddPack_SSE2(const __m128i* const A,
650 const __m128i* const B,
651 __m128i* const out) {
652 const __m128i k2 = _mm_set1_epi16(2);
653 const __m128i C = _mm_madd_epi16(*A, k2);
654 const __m128i D = _mm_madd_epi16(*B, k2);
664 __m128i rgb[6], U0, V0, U1, V1;
680 const __m128i prev_u = LOAD_16(u);
681 const __m128i prev_v = LOAD_16(v);
696 __m128i* const r, __m128i* const g, __m128i* const b) {
697 const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
698 const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
699 const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
700 const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ...
702 const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
703 const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
704 const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
705 const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
706 const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // r0 r1 r2 r3 | g0 g1 ..
707 const __m128i B1 = _mm_unpackhi_epi16(A0, A1); // b0 b1 b2 b3 | x x x x
708 const __m128i B2 = _mm_unpacklo_epi16(A2, A3); // r4 r5 r6 r7 | g4 g5 ..
709 const __m128i B3 = _mm_unpackhi_epi16(A2, A3); // b4 b5 b6 b7 | x x x x
720 __m128i r, g, b, U0, V0, U1, V1;
762 const __m128i zero = _mm_setzero_si128();
763 const __m128i max = _mm_set1_epi16(MAX_Y);
764 const __m128i one = _mm_set1_epi16(1);
765 __m128i sum = zero;
768 const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
769 const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
770 const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
771 const __m128i D = _mm_sub_epi16(A, B); // diff_y
772 const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
773 const __m128i F = _mm_add_epi16(C, D); // new_y
774 const __m128i G = _mm_or_si128(E, one); // -1 or 1
775 const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
776 const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
777 _mm_storeu_si128((__m128i*)(dst + i), H);
780 _mm_storeu_si128((__m128i*)tmp, sum);
795 const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
796 const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
797 const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
798 const __m128i D = _mm_sub_epi16(A, B); // diff_uv
799 const __m128i E = _mm_add_epi16(C, D); // new_uv
800 _mm_storeu_si128((__m128i*)(dst + i), E);
811 const __m128i kCst8 = _mm_set1_epi16(8);
812 const __m128i max = _mm_set1_epi16(MAX_Y);
813 const __m128i zero = _mm_setzero_si128();
815 const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
816 const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
817 const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
818 const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
819 const __m128i a0b1 = _mm_add_epi16(a0, b1);
820 const __m128i a1b0 = _mm_add_epi16(a1, b0);
821 const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
822 const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
823 const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
824 const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
825 const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
826 const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
827 const __m128i d0 = _mm_add_epi16(c1, a0);
828 const __m128i d1 = _mm_add_epi16(c0, a1);
829 const __m128i e0 = _mm_srai_epi16(d0, 1);
830 const __m128i e1 = _mm_srai_epi16(d1, 1);
831 const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
832 const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
833 const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
834 const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
835 const __m128i h0 = _mm_add_epi16(g0, f0);
836 const __m128i h1 = _mm_add_epi16(g1, f1);
837 const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
838 const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
839 _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
840 _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);