Home | History | Annotate | Download | only in dsp

Lines Matching refs:__m128i

29   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
41 const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
42 const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
44 const __m128i abs0 = _mm_abs_epi16(out0);
45 const __m128i abs1 = _mm_abs_epi16(out1);
46 const __m128i v0 = _mm_srai_epi16(abs0, 3);
47 const __m128i v1 = _mm_srai_epi16(abs1, 3);
49 const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
50 const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
52 _mm_storeu_si128((__m128i*)&out[0], bin0);
53 _mm_storeu_si128((__m128i*)&out[8], bin1);
76 __m128i tmp_0, tmp_1, tmp_2, tmp_3;
80 const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
81 const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
82 const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
87 const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
88 const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
89 const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
90 const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
91 const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
94 const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
95 const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
96 const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
97 const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
112 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
113 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
114 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
115 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
116 const __m128i b0 = _mm_add_epi16(a0, a1);
117 const __m128i b1 = _mm_add_epi16(a3, a2);
118 const __m128i b2 = _mm_sub_epi16(a3, a2);
119 const __m128i b3 = _mm_sub_epi16(a0, a1);
132 const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
133 const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
136 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
137 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
138 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
139 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
140 const __m128i b0 = _mm_add_epi16(a0, a1);
141 const __m128i b1 = _mm_add_epi16(a3, a2);
142 const __m128i b2 = _mm_sub_epi16(a3, a2);
143 const __m128i b3 = _mm_sub_epi16(a0, a1);
146 __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
147 __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
148 __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
149 __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
166 _mm_storeu_si128((__m128i*)&sum[0], A_b2);
203 const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
204 const __m128i zero = _mm_setzero_si128();
205 __m128i out0, out8;
206 __m128i packed_out;
209 __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
210 __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
211 const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
212 const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
213 const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
214 const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
217 __m128i coeff0 = _mm_abs_epi16(in0);
218 __m128i coeff8 = _mm_abs_epi16(in8);
222 const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
223 const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
232 const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
233 const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
234 const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
235 const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
236 __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
237 __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
238 __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
239 __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
241 const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
242 const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
243 const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
244 const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
272 _mm_storeu_si128((__m128i*)&in[0], in0);
273 _mm_storeu_si128((__m128i*)&in[8], in8);
282 const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
283 const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
284 const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
285 const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7
286 const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
287 const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
288 const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
289 const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8
290 const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
291 const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
292 _mm_storeu_si128((__m128i*)&out[0], out_z0);
293 _mm_storeu_si128((__m128i*)&out[8], out_z8);