Lines Matching defs:in
4 // that can be found in the COPYING file in the root of the source
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
90 uint32x4x4_t in;
91 INIT_VECTOR4(in, zero, zero, zero, zero);
93 LOADQ_LANE_32b(in.val[0], 0);
94 LOADQ_LANE_32b(in.val[1], 0);
95 LOADQ_LANE_32b(in.val[2], 0);
96 LOADQ_LANE_32b(in.val[3], 0);
97 LOADQ_LANE_32b(in.val[0], 1);
98 LOADQ_LANE_32b(in.val[1], 1);
99 LOADQ_LANE_32b(in.val[2], 1);
100 LOADQ_LANE_32b(in.val[3], 1);
101 LOADQ_LANE_32b(in.val[0], 2);
102 LOADQ_LANE_32b(in.val[1], 2);
103 LOADQ_LANE_32b(in.val[2], 2);
104 LOADQ_LANE_32b(in.val[3], 2);
105 LOADQ_LANE_32b(in.val[0], 3);
106 LOADQ_LANE_32b(in.val[1], 3);
107 LOADQ_LANE_32b(in.val[2], 3);
108 LOADQ_LANE_32b(in.val[3], 3);
111 const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
112 vreinterpretq_u8_u32(in.val[1]));
113 const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
114 vreinterpretq_u8_u32(in.val[3]));
163 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
187 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
308 // p0 and q0 contain the u+v samples packed in low/high halves.
319 // The p1...q1 registers contain the u+v samples packed in low/high halves.
438 // Simple In-loop filtering (Paragraph 15.2)
587 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
661 // Complex In-loop filtering (Paragraph 15.3)
975 // Technically these are unsigned but vqdmulh is only available in signed.
1030 static void TransformOne(const int16_t* in, uint8_t* dst) {
1032 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1040 static void TransformOne(const int16_t* in, uint8_t* dst) {
1046 "vld1.16 {q1, q2}, [%[in]] \n"
1049 /* d2: in[0]
1050 * d3: in[8]
1051 * d4: in[4]
1052 * d5: in[12]
1056 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1057 * q9 = {in[4], in[12]} * kC2 >> 16
1062 /* d22 = a = in[0] + in[8]
1063 * d23 = b = in[0] - in[8]
1071 * We avoided this in kC2 by pre-shifting the constant.
1072 * q8 = in[4]/[12] * kC1 >> 16
1076 /* Add {in[4], in[12]} back after the multiplication. This is handled by
1077 * adding 1 << 16 to kC1 in the libwebp C code.
1081 /* d20 = c = in[4]*kC2 - in[12]*kC1
1082 * d21 = d = in[4]*kC1 + in[12]*kC2
1118 /* d20 = c = in[4]*kC2 - in[12]*kC1
1119 * d21 = d = in[4]*kC1 + in[12]*kC2
1165 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1173 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
1174 TransformOne(in, dst);
1176 TransformOne(in + 16, dst + 4);
1180 static void TransformDC(const int16_t* in, uint8_t* dst) {
1181 const int16x8_t DC = vdupq_n_s16(in[0]);
1194 static void TransformWHT(const int16_t* in, int16_t* out) {
1199 const int16x4_t in00_03 = vld1_s16(in + 0);
1200 const int16x4_t in04_07 = vld1_s16(in + 4);
1201 const int16x4_t in08_11 = vld1_s16(in + 8);
1202 const int16x4_t in12_15 = vld1_s16(in + 12);
1203 const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
1204 const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
1205 const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
1206 const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
1246 static void TransformAC3(const int16_t* in, uint8_t* dst) {
1249 const int16x4_t A = vld1_dup_s16(in);
1250 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
1251 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
1252 const int c1 = MUL(in[1], kC2_full);
1253 const int d1 = MUL(in[1], kC1_full);