Lines Matching defs:in
4 // that can be found in the COPYING file in the root of the source
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
90 uint32x4x4_t in;
91 INIT_VECTOR4(in, zero, zero, zero, zero);
93 LOADQ_LANE_32b(in.val[0], 0);
94 LOADQ_LANE_32b(in.val[1], 0);
95 LOADQ_LANE_32b(in.val[2], 0);
96 LOADQ_LANE_32b(in.val[3], 0);
97 LOADQ_LANE_32b(in.val[0], 1);
98 LOADQ_LANE_32b(in.val[1], 1);
99 LOADQ_LANE_32b(in.val[2], 1);
100 LOADQ_LANE_32b(in.val[3], 1);
101 LOADQ_LANE_32b(in.val[0], 2);
102 LOADQ_LANE_32b(in.val[1], 2);
103 LOADQ_LANE_32b(in.val[2], 2);
104 LOADQ_LANE_32b(in.val[3], 2);
105 LOADQ_LANE_32b(in.val[0], 3);
106 LOADQ_LANE_32b(in.val[1], 3);
107 LOADQ_LANE_32b(in.val[2], 3);
108 LOADQ_LANE_32b(in.val[3], 3);
111 const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
112 vreinterpretq_u8_u32(in.val[1]));
113 const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
114 vreinterpretq_u8_u32(in.val[3]));
163 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
187 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
308 // p0 and q0 contain the u+v samples packed in low/high halves.
319 // The p1...q1 registers contain the u+v samples packed in low/high halves.
438 // Simple In-loop filtering (Paragraph 15.2)
574 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
648 // Complex In-loop filtering (Paragraph 15.3)
726 // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
802 // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
970 // Technically these are unsigned but vqdmulh is only available in signed.
1025 static void TransformOne(const int16_t* in, uint8_t* dst) {
1027 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1035 static void TransformOne(const int16_t* in, uint8_t* dst) {
1041 "vld1.16 {q1, q2}, [%[in]] \n"
1044 /* d2: in[0]
1045 * d3: in[8]
1046 * d4: in[4]
1047 * d5: in[12]
1051 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1052 * q9 = {in[4], in[12]} * kC2 >> 16
1057 /* d22 = a = in[0] + in[8]
1058 * d23 = b = in[0] - in[8]
1066 * We avoided this in kC2 by pre-shifting the constant.
1067 * q8 = in[4]/[12] * kC1 >> 16
1071 /* Add {in[4], in[12]} back after the multiplication. This is handled by
1072 * adding 1 << 16 to kC1 in the libwebp C code.
1076 /* d20 = c = in[4]*kC2 - in[12]*kC1
1077 * d21 = d = in[4]*kC1 + in[12]*kC2
1113 /* d20 = c = in[4]*kC2 - in[12]*kC1
1114 * d21 = d = in[4]*kC1 + in[12]*kC2
1160 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1168 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
1169 TransformOne(in, dst);
1171 TransformOne(in + 16, dst + 4);
1175 static void TransformDC(const int16_t* in, uint8_t* dst) {
1176 const int16x8_t DC = vdupq_n_s16(in[0]);
1189 static void TransformWHT(const int16_t* in, int16_t* out) {
1194 const int16x4_t in00_03 = vld1_s16(in + 0);
1195 const int16x4_t in04_07 = vld1_s16(in + 4);
1196 const int16x4_t in08_11 = vld1_s16(in + 8);
1197 const int16x4_t in12_15 = vld1_s16(in + 12);
1198 const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
1199 const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
1200 const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
1201 const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
1241 static void TransformAC3(const int16_t* in, uint8_t* dst) {
1244 const int16x4_t A = vdup_n_s16(in[0]);
1245 const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
1246 const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
1247 const int c1 = MUL(in[1], kC2_full);
1248 const int d1 = MUL(in[1], kC1_full);