Lines Matching refs:DST
224 uint8_t* const dst, int stride) {
225 vst2_lane_u8(dst + 0 * stride, v, 0);
226 vst2_lane_u8(dst + 1 * stride, v, 1);
227 vst2_lane_u8(dst + 2 * stride, v, 2);
228 vst2_lane_u8(dst + 3 * stride, v, 3);
229 vst2_lane_u8(dst + 4 * stride, v, 4);
230 vst2_lane_u8(dst + 5 * stride, v, 5);
231 vst2_lane_u8(dst + 6 * stride, v, 6);
232 vst2_lane_u8(dst + 7 * stride, v, 7);
236 uint8_t* const dst, int stride) {
242 Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
243 Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
248 uint8_t* const dst, int stride) {
249 vst4_lane_u8(dst + 0 * stride, v, 0);
250 vst4_lane_u8(dst + 1 * stride, v, 1);
251 vst4_lane_u8(dst + 2 * stride, v, 2);
252 vst4_lane_u8(dst + 3 * stride, v, 3);
253 vst4_lane_u8(dst + 4 * stride, v, 4);
254 vst4_lane_u8(dst + 5 * stride, v, 5);
255 vst4_lane_u8(dst + 6 * stride, v, 6);
256 vst4_lane_u8(dst + 7 * stride, v, 7);
261 uint8_t* const dst, int stride) {
269 Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
270 Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
275 uint8_t* const dst, int stride) {
276 vst1q_u8(dst - stride, p0);
277 vst1q_u8(dst, q0);
282 uint8_t* const dst, int stride) {
283 Store16x2_NEON(p1, p0, dst - stride, stride);
284 Store16x2_NEON(q0, q1, dst + stride, stride);
311 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \
312 vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \
313 vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \
314 (DST) += stride; \
384 // to the corresponding rows of 'dst'.
385 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
393 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
394 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
395 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
396 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
401 uint8_t* const dst) {
406 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
407 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
408 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
409 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
420 SaturateAndStore4x4_NEON(dst, out01, out23);
1043 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1048 Add4x4_NEON(rows.val[0], rows.val[1], dst);
1053 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
1147 "vld1.32 d6[0], [%[dst]], %[kBPS] \n"
1148 "vld1.32 d6[1], [%[dst]], %[kBPS] \n"
1149 "vld1.32 d7[0], [%[dst]], %[kBPS] \n"
1150 "vld1.32 d7[1], [%[dst]], %[kBPS] \n"
1152 "sub %[dst], %[dst], %[kBPS], lsl #2 \n"
1173 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
1174 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
1175 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
1176 "vst1.32 d1[1], [%[dst]] \n"
1178 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1186 static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
1187 TransformOne_NEON(in, dst);
1189 TransformOne_NEON(in + 16, dst + 4);
1193 static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
1195 Add4x4_NEON(DC, DC, dst);
1200 #define STORE_WHT(dst, col, rows) do { \
1201 *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
1202 *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
1203 *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
1204 *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
1259 static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
1275 Add4x4_NEON(m0_m1, m2_m3, dst);
1282 static void DC4_NEON(uint8_t* dst) { // DC
1283 const uint8x8_t A = vld1_u8(dst - BPS); // top row
1286 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
1287 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
1288 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
1289 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
1298 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
1303 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
1304 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
1305 const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
1310 const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
1311 const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
1312 const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
1313 const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
1324 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
1325 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
1326 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
1327 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
1329 vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
1330 vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
1331 vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
1332 vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
1334 dst += 4 * BPS;
1338 static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
1340 static void VE4_NEON(uint8_t* dst) { // vertical
1342 const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row
1352 vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
1356 static void RD4_NEON(uint8_t* dst) { // Down-right
1357 const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
1360 const uint32_t I = dst[-1 + 0 * BPS];
1361 const uint32_t J = dst[-1 + 1 * BPS];
1362 const uint32_t K = dst[-1 + 2 * BPS];
1363 const uint32_t L = dst[-1 + 3 * BPS];
1378 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1379 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1380 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1381 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1384 static void LD4_NEON(uint8_t* dst) { // Down-left
1386 const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
1387 const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
1388 const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
1389 const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
1397 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
1398 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
1399 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
1400 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
1406 static void VE8uv_NEON(uint8_t* dst) { // vertical
1407 const uint8x8_t top = vld1_u8(dst - BPS);
1410 vst1_u8(dst + j * BPS, top);
1414 static void HE8uv_NEON(uint8_t* dst) { // horizontal
1417 const uint8x8_t left = vld1_dup_u8(dst - 1);
1418 vst1_u8(dst, left);
1419 dst += BPS;
1423 static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
1429 const uint8x8_t A = vld1_u8(dst - BPS); // top row
1437 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
1438 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
1439 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
1440 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
1441 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
1442 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
1443 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
1444 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
1469 vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
1474 static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
1475 static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
1476 static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
1477 static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
1479 static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }
1484 static void VE16_NEON(uint8_t* dst) { // vertical
1485 const uint8x16_t top = vld1q_u8(dst - BPS);
1488 vst1q_u8(dst + j * BPS, top);
1492 static void HE16_NEON(uint8_t* dst) { // horizontal
1495 const uint8x16_t left = vld1q_dup_u8(dst - 1);
1496 vst1q_u8(dst, left);
1497 dst += BPS;
1501 static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
1507 const uint8x16_t A = vld1q_u8(dst - BPS); // top row
1519 const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
1520 const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
1521 const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
1522 const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
1523 const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
1524 const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
1525 const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
1526 const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
1553 vst1q_u8(dst + i * BPS, dc);
1558 static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
1559 static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
1560 static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
1561 static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
1563 static void TM16_NEON(uint8_t* dst) {
1564 const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
1565 const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
1572 const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
1573 const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
1574 const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
1575 const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
1589 vst1q_u8(dst + 0 * BPS, row0);
1590 vst1q_u8(dst + 1 * BPS, row1);
1591 vst1q_u8(dst + 2 * BPS, row2);
1592 vst1q_u8(dst + 3 * BPS, row3);
1593 dst += 4 * BPS;