Lines Matching refs:DST
33 static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
148 // Add inverse transform to 'dst' and store.
155 dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
156 dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
157 dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
158 dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
161 dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
162 dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
163 dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
164 dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
184 _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
185 _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
186 _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
187 _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
190 WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
191 WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
192 WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
193 WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
200 static void TransformAC3(const int16_t* in, uint8_t* dst) {
216 __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
217 __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
218 __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
219 __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
236 WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
237 WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
238 WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
239 WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
540 uint8_t* dst, int stride) {
542 for (i = 0; i < 4; ++i, dst += stride) {
543 WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
891 #define DST(x, y) dst[(x) + (y) * BPS]
902 static void VE4_SSE2(uint8_t* dst) { // vertical
904 const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
914 WebPUint32ToMem(dst + i * BPS, vals);
918 static void LD4_SSE2(uint8_t* dst) { // Down-Left
920 const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
923 const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, dst[-BPS + 7], 3);
928 WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg ));
929 WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
930 WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
931 WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
934 static void VR4_SSE2(uint8_t* dst) { // Vertical-Right
936 const int I = dst[-1 + 0 * BPS];
937 const int J = dst[-1 + 1 * BPS];
938 const int K = dst[-1 + 2 * BPS];
939 const int X = dst[-1 - BPS];
940 const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
949 WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd ));
950 WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh ));
951 WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
952 WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
955 DST(0, 2) = AVG3(J, I, X);
956 DST(0, 3) = AVG3(K, J, I);
959 static void VL4_SSE2(uint8_t* dst) { // Vertical-Left
961 const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
974 WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 ));
975 WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 ));
976 WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
977 WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
980 DST(3, 2) = (extra_out >> 0) & 0xff;
981 DST(3, 3) = (extra_out >> 8) & 0xff;
984 static void RD4_SSE2(uint8_t* dst) { // Down-right
986 const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
988 const uint32_t I = dst[-1 + 0 * BPS];
989 const uint32_t J = dst[-1 + 1 * BPS];
990 const uint32_t K = dst[-1 + 2 * BPS];
991 const uint32_t L = dst[-1 + 3 * BPS];
1001 WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg ));
1002 WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
1003 WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
1004 WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
1007 #undef DST
1013 static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
1014 const uint8_t* top = dst - BPS;
1020 for (y = 0; y < 4; ++y, dst += BPS) {
1021 const int val = dst[-1] - top[-1];
1024 WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
1029 for (y = 0; y < 8; ++y, dst += BPS) {
1030 const int val = dst[-1] - top[-1];
1033 _mm_storel_epi64((__m128i*)dst, out);
1039 for (y = 0; y < 16; ++y, dst += BPS) {
1040 const int val = dst[-1] - top[-1];
1045 _mm_storeu_si128((__m128i*)dst, out);
1050 static void TM4_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 4); }
1051 static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); }
1052 static void TM16_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 16); }
1054 static void VE16_SSE2(uint8_t* dst) {
1055 const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
1058 _mm_storeu_si128((__m128i*)(dst + j * BPS), top);
1062 static void HE16_SSE2(uint8_t* dst) { // horizontal
1065 const __m128i values = _mm_set1_epi8(dst[-1]);
1066 _mm_storeu_si128((__m128i*)dst, values);
1067 dst += BPS;
1071 static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
1075 _mm_storeu_si128((__m128i*)(dst + j * BPS), values);
1079 static void DC16_SSE2(uint8_t* dst) { // DC
1081 const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
1088 left += dst[-1 + j * BPS];
1092 Put16_SSE2(DC >> 5, dst);
1096 static void DC16NoTop_SSE2(uint8_t* dst) { // DC with top samples unavailable
1100 DC += dst[-1 + j * BPS];
1102 Put16_SSE2(DC >> 4, dst);
1105 static void DC16NoLeft_SSE2(uint8_t* dst) { // DC with left samples unavailable
1107 const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
1112 Put16_SSE2(DC >> 4, dst);
1115 static void DC16NoTopLeft_SSE2(uint8_t* dst) { // DC with no top & left samples
1116 Put16_SSE2(0x80, dst);
1122 static void VE8uv_SSE2(uint8_t* dst) { // vertical
1124 const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
1126 _mm_storel_epi64((__m128i*)(dst + j * BPS), top);
1131 static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
1135 _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
1139 static void DC8uv_SSE2(uint8_t* dst) { // DC
1141 const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
1146 left += dst[-1 + j * BPS];
1150 Put8x8uv_SSE2(DC >> 4, dst);
1154 static void DC8uvNoLeft_SSE2(uint8_t* dst) { // DC with no left samples
1156 const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
1159 Put8x8uv_SSE2(DC >> 3, dst);
1162 static void DC8uvNoTop_SSE2(uint8_t* dst) { // DC with no top samples
1166 dc0 += dst[-1 + i * BPS];
1168 Put8x8uv_SSE2(dc0 >> 3, dst);
1171 static void DC8uvNoTopLeft_SSE2(uint8_t* dst) { // DC with nothing
1172 Put8x8uv_SSE2(0x80, dst);