/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_X86_MEM_SSE2_H_
#define VPX_VPX_DSP_X86_MEM_SSE2_H_

#include <emmintrin.h>  // SSE2
#include <string.h>

#include "./vpx_config.h"

// Unaligned scalar 32-bit store and load, done through memcpy.
static INLINE void storeu_uint32(void *dst, uint32_t v) {
  memcpy(dst, &v, sizeof(v));
}

static INLINE uint32_t loadu_uint32(const void *src) {
  uint32_t v;
  memcpy(&v, src, sizeof(v));
  return v;
}

// Load 4 bytes from a possibly unaligned address into the low 32 bits of an
// XMM register; the upper lanes are zeroed.
static INLINE __m128i load_unaligned_u32(const void *a) {
  uint32_t val;
  memcpy(&val, a, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

// Store the low 32 bits of v to a possibly unaligned address.
static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
  const uint32_t val = _mm_cvtsi128_si32(v);
  memcpy(a, &val, sizeof(val));
}

// Store the low/high 8 bytes of v to a possibly unaligned address.
#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)

// Load 8 bytes from src into the upper half of s; the lower half is kept.
static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) {
  return _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
}

// Load 4 rows of 4 bytes each; row i ends up in the low 32 bits of d[i].
static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
  d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
  d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
  d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
}

static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_4x4(s + 0 * stride, stride, &d[0]);
  load_8bit_4x4(s + 4 * stride, stride, &d[4]);
}

// Load 4 rows of 8 bytes each; row i ends up in the low 64 bits of d[i].
static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
  d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
  d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
  d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
}

static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
                                 __m128i *const d) {
  load_8bit_8x4(s + 0 * stride, stride, &d[0]);
  load_8bit_8x4(s + 4 * stride, stride, &d[4]);
}

// Load 8 rows of 16 bytes each; each row must be 16-byte aligned.
static INLINE void load_8bit_16x8(const uint8_t *const s,
                                  const ptrdiff_t stride, __m128i *const d) {
  d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
  d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
  d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
  d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
  d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
  d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
  d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
  d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
}

// Unaligned variants of the 16-byte-wide row loads.
static INLINE void loadu_8bit_16x4(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
  d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
  d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
  d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
}

static INLINE void loadu_8bit_16x8(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
  loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
}

// Store the high 64 bits of s to d.
static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
}

// Store the low 32 bits of s[0..3] as 4 rows of 4 bytes each.
static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
  *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
  *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
  *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
}

// Scatter the four 32-bit lanes of s as 4 rows of 4 bytes each.
static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
                                       const ptrdiff_t stride) {
  __m128i ss[4];

  ss[0] = s;
  ss[1] = _mm_srli_si128(s, 4);
  ss[2] = _mm_srli_si128(s, 8);
  ss[3] = _mm_srli_si128(s, 12);
  store_8bit_4x4(ss, d, stride);
}

// Store two 16-byte registers as 4 rows of 8 bytes each: s[0] holds rows 0-1,
// s[1] holds rows 2-3.
static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
                                            uint8_t *const d,
                                            const ptrdiff_t stride) {
  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
  _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
  _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
}

// Store the low 64 bits of s[0..7] as 8 rows of 8 bytes each.
static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
                                  const ptrdiff_t stride) {
  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
  _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
  _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
  _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
  _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
  _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
  _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
}

// Unaligned store of s[0..3] as 4 rows of 16 bytes each.
static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
                                    const ptrdiff_t stride) {
  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
}

#endif  // VPX_VPX_DSP_X86_MEM_SSE2_H_
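/*
 * Usage sketch (editor's illustration, not part of the upstream header): it
 * shows how the 8x8 load/store pairs above compose into a strided block copy.
 * The function name example_copy_8x8 is hypothetical, and the snippet assumes
 * a libvpx translation unit where this header and "./vpx_config.h" (which
 * provides INLINE) are available. It is kept under #if 0 so that it does not
 * change what the header defines.
 */
#if 0
static INLINE void example_copy_8x8(const uint8_t *const src,
                                    const ptrdiff_t src_stride,
                                    uint8_t *const dst,
                                    const ptrdiff_t dst_stride) {
  __m128i rows[8];
  // Gather 8 rows of 8 bytes into the low halves of 8 XMM registers, then
  // write them back out with the destination stride.
  load_8bit_8x8(src, src_stride, rows);
  store_8bit_8x8(rows, dst, dst_stride);
}
#endif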