/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_ARM_MEM_NEON_H_
#define VPX_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"

// Helper functions used to load tran_low_t into int16, narrowing if necessary.
// When CONFIG_VP9_HIGHBITDEPTH is enabled, tran_low_t is a 32-bit type and
// each lane must be narrowed with vmovn (which keeps the low 16 bits of each
// lane — callers rely on the coefficient values fitting in int16_t; TODO
// confirm against callers). Otherwise tran_low_t is 16 bits and the loads are
// direct.

// Load 16 tran_low_t values as two de-interleaved int16x8_t vectors:
// even-indexed elements in val[0], odd-indexed elements in val[1].
static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  // Two 2-way de-interleaving 32-bit loads cover 16 elements. Narrow each
  // half to 16 bits, then recombine so the result matches what a single
  // vld2q_s16 would produce on 16-bit input: v0 holds elements 0-7, v1
  // holds elements 8-15, with even/odd lanes already separated by vld2q.
  const int32x4x2_t v0 = vld2q_s32(buf);
  const int32x4x2_t v1 = vld2q_s32(buf + 8);
  const int16x4_t s0 = vmovn_s32(v0.val[0]);
  const int16x4_t s1 = vmovn_s32(v0.val[1]);
  const int16x4_t s2 = vmovn_s32(v1.val[0]);
  const int16x4_t s3 = vmovn_s32(v1.val[1]);
  int16x8x2_t res;
  res.val[0] = vcombine_s16(s0, s2);
  res.val[1] = vcombine_s16(s1, s3);
  return res;
#else
  return vld2q_s16(buf);
#endif
}

// Load 8 tran_low_t values into one int16x8_t vector (in memory order).
static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
#else
  return vld1q_s16(buf);
#endif
}

// Load 4 tran_low_t values into one int16x4_t vector (in memory order).
static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  return vmovn_s32(v0);
#else
  return vld1_s16(buf);
#endif
}

// Store 8 int16 values to a tran_low_t buffer, sign-extending each lane to
// 32 bits first when tran_low_t is a 32-bit type.
static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
#if CONFIG_VP9_HIGHBITDEPTH
  // vmovl_s16 widens with sign extension, so negative coefficients survive
  // the 16 -> 32 bit round trip.
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
#else
  vst1q_s16(buf, a);
#endif
}

// Propagate type information to the compiler. Without this the compiler may
// assume the required alignment of uint32_t (4 bytes) and add alignment hints
// to the memory access.
//
// This is used for functions operating on uint8_t which wish to load or store 4
// values at a time but which may not be on 4 byte boundaries.
static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
  // memcpy is the portable way to do a possibly-misaligned 4-byte store
  // without violating strict aliasing; compilers lower it to a plain
  // unaligned store where the target allows.
  memcpy(buf, &a, 4);
}

// Load 2 sets of 4 bytes when alignment is not guaranteed.
// Rows 0 and 1 of the source land in lanes 0 and 1 of the returned vector
// (viewed as uint32x2_t).
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x2_t a_u32 = vdup_n_u32(0);
  // stride == 4 means the two rows are contiguous: one 8-byte load suffices.
  if (stride == 4) return vld1_u8(buf);
  // Bounce each row through a uint32_t with memcpy to avoid a misaligned
  // direct uint32_t* dereference (see uint32_to_mem above).
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1_lane_u32(&a, a_u32, 0);
  memcpy(&a, buf, 4);
  a_u32 = vld1_lane_u32(&a, a_u32, 1);
  return vreinterpret_u8_u32(a_u32);
}

// Store 2 sets of 4 bytes when alignment is not guaranteed.
// Lane 0 goes to row 0, lane 1 to row 1 (mirrors load_unaligned_u8).
static INLINE void store_unaligned_u8(uint8_t *buf, int stride,
                                      const uint8x8_t a) {
  const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
  // Contiguous rows: a single 8-byte store covers both.
  if (stride == 4) {
    vst1_u8(buf, a);
    return;
  }
  uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
// Rows 0..3 of the source land in lanes 0..3 of the returned vector (viewed
// as uint32x4_t).
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  // stride == 4 means the four rows are contiguous: one 16-byte load.
  if (stride == 4) return vld1q_u8(buf);
  // Bounce each row through a uint32_t with memcpy to avoid misaligned
  // direct uint32_t* dereferences.
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 2);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}

// Store 4 sets of 4 bytes when alignment is not guaranteed.
// Lane i goes to row i (mirrors load_unaligned_u8q).
static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
                                       const uint8x16_t a) {
  const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
  // Contiguous rows: one 16-byte store covers all four.
  if (stride == 4) {
    vst1q_u8(buf, a);
    return;
  }
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
}

// Load 2 sets of 4 bytes when alignment is guaranteed.
// Both buf and stride must be 4-byte multiples (enforced by the asserts), so
// direct uint32_t lane accesses are safe here.
static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
  uint32x2_t a = vdup_n_u32(0);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  a = vld1_lane_u32((const uint32_t *)buf, a, 0);
  buf += stride;
  a = vld1_lane_u32((const uint32_t *)buf, a, 1);
  return vreinterpret_u8_u32(a);
}

// Store 2 sets of 4 bytes when alignment is guaranteed.
// Both buf and stride must be 4-byte multiples (enforced by the asserts);
// lane 0 goes to row 0, lane 1 to row 1.
static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
  uint32x2_t a_u32 = vreinterpret_u32_u8(a);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  vst1_lane_u32((uint32_t *)buf, a_u32, 0);
  buf += stride;
  vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}
#endif  // VPX_DSP_ARM_MEM_NEON_H_