/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_ARM_MEM_NEON_H_
#define VPX_DSP_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"

// Helper functions used to load tran_low_t into int16, narrowing if necessary.
static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4x2_t v0 = vld2q_s32(buf);
  const int32x4x2_t v1 = vld2q_s32(buf + 8);
  const int16x4_t s0 = vmovn_s32(v0.val[0]);
  const int16x4_t s1 = vmovn_s32(v0.val[1]);
  const int16x4_t s2 = vmovn_s32(v1.val[0]);
  const int16x4_t s3 = vmovn_s32(v1.val[1]);
  int16x8x2_t res;
  res.val[0] = vcombine_s16(s0, s2);
  res.val[1] = vcombine_s16(s1, s3);
  return res;
#else
  return vld2q_s16(buf);
#endif
}

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  const int16x4_t s0 = vmovn_s32(v0);
  const int16x4_t s1 = vmovn_s32(v1);
  return vcombine_s16(s0, s1);
#else
  return vld1q_s16(buf);
#endif
}

static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vld1q_s32(buf);
  return vmovn_s32(v0);
#else
  return vld1_s16(buf);
#endif
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
#if CONFIG_VP9_HIGHBITDEPTH
  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
  vst1q_s32(buf, v0);
  vst1q_s32(buf + 4, v1);
#else
  vst1q_s16(buf, a);
#endif
}

// Propagate type information to the compiler. Without this the compiler may
// assume the required alignment of uint32_t (4 bytes) and add alignment hints
// to the memory access.
//
// This is used for functions operating on uint8_t which wish to load or store
// 4 values at a time but which may not be on 4 byte boundaries.
static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
  memcpy(buf, &a, 4);
}

// Load 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
  uint32_t a;
  uint32x4_t a_u32 = vdupq_n_u32(0);
  if (stride == 4) return vld1q_u8(buf);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 2);
  memcpy(&a, buf, 4);
  buf += stride;
  a_u32 = vld1q_lane_u32(&a, a_u32, 3);
  return vreinterpretq_u8_u32(a_u32);
}
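
// Illustrative usage sketch (not part of the original libvpx header): the
// helper below, whose name is hypothetical, round-trips a block of eight
// coefficients through the tran_low_t helpers above. With
// CONFIG_VP9_HIGHBITDEPTH the load narrows int32 coefficients to int16 and
// the store widens them back; otherwise both are plain 128-bit copies.
static INLINE void add_offset_to_tran_low_8(tran_low_t *buf, int16_t offset) {
  // Load 8 coefficients, narrowing to int16 if tran_low_t is 32 bits wide.
  const int16x8_t coeffs = load_tran_low_to_s16q(buf);
  // Add the offset to every lane.
  const int16x8_t result = vaddq_s16(coeffs, vdupq_n_s16(offset));
  // Store the result, widening back to tran_low_t if necessary.
  store_s16q_to_tran_low(buf, result);
}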

// Store 4 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
                                       const uint8x16_t a) {
  const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
  if (stride == 4) {
    vst1q_u8(buf, a);
    return;
  }
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
  buf += stride;
  uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
}

// Load 2 sets of 4 bytes when alignment is guaranteed.
static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
  uint32x2_t a = vdup_n_u32(0);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  a = vld1_lane_u32((const uint32_t *)buf, a, 0);
  buf += stride;
  a = vld1_lane_u32((const uint32_t *)buf, a, 1);
  return vreinterpret_u8_u32(a);
}

// Store 2 sets of 4 bytes when alignment is guaranteed.
static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
  uint32x2_t a_u32 = vreinterpret_u32_u8(a);

  assert(!((intptr_t)buf % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  vst1_lane_u32((uint32_t *)buf, a_u32, 0);
  buf += stride;
  vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}
#endif  // VPX_DSP_ARM_MEM_NEON_H_
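
// Usage sketch (illustrative only, not part of the original header; the
// helper name avg_4x2 is hypothetical). The aligned pair load_u8()/store_u8()
// composes directly with other NEON intrinsics, e.g. a rounding average of
// two 4x2 blocks whose rows sit on 4-byte boundaries:
//
//   static INLINE void avg_4x2(uint8_t *dst, const uint8_t *a,
//                              const uint8_t *b, int stride) {
//     const uint8x8_t rows_a = load_u8(a, stride);
//     const uint8x8_t rows_b = load_u8(b, stride);
//     store_u8(dst, stride, vrhadd_u8(rows_a, rows_b));
//   }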