/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

/* Reduce two u16 accumulator vectors to one scalar, widening each vector
 * to 32 bits *before* the cross-vector add.  Needed for large blocks
 * (e.g. 64x64): each u16 lane stays in range (64 rows * 4 bytes * 255 =
 * 65280 <= 65535), but adding lanes from the two accumulators in 16 bits
 * could overflow, and the total SAD (up to 64*64*255) needs 32 bits. */
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

/* Reduce one u16 accumulator vector to a scalar via pairwise widening
 * adds (u16 -> u32 -> u64), then add the two 64-bit halves. */
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

/* SAD of an 8x16 block.  Each u16 accumulator lane sums 16 absolute
 * differences <= 255 (max 4080), so it cannot overflow. */
unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);

  for (i = 0; i < 16; i++) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad = vabal_u8(sad, s, r);
  }

  return horizontal_add_16x8(sad);
}

/* SAD of a 4x4 block.  As in the original code, 8 bytes are loaded per
 * row and the upper 4 lanes are discarded at reduction time; each row
 * must therefore have at least 8 readable bytes.
 * NOTE(review): presumably guaranteed by libvpx buffer padding --
 * confirm before reusing with externally allocated buffers. */
unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride,
                             unsigned char *ref_ptr, int ref_stride) {
  int i;
  uint16x8_t sad = vdupq_n_u16(0);
  uint32x2_t d1;
  uint64x1_t d3;

  for (i = 0; i < 4; i++) {
    const uint8x8_t s = vld1_u8(src_ptr);
    const uint8x8_t r = vld1_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad = vabal_u8(sad, s, r);
  }

  /* Only the low four u16 lanes correspond to the 4x4 block. */
  d1 = vpaddl_u16(vget_low_u16(sad));
  d3 = vpaddl_u32(d1);

  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

/* SAD of a 16x8 block, accumulated in two u16 vectors (low/high byte
 * halves of each 16-byte row).  Each lane sums 8 values <= 255 and the
 * lo+hi merge is at most 4080 per lane, so u16 cannot overflow. */
unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  int i;
  uint16x8_t sad_lo = vdupq_n_u16(0);
  uint16x8_t sad_hi = vdupq_n_u16(0);

  for (i = 0; i < 8; i++) {
    const uint8x16_t s = vld1q_u8(src_ptr);
    const uint8x16_t r = vld1q_u8(ref_ptr);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    sad_lo = vabal_u8(sad_lo, vget_low_u8(s), vget_low_u8(r));
    sad_hi = vabal_u8(sad_hi, vget_high_u8(s), vget_high_u8(r));
  }

  return horizontal_add_16x8(vaddq_u16(sad_lo, sad_hi));
}

/* SAD of a 64x64 block.  Each row is processed as four 16-byte chunks;
 * per u16 lane: 64 rows * 4 chunks * 255 = 65280, which just fits, so
 * the final reduction must widen before adding (see
 * horizontal_long_add_16x8). */
unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
                            vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                            vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                            vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                            vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}

/* SAD of a 32x32 block.  Per u16 lane: 32 rows * 2 chunks * 255 = 16320,
 * and the lo+hi merge is at most 32640, so 16-bit accumulation is safe
 * and the plain reduction suffices. */
unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

/* SAD of a 16x16 block; per u16 lane at most 16 * 255 = 4080, and the
 * lo+hi merge at most 8160, so u16 accumulation cannot overflow. */
unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo =
        vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
    vec_accum_hi =
        vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

/* SAD of an 8x8 block; per u16 lane at most 8 * 255 = 2040. */
unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}