1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <arm_neon.h> 12 13 #include "./vpx_config.h" 14 #include "./vpx_dsp_rtcd.h" 15 #include "vpx/vpx_integer.h" 16 #include "vpx_dsp/arm/mem_neon.h" 17 #include "vpx_dsp/arm/sum_neon.h" 18 19 void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, 20 const uint8_t *const ref[4], int ref_stride, 21 uint32_t *res) { 22 int i; 23 const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); 24 for (i = 0; i < 4; ++i) { 25 const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); 26 uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); 27 abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); 28 res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); 29 } 30 } 31 32 void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, 33 const uint8_t *const ref[4], int ref_stride, 34 uint32_t *res) { 35 int i; 36 const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); 37 const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); 38 for (i = 0; i < 4; ++i) { 39 const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); 40 const uint8x16_t ref_1 = 41 load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); 42 uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); 43 abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); 44 abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); 45 abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); 46 res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); 47 } 48 } 49 50 static INLINE void sad8x_4d(const uint8_t *a, int a_stride, 51 const uint8_t *const b[4], int b_stride, 52 uint32_t *result, const int height) { 53 int i, j; 54 uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 55 vdupq_n_u16(0) }; 56 const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; 57 58 for (i = 0; i < height; ++i) { 59 const uint8x8_t a_u8 = vld1_u8(a); 60 a += a_stride; 61 for (j = 0; j < 4; ++j) { 62 const uint8x8_t b_u8 = vld1_u8(b_loop[j]); 63 b_loop[j] += b_stride; 64 sum[j] = vabal_u8(sum[j], a_u8, b_u8); 65 } 66 } 67 68 for (j = 0; j < 4; ++j) { 69 result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); 70 } 71 } 72 73 void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, 74 const uint8_t *const ref[4], int ref_stride, 75 uint32_t *res) { 76 sad8x_4d(src, src_stride, ref, ref_stride, res, 4); 77 } 78 79 void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride, 80 const uint8_t *const ref[4], int ref_stride, 81 uint32_t *res) { 82 sad8x_4d(src, src_stride, ref, ref_stride, res, 8); 83 } 84 85 void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, 86 const uint8_t *const ref[4], int ref_stride, 87 uint32_t *res) { 88 sad8x_4d(src, src_stride, ref, ref_stride, res, 16); 89 } 90 91 static INLINE void sad16x_4d(const uint8_t *a, int a_stride, 92 const uint8_t *const b[4], int b_stride, 93 uint32_t *result, const int height) { 94 int i, j; 95 uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 96 vdupq_n_u16(0) }; 97 const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; 98 99 for (i = 0; i < height; ++i) { 100 const uint8x16_t a_u8 = vld1q_u8(a); 101 a += a_stride; 102 for (j = 0; j < 4; ++j) { 103 const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); 104 b_loop[j] += b_stride; 105 sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); 106 sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); 107 } 108 } 109 110 for (j = 0; j < 4; ++j) { 111 result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); 112 } 113 } 114 115 void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, 116 const uint8_t *const ref[4], int ref_stride, 117 uint32_t *res) { 118 sad16x_4d(src, src_stride, ref, ref_stride, res, 8); 119 } 120 121 void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, 122 const uint8_t *const ref[4], int ref_stride, 123 uint32_t *res) { 124 sad16x_4d(src, src_stride, ref, ref_stride, res, 16); 125 } 126 127 void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, 128 const uint8_t *const ref[4], int ref_stride, 129 uint32_t *res) { 130 sad16x_4d(src, src_stride, ref, ref_stride, res, 32); 131 } 132 133 static INLINE void sad32x_4d(const uint8_t *a, int a_stride, 134 const uint8_t *const b[4], int b_stride, 135 uint32_t *result, const int height) { 136 int i, j; 137 uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), 138 vdupq_n_u16(0) }; 139 const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; 140 141 for (i = 0; i < height; ++i) { 142 const uint8x16_t a_0 = vld1q_u8(a); 143 const uint8x16_t a_1 = vld1q_u8(a + 16); 144 a += a_stride; 145 for (j = 0; j < 4; ++j) { 146 const uint8x16_t b_0 = vld1q_u8(b_loop[j]); 147 const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); 148 b_loop[j] += b_stride; 149 sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); 150 sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); 151 sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); 152 sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); 153 } 154 } 155 156 for (j = 0; j < 4; ++j) { 157 result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); 158 } 159 } 160 161 void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, 162 const uint8_t *const ref[4], int ref_stride, 163 uint32_t *res) { 164 sad32x_4d(src, src_stride, ref, ref_stride, res, 16); 165 } 166 167 void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, 168 const uint8_t *const ref[4], int ref_stride, 169 uint32_t *res) { 170 sad32x_4d(src, src_stride, ref, ref_stride, res, 32); 171 } 172 173 void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, 174 const uint8_t *const ref[4], int ref_stride, 175 uint32_t *res) { 176 sad32x_4d(src, src_stride, ref, ref_stride, res, 64); 177 } 178 179 static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, 180 const uint8x16_t b_0, const uint8x16_t b_1, 181 uint16x8_t *sum) { 182 *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); 183 *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); 184 *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); 185 *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); 186 } 187 188 static INLINE void sad64x_4d(const uint8_t *a, int a_stride, 189 const uint8_t *const b[4], int b_stride, 190 uint32_t *result, const int height) { 191 int i; 192 uint16x8_t sum_0 = vdupq_n_u16(0); 193 uint16x8_t sum_1 = vdupq_n_u16(0); 194 uint16x8_t sum_2 = vdupq_n_u16(0); 195 uint16x8_t sum_3 = vdupq_n_u16(0); 196 uint16x8_t sum_4 = vdupq_n_u16(0); 197 uint16x8_t sum_5 = vdupq_n_u16(0); 198 uint16x8_t sum_6 = vdupq_n_u16(0); 199 uint16x8_t sum_7 = vdupq_n_u16(0); 200 const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; 201 202 for (i = 0; i < height; ++i) { 203 const uint8x16_t a_0 = vld1q_u8(a); 204 const uint8x16_t a_1 = vld1q_u8(a + 16); 205 const uint8x16_t a_2 = vld1q_u8(a + 32); 206 const uint8x16_t a_3 = vld1q_u8(a + 48); 207 a += a_stride; 208 sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); 209 sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), 210 &sum_1); 211 b_loop[0] += b_stride; 212 sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); 213 sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), 214 &sum_3); 215 b_loop[1] += b_stride; 216 sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); 217 sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), 218 &sum_5); 219 b_loop[2] += b_stride; 220 sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); 221 sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48), 222 &sum_7); 223 b_loop[3] += b_stride; 224 } 225 226 result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); 227 result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); 228 result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); 229 result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); 230 } 231 232 void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, 233 const uint8_t *const ref[4], int ref_stride, 234 uint32_t *res) { 235 sad64x_4d(src, src_stride, ref, ref_stride, res, 32); 236 } 237 238 void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, 239 const uint8_t *const ref[4], int ref_stride, 240 uint32_t *res) { 241 sad64x_4d(src, src_stride, ref, ref_stride, res, 64); 242 } 243