/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "vpx_ports/mem.h"

/*
 * Each routine below writes the sum of squared differences (SSE) between the
 * source and reference block to *sse and returns the block variance,
 * SSE - sum^2 / N, where N is the number of pixels in the block.
 */

unsigned int vp8_variance16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) {
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        /* Unsigned subtract-long wraps mod 2^16; reinterpreting the result as
         * signed 16-bit yields the correct signed difference. */
        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        /* q8s32 accumulates the sum of differences; q9s32/q10s32 accumulate
         * the sum of squared differences. */
        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum^2 / 256 (16 * 16 pixels) */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum^2 / 128 (16 * 8 pixels) */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d2u8, d4u8, d6u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d2u8, d6u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum^2 / 128 (8 * 16 pixels) */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d1u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d3u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d5u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d7u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d1u8, d5u8);
        q13u16 = vsubl_u8(d2u8, d6u8);
        q14u16 = vsubl_u8(d3u8, d7u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    /* variance = sse - sum^2 / 64 (8 * 8 pixels) */
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
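
/*
 * Illustration only (not part of the original file): a minimal scalar sketch
 * of the computation the NEON routines above implement, kept under #if 0 so
 * it does not affect the build. The name variance_ref_c and its signature are
 * hypothetical. It makes the final right shifts explicit: 8, 7, 7 and 6
 * correspond to dividing sum^2 by 256, 128, 128 and 64 pixels respectively.
 */
#if 0
static unsigned int variance_ref_c(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
                                   int recon_stride,
                                   int w, int h, unsigned int *sse) {
    int i, j;
    int sum = 0;
    unsigned int sse_acc = 0;

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            const int diff = src_ptr[j] - ref_ptr[j];
            sum += diff;
            sse_acc += (unsigned int)(diff * diff);
        }
        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *sse = sse_acc;
    /* variance = SSE - sum^2 / (w * h); the vshr_n_u32 step above performs
     * this division for the power-of-two block sizes. */
    return sse_acc - (unsigned int)(((long long)sum * sum) / (w * h));
}
#endif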