/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

/* MSVC does not provide __builtin_prefetch; make it a no-op there. */
#ifdef _MSC_VER
#define __builtin_prefetch(x)
#endif

/* Variance of a 16x16 block: stores the sum of squared differences between
 * src and ref through *sse and returns sse - (sum * sum) / 256. */
unsigned int vp8_variance16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) {
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    // q8s32 holds the sum of differences, q9s32/q10s32 the sums of squared
    // differences.  Reduce both to scalars, store the SSE, and return
    // sse - (sum * sum) / 256.
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

/* Variance of a 16x8 block: stores the SSE through *sse and returns
 * sse - (sum * sum) / 128. */
unsigned int vp8_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

/* Variance of an 8x16 block: stores the SSE through *sse and returns
 * sse - (sum * sum) / 128. */
unsigned int vp8_variance8x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d2u8, d4u8, d6u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d2u8, d6u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

/* Variance of an 8x8 block: stores the SSE through *sse and returns
 * sse - (sum * sum) / 64. */
unsigned int vp8_variance8x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d1u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d3u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d5u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d7u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d1u8, d5u8);
        q13u16 = vsubl_u8(d2u8, d6u8);
        q14u16 = vsubl_u8(d3u8, d7u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
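
/*
 * Illustrative sketch (not part of the original libvpx source): a scalar C
 * model of what the NEON routines above compute, assuming 8-bit src/ref
 * blocks of w x h pixels.  The sum and the sum of squared differences are
 * accumulated, *sse receives the SSE, and the return value is
 * sse - (sum * sum) / (w * h); the NEON code performs that division with
 * the final vshr_n_u32 by log2(w * h) (8, 7 or 6).  The helper name
 * variance_ref_c is hypothetical, and the block is guarded out with #if 0
 * so it does not affect the build.
 */
#if 0
static unsigned int variance_ref_c(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
                                   int recon_stride,
                                   int w, int h,
                                   unsigned int *sse) {
    int r, c;
    int sum = 0;
    unsigned int sse_acc = 0;

    for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
            const int diff = src_ptr[c] - ref_ptr[c];
            sum += diff;
            sse_acc += (unsigned int)(diff * diff);
        }
        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *sse = sse_acc;
    return sse_acc - (unsigned int)(((long long)sum * sum) / (w * h));
}
#endif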