/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <string.h>
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_ports/mem.h"

static const int8_t vp8_sub_pel_filters[8][8] = {
  { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positions are */
  { 0, -6, 123, 12, -1, 0, 0, 0 },  /* just as per alpha -0.5 bicubic */
  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -9, 93, 50, -6, 0, 0, 0 },
  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
  { 0, -6, 50, 93, -9, 0, 0, 0 },
  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -1, 12, 123, -6, 0, 0, 0 },
};

// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
// Elements 1 and 4 are either 0 or negative. The code accounts for this with
// multiply/accumulates which either add or subtract as needed. The other
// functions will be updated to use this table later.
// It is also expanded to 8 elements to allow loading into 64 bit neon
// registers.
static const uint8_t abs_filters[8][8] = {
  { 0, 0, 128, 0, 0, 0, 0, 0 },   { 0, 6, 123, 12, 1, 0, 0, 0 },
  { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
  { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
  { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
};

static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
}

static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
  const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
                                       vreinterpret_u32_u8(vget_high_u8(a)));
  const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
                                       vreinterpret_u32_u8(vget_high_u8(b)));
  *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
  *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
}

static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
  const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
                                       vreinterpret_u32_u8(vget_high_u8(a)));
  const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
                                       vreinterpret_u32_u8(vget_high_u8(b)));
  *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
  *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
}
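
// A minimal scalar sketch of the tap arithmetic the NEON code performs with
// abs_filters[]: taps 0, 2, 3 and 5 are accumulated with vmlal (add) and taps
// 1 and 4 with vmlsl (subtract), matching the signs in vp8_sub_pel_filters[],
// followed by the usual VP8 rounding of (sum + 64) >> 7 and a clamp to
// [0, 255]. The function name and this helper are introduced here purely for
// illustration; nothing below calls it.
static INLINE unsigned char sixtap_single_pixel_sketch(
    const unsigned char *src, int pixel_step, const uint8_t *filter) {
  // src points at the output position; the 6 taps cover offsets -2 .. +3.
  const int sum = src[-2 * pixel_step] * filter[0] -
                  src[-1 * pixel_step] * filter[1] +
                  src[0 * pixel_step] * filter[2] +
                  src[1 * pixel_step] * filter[3] -
                  src[2 * pixel_step] * filter[4] +
                  src[3 * pixel_step] * filter[5];
  const int rounded = (sum + 64) >> 7;  // VP8 filter shift of 7
  if (rounded < 0) return 0;
  if (rounded > 255) return 255;
  return (unsigned char)rounded;
}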

static INLINE void yonly4x4(const unsigned char *src, int src_stride,
                            int filter_offset, unsigned char *dst,
                            int dst_stride) {
  uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
  uint16x8_t c0, c1, c2, c3;
  int16x8_t d0, d1;
  uint8x8_t e0, e1;

  const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
  const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
  const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
  const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
  const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
  const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
  const uint8x8_t filter5 = vdup_lane_u8(filter, 5);

  src -= src_stride * 2;
  // Shift the even rows to allow using 'vext' to combine the vectors. armv8
  // has vcopy_lane which would be interesting. This started as just a
  // horrible workaround for clang adding alignment hints to 32bit loads:
  // https://llvm.org/bugs/show_bug.cgi?id=24421
  // But it turns out to be almost identical to casting the loads.
  a0 = load_and_shift(src);
  src += src_stride;
  a1 = vld1_u8(src);
  src += src_stride;
  a2 = load_and_shift(src);
  src += src_stride;
  a3 = vld1_u8(src);
  src += src_stride;
  a4 = load_and_shift(src);
  src += src_stride;
  a5 = vld1_u8(src);
  src += src_stride;
  a6 = load_and_shift(src);
  src += src_stride;
  a7 = vld1_u8(src);
  src += src_stride;
  a8 = vld1_u8(src);

  // Combine the rows so we can operate on 8 at a time.
  b0 = vext_u8(a0, a1, 4);
  b2 = vext_u8(a2, a3, 4);
  b4 = vext_u8(a4, a5, 4);
  b6 = vext_u8(a6, a7, 4);
  b8 = a8;

  // To keep with the 8-at-a-time theme, combine *alternate* rows. This
  // allows combining the odd rows with the even.
  b1 = vext_u8(b0, b2, 4);
  b3 = vext_u8(b2, b4, 4);
  b5 = vext_u8(b4, b6, 4);
  b7 = vext_u8(b6, b8, 4);

  // Multiply and expand to 16 bits.
  c0 = vmull_u8(b0, filter0);
  c1 = vmull_u8(b2, filter0);
  c2 = vmull_u8(b5, filter5);
  c3 = vmull_u8(b7, filter5);

  // Multiply, subtract and accumulate for filters 1 and 4 (the negative
  // ones).
  c0 = vmlsl_u8(c0, b4, filter4);
  c1 = vmlsl_u8(c1, b6, filter4);
  c2 = vmlsl_u8(c2, b1, filter1);
  c3 = vmlsl_u8(c3, b3, filter1);

  // Add more positive ones. vmlal should really return a signed type.
  // It's doing signed math internally, as evidenced by the fact we can do
  // subtractions followed by more additions. Ideally we could use
  // vqmlal/sl but that instruction doesn't exist. Might be able to
  // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
  c0 = vmlal_u8(c0, b2, filter2);
  c1 = vmlal_u8(c1, b4, filter2);
  c2 = vmlal_u8(c2, b3, filter3);
  c3 = vmlal_u8(c3, b5, filter3);

  // Use signed saturation math because vmlsl may have left some negative
  // numbers in there.
  d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
  d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));

  // Use signed again because numbers like -200 need to be saturated to 0.
  e0 = vqrshrun_n_s16(d0, 7);
  e1 = vqrshrun_n_s16(d1, 7);

  store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
}
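
// A scalar sketch of what yonly4x4() computes: a vertical-only 6-tap over a
// 4x4 block, reading two rows of context above and three below each output
// row. It assumes the sixtap_single_pixel_sketch() helper introduced above
// and is illustrative only; the NEON version instead packs two 4-pixel rows
// into each 8-lane vector so every multiply covers two output rows.
static INLINE void yonly4x4_scalar_sketch(const unsigned char *src,
                                          int src_stride, int filter_offset,
                                          unsigned char *dst, int dst_stride) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      dst[r * dst_stride + c] = sixtap_single_pixel_sketch(
          src + r * src_stride + c, src_stride, abs_filters[filter_offset]);
    }
  }
}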

void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                int xoffset, int yoffset,
                                unsigned char *dst_ptr, int dst_pitch) {
  uint8x16_t s0, s1, s2, s3, s4;
  uint64x2_t s01, s23;
  // Variables to hold src[] elements for the given filter[]
  uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
  uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
  uint8x16_t s01_f0, s23_f0;
  uint64x2_t s01_f3, s23_f3;
  uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
  // Accumulator variables.
  uint16x8_t d0123, d4567, d89;
  uint16x8_t d0123_a, d4567_a, d89_a;
  int16x8_t e0123, e4567, e89;
  // Second pass intermediates.
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
  uint16x8_t c0, c1, c2, c3;
  int16x8_t d0, d1;
  uint8x8_t e0, e1;
  uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;

  if (xoffset == 0) {  // Second pass only.
    yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
    return;
  }

  if (yoffset == 0) {  // First pass only.
    src_ptr -= 2;
  } else {  // Add context for the second pass. 2 extra lines on top.
    src_ptr -= 2 + (src_pixels_per_line * 2);
  }

  filter = vld1_u8(abs_filters[xoffset]);
  filter0 = vdup_lane_u8(filter, 0);
  filter1 = vdup_lane_u8(filter, 1);
  filter2 = vdup_lane_u8(filter, 2);
  filter3 = vdup_lane_u8(filter, 3);
  filter4 = vdup_lane_u8(filter, 4);
  filter5 = vdup_lane_u8(filter, 5);

  // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
  // garbage. So much effort for that last single bit.
  // The low values of each pair are for filter0.
  s0 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s1 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s2 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s3 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;

  // Shift to extract values for filter[5]
  // If the original (unadjusted) src[] held the values 0, 1, 2, ... this puts:
  //  3 4 5 6 7 8 9 10 in s0_f5
  // Can't use vshr.u64 because it crosses the double word boundary.
  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);

  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));

  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);

  // Keep original src data as 64 bits to simplify shifting and extracting.
  s01 = vreinterpretq_u64_u8(s01_f0);
  s23 = vreinterpretq_u64_u8(s23_f0);

  // 3 4 5 6 * filter0
  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);

  // Shift over one to use -1, 0, 1, 2 for filter1
  // -1 0 1 2 * filter1
  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
                        &d0123, &d4567);

  // 2 3 4 5 * filter4
  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
                        &d0123, &d4567);

  // 0 1 2 3 * filter2
  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
                        &d0123, &d4567);

  // 1 2 3 4 * filter3
  s01_f3 = vshrq_n_u64(s01, 24);
  s23_f3 = vshrq_n_u64(s23, 24);
  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
  // Accumulate into different registers so it can use saturated addition.
  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);

  e0123 =
      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
  e4567 =
      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));

  // Shift and narrow.
  b0 = vqrshrun_n_s16(e0123, 7);
  b2 = vqrshrun_n_s16(e4567, 7);

  if (yoffset == 0) {  // firstpass_filter4x4_only
    store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
    return;
  }

  // Load additional context when doing both filters.
  s0 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s1 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s2 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s3 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s4 = vld1q_u8(src_ptr);

  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
  s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);

  // 3 4 5 6 * filter0
  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));

  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
  // But this time instead of 16 pixels to filter, there are 20. So an extra
  // run with a doubleword register.
  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
  d89 = vmull_u8(s4_f5, filter5);

  // Save a copy as u64 for shifting.
  s01 = vreinterpretq_u64_u8(s01_f0);
  s23 = vreinterpretq_u64_u8(s23_f0);

  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
  d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);

  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
                        &d0123, &d4567);
  s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
  d89 = vmlsl_u8(d89, s4_f1, filter1);

  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
                        &d0123, &d4567);
  s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
  d89 = vmlsl_u8(d89, s4_f4, filter4);

  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
                        &d0123, &d4567);
  s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
  d89 = vmlal_u8(d89, s4_f2, filter2);

  s01_f3 = vshrq_n_u64(s01, 24);
  s23_f3 = vshrq_n_u64(s23, 24);
  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
  s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
  d89_a = vmull_u8(s4_f3, filter3);

  e0123 =
      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
  e4567 =
      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
  e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));

  b4 = vqrshrun_n_s16(e0123, 7);
  b6 = vqrshrun_n_s16(e4567, 7);
  b8 = vqrshrun_n_s16(e89, 7);

  // Second pass: 4x4
  filter = vld1_u8(abs_filters[yoffset]);
  filter0 = vdup_lane_u8(filter, 0);
  filter1 = vdup_lane_u8(filter, 1);
  filter2 = vdup_lane_u8(filter, 2);
  filter3 = vdup_lane_u8(filter, 3);
  filter4 = vdup_lane_u8(filter, 4);
  filter5 = vdup_lane_u8(filter, 5);

  b1 = vext_u8(b0, b2, 4);
  b3 = vext_u8(b2, b4, 4);
  b5 = vext_u8(b4, b6, 4);
  b7 = vext_u8(b6, b8, 4);

  c0 = vmull_u8(b0, filter0);
  c1 = vmull_u8(b2, filter0);
  c2 = vmull_u8(b5, filter5);
  c3 = vmull_u8(b7, filter5);

  c0 = vmlsl_u8(c0, b4, filter4);
  c1 = vmlsl_u8(c1, b6, filter4);
  c2 = vmlsl_u8(c2, b1, filter1);
  c3 = vmlsl_u8(c3, b3, filter1);

  c0 = vmlal_u8(c0, b2, filter2);
  c1 = vmlal_u8(c1, b4, filter2);
  c2 = vmlal_u8(c2, b3, filter3);
  c3 = vmlal_u8(c3, b5, filter3);

  d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
  d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));

  e0 = vqrshrun_n_s16(d0, 7);
  e1 = vqrshrun_n_s16(d1, 7);

  store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
}
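
// The wider predictors below follow the same two-pass pattern: when both
// offsets are nonzero, the horizontal pass produces block_height + 5 rows of
// intermediate data so the vertical pass has its two rows of context above
// and three below. That is 9 rows for 4x4, 13 for 8x8 and 21 for 16x16
// (tmp[336] == 21 * 16 in the 16x16 function). A scalar sketch of the
// pattern, assuming both offsets are nonzero and using the hypothetical
// sixtap_single_pixel_sketch() helper above; illustrative only.
static INLINE void sixtap_two_pass_sketch(const unsigned char *src,
                                          int src_stride, int xoffset,
                                          int yoffset, unsigned char *dst,
                                          int dst_stride, int size) {
  unsigned char itmp[21 * 16];  // worst case: 16x16 needs 21 rows
  int r, c;
  // First pass: horizontal filter, starting two rows above the block.
  for (r = 0; r < size + 5; ++r) {
    for (c = 0; c < size; ++c) {
      itmp[r * size + c] = sixtap_single_pixel_sketch(
          src + (r - 2) * src_stride + c, 1, abs_filters[xoffset]);
    }
  }
  // Second pass: vertical filter over the intermediate rows.
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) {
      dst[r * dst_stride + c] = sixtap_single_pixel_sketch(
          itmp + (r + 2) * size + c, size, abs_filters[yoffset]);
    }
  }
}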

void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                int xoffset, int yoffset,
                                unsigned char *dst_ptr, int dst_pitch) {
  unsigned char *src;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
  uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;

  if (xoffset == 0) {  // secondpass_filter8x4_only
    // load second_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // load src data
    src = src_ptr - src_pixels_per_line * 2;
    d22u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d23u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d24u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d25u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d26u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d27u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d28u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d29u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d30u8 = vld1_u8(src);

    q3u16 = vmull_u8(d22u8, d0u8);
    q4u16 = vmull_u8(d23u8, d0u8);
    q5u16 = vmull_u8(d24u8, d0u8);
    q6u16 = vmull_u8(d25u8, d0u8);

    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

    q7u16 = vmull_u8(d25u8, d3u8);
    q8u16 = vmull_u8(d26u8, d3u8);
    q9u16 = vmull_u8(d27u8, d3u8);
    q10u16 = vmull_u8(d28u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    return;
  }

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (9x8)
  if (yoffset == 0)  // firstpass_filter8x4_only
    src = src_ptr - 2;
  else
    src = src_ptr - 2 - (src_pixels_per_line * 2);
  q3u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q4u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q5u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q6u8 = vld1q_u8(src);

  q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);

  q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
  q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
  q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
  q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);

  q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
  q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
  q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
  q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);

  q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
  q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
  q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
  q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

  q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
  q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
  q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
  q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);

  q3u16 = vmull_u8(d28u8, d3u8);
  q4u16 = vmull_u8(d29u8, d3u8);
  q5u16 = vmull_u8(d30u8, d3u8);
  q6u16 = vmull_u8(d31u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);

  q7s16 = vqaddq_s16(q7s16, q3s16);
  q8s16 = vqaddq_s16(q8s16, q4s16);
  q9s16 = vqaddq_s16(q9s16, q5s16);
  q10s16 = vqaddq_s16(q10s16, q6s16);

  d22u8 = vqrshrun_n_s16(q7s16, 7);
  d23u8 = vqrshrun_n_s16(q8s16, 7);
  d24u8 = vqrshrun_n_s16(q9s16, 7);
  d25u8 = vqrshrun_n_s16(q10s16, 7);

  if (yoffset == 0) {  // firstpass_filter8x4_only
    vst1_u8(dst_ptr, d22u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d23u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d24u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d25u8);
    return;
  }

  // First pass on the remaining 5 lines of data
  src += src_pixels_per_line;
  q3u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q4u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q5u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q6u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q7u8 = vld1q_u8(src);

  q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);

  q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);

  q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);

  q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);

  q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);

  q3u16 = vmull_u8(d27u8, d3u8);
  q4u16 = vmull_u8(d28u8, d3u8);
  q5u16 = vmull_u8(d29u8, d3u8);
  q6u16 = vmull_u8(d30u8, d3u8);
  q7u16 = vmull_u8(d31u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);
  q11s16 = vreinterpretq_s16_u16(q11u16);
  q12s16 = vreinterpretq_s16_u16(q12u16);

  q8s16 = vqaddq_s16(q8s16, q3s16);
  q9s16 = vqaddq_s16(q9s16, q4s16);
  q10s16 = vqaddq_s16(q10s16, q5s16);
  q11s16 = vqaddq_s16(q11s16, q6s16);
  q12s16 = vqaddq_s16(q12s16, q7s16);

  d26u8 = vqrshrun_n_s16(q8s16, 7);
  d27u8 = vqrshrun_n_s16(q9s16, 7);
  d28u8 = vqrshrun_n_s16(q10s16, 7);
  d29u8 = vqrshrun_n_s16(q11s16, 7);
  d30u8 = vqrshrun_n_s16(q12s16, 7);

  // Second pass: 8x4
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  q3u16 = vmull_u8(d22u8, d0u8);
  q4u16 = vmull_u8(d23u8, d0u8);
  q5u16 = vmull_u8(d24u8, d0u8);
  q6u16 = vmull_u8(d25u8, d0u8);

  q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
  q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
  q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
  q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

  q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
  q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
  q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
  q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

  q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
  q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
  q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
  q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

  q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
  q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
  q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
  q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

  q7u16 = vmull_u8(d25u8, d3u8);
  q8u16 = vmull_u8(d26u8, d3u8);
  q9u16 = vmull_u8(d27u8, d3u8);
  q10u16 = vmull_u8(d28u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);

  q7s16 = vqaddq_s16(q7s16, q3s16);
  q8s16 = vqaddq_s16(q8s16, q4s16);
  q9s16 = vqaddq_s16(q9s16, q5s16);
  q10s16 = vqaddq_s16(q10s16, q6s16);

  d6u8 = vqrshrun_n_s16(q7s16, 7);
  d7u8 = vqrshrun_n_s16(q8s16, 7);
  d8u8 = vqrshrun_n_s16(q9s16, 7);
  d9u8 = vqrshrun_n_s16(q10s16, 7);

  vst1_u8(dst_ptr, d6u8);
  dst_ptr += dst_pitch;
  vst1_u8(dst_ptr, d7u8);
  dst_ptr += dst_pitch;
  vst1_u8(dst_ptr, d8u8);
  dst_ptr += dst_pitch;
  vst1_u8(dst_ptr, d9u8);
  return;
}
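
// The 8x8 and 16x16 second passes below keep the vertical 6-tap window in
// registers and, after writing four output rows, slide the window down by
// four rows (the "d18u8 = d22u8; ..." register rotation). A scalar sketch of
// one such 4-row step, assuming rows[] holds at least 9 consecutive
// 8-pixel-wide intermediate rows and the hypothetical
// sixtap_single_pixel_sketch() helper above; illustrative only.
static INLINE void vertical_4row_step_sketch(const unsigned char *rows,
                                             int row_stride,
                                             const uint8_t *filter,
                                             unsigned char *dst,
                                             int dst_pitch) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 8; ++c) {
      // Output row r uses input rows r .. r + 5 of the window.
      dst[r * dst_pitch + c] = sixtap_single_pixel_sketch(
          rows + (r + 2) * row_stride + c, row_stride, filter);
    }
  }
}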

void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
                                int xoffset, int yoffset,
                                unsigned char *dst_ptr, int dst_pitch) {
  unsigned char *src, *tmpp;
  unsigned char tmp[64];
  int i;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
  uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;

  if (xoffset == 0) {  // secondpass_filter8x8_only
    // load second_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // load src data
    src = src_ptr - src_pixels_per_line * 2;
    d18u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d19u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d20u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d21u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d22u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d23u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d24u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d25u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d26u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d27u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d28u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d29u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d30u8 = vld1_u8(src);

    for (i = 2; i > 0; i--) {
      q3u16 = vmull_u8(d18u8, d0u8);
      q4u16 = vmull_u8(d19u8, d0u8);
      q5u16 = vmull_u8(d20u8, d0u8);
      q6u16 = vmull_u8(d21u8, d0u8);

      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

      q7u16 = vmull_u8(d21u8, d3u8);
      q8u16 = vmull_u8(d22u8, d3u8);
      q9u16 = vmull_u8(d23u8, d3u8);
      q10u16 = vmull_u8(d24u8, d3u8);

      q3s16 = vreinterpretq_s16_u16(q3u16);
      q4s16 = vreinterpretq_s16_u16(q4u16);
      q5s16 = vreinterpretq_s16_u16(q5u16);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);

      q7s16 = vqaddq_s16(q7s16, q3s16);
      q8s16 = vqaddq_s16(q8s16, q4s16);
      q9s16 = vqaddq_s16(q9s16, q5s16);
      q10s16 = vqaddq_s16(q10s16, q6s16);

      d6u8 = vqrshrun_n_s16(q7s16, 7);
      d7u8 = vqrshrun_n_s16(q8s16, 7);
      d8u8 = vqrshrun_n_s16(q9s16, 7);
      d9u8 = vqrshrun_n_s16(q10s16, 7);

      d18u8 = d22u8;
      d19u8 = d23u8;
      d20u8 = d24u8;
      d21u8 = d25u8;
      d22u8 = d26u8;
      d23u8 = d27u8;
      d24u8 = d28u8;
      d25u8 = d29u8;
      d26u8 = d30u8;

      vst1_u8(dst_ptr, d6u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d7u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d8u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d9u8);
      dst_ptr += dst_pitch;
    }
    return;
  }

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (13x8)
  if (yoffset == 0)  // firstpass_filter8x8_only
    src = src_ptr - 2;
  else
    src = src_ptr - 2 - (src_pixels_per_line * 2);

  tmpp = tmp;
  for (i = 2; i > 0; i--) {
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;

    __builtin_prefetch(src);
    __builtin_prefetch(src + src_pixels_per_line);
    __builtin_prefetch(src + src_pixels_per_line * 2);

    q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);

    q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);

    q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);

    q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);

    q3u16 = vmull_u8(d28u8, d3u8);
    q4u16 = vmull_u8(d29u8, d3u8);
    q5u16 = vmull_u8(d30u8, d3u8);
    q6u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d22u8 = vqrshrun_n_s16(q7s16, 7);
    d23u8 = vqrshrun_n_s16(q8s16, 7);
    d24u8 = vqrshrun_n_s16(q9s16, 7);
    d25u8 = vqrshrun_n_s16(q10s16, 7);

    if (yoffset == 0) {  // firstpass_filter8x8_only
      vst1_u8(dst_ptr, d22u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d23u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d24u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d25u8);
      dst_ptr += dst_pitch;
    } else {
      vst1_u8(tmpp, d22u8);
      tmpp += 8;
      vst1_u8(tmpp, d23u8);
      tmpp += 8;
      vst1_u8(tmpp, d24u8);
      tmpp += 8;
      vst1_u8(tmpp, d25u8);
      tmpp += 8;
    }
  }
  if (yoffset == 0) return;

  // First pass on the remaining 5 lines of data
  q3u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q4u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q5u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q6u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q7u8 = vld1q_u8(src);

  q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);

  q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);

  q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);

  q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);

  q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);

  q3u16 = vmull_u8(d27u8, d3u8);
  q4u16 = vmull_u8(d28u8, d3u8);
  q5u16 = vmull_u8(d29u8, d3u8);
  q6u16 = vmull_u8(d30u8, d3u8);
  q7u16 = vmull_u8(d31u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);
  q11s16 = vreinterpretq_s16_u16(q11u16);
  q12s16 = vreinterpretq_s16_u16(q12u16);

  q8s16 = vqaddq_s16(q8s16, q3s16);
  q9s16 = vqaddq_s16(q9s16, q4s16);
  q10s16 = vqaddq_s16(q10s16, q5s16);
  q11s16 = vqaddq_s16(q11s16, q6s16);
  q12s16 = vqaddq_s16(q12s16, q7s16);

  d26u8 = vqrshrun_n_s16(q8s16, 7);
  d27u8 = vqrshrun_n_s16(q9s16, 7);
  d28u8 = vqrshrun_n_s16(q10s16, 7);
  d29u8 = vqrshrun_n_s16(q11s16, 7);
  d30u8 = vqrshrun_n_s16(q12s16, 7);

  // Second pass: 8x8
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  tmpp = tmp;
  q9u8 = vld1q_u8(tmpp);
  tmpp += 16;
  q10u8 = vld1q_u8(tmpp);
  tmpp += 16;
  q11u8 = vld1q_u8(tmpp);
  tmpp += 16;
  q12u8 = vld1q_u8(tmpp);

  d18u8 = vget_low_u8(q9u8);
  d19u8 = vget_high_u8(q9u8);
  d20u8 = vget_low_u8(q10u8);
  d21u8 = vget_high_u8(q10u8);
  d22u8 = vget_low_u8(q11u8);
  d23u8 = vget_high_u8(q11u8);
  d24u8 = vget_low_u8(q12u8);
  d25u8 = vget_high_u8(q12u8);

  for (i = 2; i > 0; i--) {
    q3u16 = vmull_u8(d18u8, d0u8);
    q4u16 = vmull_u8(d19u8, d0u8);
    q5u16 = vmull_u8(d20u8, d0u8);
    q6u16 = vmull_u8(d21u8, d0u8);

    q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

    q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

    q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

    q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

    q7u16 = vmull_u8(d21u8, d3u8);
    q8u16 = vmull_u8(d22u8, d3u8);
    q9u16 = vmull_u8(d23u8, d3u8);
    q10u16 = vmull_u8(d24u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    d18u8 = d22u8;
    d19u8 = d23u8;
    d20u8 = d24u8;
    d21u8 = d25u8;
    d22u8 = d26u8;
    d23u8 = d27u8;
    d24u8 = d28u8;
    d25u8 = d29u8;
    d26u8 = d30u8;

    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    dst_ptr += dst_pitch;
  }
  return;
}
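
// vp8_sixtap_predict16x16_neon() below writes its whole first pass to
// tmp[336]: seven loop iterations of three 16-pixel rows each, i.e. 21 rows
// (16 output rows plus 5 rows of vertical context), stored row by row with a
// stride of 16. The second pass then works the two 8-column halves
// independently. A small indexing sketch of that assumed layout; the helper
// name is introduced here for illustration only and is not used below.
static INLINE const unsigned char *tmp16_second_pass_src_sketch(
    const unsigned char *tmp, int out_row, int half) {
  // The vertical filter for output row 'out_row' (0..15) of half 'half'
  // (0 or 1) starts at intermediate row 'out_row' and consumes six
  // consecutive rows of 8 bytes each.
  return tmp + out_row * 16 + 8 * half;
}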

void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
                                  int src_pixels_per_line, int xoffset,
                                  int yoffset, unsigned char *dst_ptr,
                                  int dst_pitch) {
  unsigned char *src, *src_tmp, *dst, *tmpp;
  unsigned char tmp[336];
  int i, j;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
  uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
  uint8x8_t d28u8, d29u8, d30u8, d31u8;
  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  uint8x16_t q3u8, q4u8;
  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
  uint16x8_t q11u16, q12u16, q13u16, q15u16;
  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
  int16x8_t q11s16, q12s16, q13s16, q15s16;

  if (xoffset == 0) {  // secondpass_filter16x16_only
    // load second_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // load src data
    src_tmp = src_ptr - src_pixels_per_line * 2;
    for (i = 0; i < 2; ++i) {
      src = src_tmp + i * 8;
      dst = dst_ptr + i * 8;
      d18u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d19u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d20u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d21u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d22u8 = vld1_u8(src);
      src += src_pixels_per_line;
      for (j = 0; j < 4; ++j) {
        d23u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d24u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d25u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d26u8 = vld1_u8(src);
        src += src_pixels_per_line;

        q3u16 = vmull_u8(d18u8, d0u8);
        q4u16 = vmull_u8(d19u8, d0u8);
        q5u16 = vmull_u8(d20u8, d0u8);
        q6u16 = vmull_u8(d21u8, d0u8);

        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

        q7u16 = vmull_u8(d21u8, d3u8);
        q8u16 = vmull_u8(d22u8, d3u8);
        q9u16 = vmull_u8(d23u8, d3u8);
        q10u16 = vmull_u8(d24u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);

        d18u8 = d22u8;
        d19u8 = d23u8;
        d20u8 = d24u8;
        d21u8 = d25u8;
        d22u8 = d26u8;

        vst1_u8(dst, d6u8);
        dst += dst_pitch;
        vst1_u8(dst, d7u8);
        dst += dst_pitch;
        vst1_u8(dst, d8u8);
        dst += dst_pitch;
        vst1_u8(dst, d9u8);
        dst += dst_pitch;
      }
    }
    return;
  }

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (21x16)
  if (yoffset == 0) {  // firstpass_filter16x16_only
    src = src_ptr - 2;
    dst = dst_ptr;
    for (i = 0; i < 8; ++i) {
      d6u8 = vld1_u8(src);
      d7u8 = vld1_u8(src + 8);
      d8u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;
      d9u8 = vld1_u8(src);
      d10u8 = vld1_u8(src + 8);
      d11u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;

      __builtin_prefetch(src);
      __builtin_prefetch(src + src_pixels_per_line);

      q6u16 = vmull_u8(d6u8, d0u8);
      q7u16 = vmull_u8(d7u8, d0u8);
      q8u16 = vmull_u8(d9u8, d0u8);
      q9u16 = vmull_u8(d10u8, d0u8);

      d20u8 = vext_u8(d6u8, d7u8, 1);
      d21u8 = vext_u8(d9u8, d10u8, 1);
      d22u8 = vext_u8(d7u8, d8u8, 1);
      d23u8 = vext_u8(d10u8, d11u8, 1);
      d24u8 = vext_u8(d6u8, d7u8, 4);
      d25u8 = vext_u8(d9u8, d10u8, 4);
      d26u8 = vext_u8(d7u8, d8u8, 4);
      d27u8 = vext_u8(d10u8, d11u8, 4);
      d28u8 = vext_u8(d6u8, d7u8, 5);
      d29u8 = vext_u8(d9u8, d10u8, 5);

      q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
      q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
      q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
      q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
      q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
      q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
      q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
      q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
      q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

      d20u8 = vext_u8(d7u8, d8u8, 5);
      d21u8 = vext_u8(d10u8, d11u8, 5);
      d22u8 = vext_u8(d6u8, d7u8, 2);
      d23u8 = vext_u8(d9u8, d10u8, 2);
      d24u8 = vext_u8(d7u8, d8u8, 2);
      d25u8 = vext_u8(d10u8, d11u8, 2);
      d26u8 = vext_u8(d6u8, d7u8, 3);
      d27u8 = vext_u8(d9u8, d10u8, 3);
      d28u8 = vext_u8(d7u8, d8u8, 3);
      d29u8 = vext_u8(d10u8, d11u8, 3);

      q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
      q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
      q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
      q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
      q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

      q10u16 = vmull_u8(d26u8, d3u8);
      q11u16 = vmull_u8(d27u8, d3u8);
      q12u16 = vmull_u8(d28u8, d3u8);
      q15u16 = vmull_u8(d29u8, d3u8);

      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);
      q11s16 = vreinterpretq_s16_u16(q11u16);
      q12s16 = vreinterpretq_s16_u16(q12u16);
      q15s16 = vreinterpretq_s16_u16(q15u16);

      q6s16 = vqaddq_s16(q6s16, q10s16);
      q8s16 = vqaddq_s16(q8s16, q11s16);
      q7s16 = vqaddq_s16(q7s16, q12s16);
      q9s16 = vqaddq_s16(q9s16, q15s16);

      d6u8 = vqrshrun_n_s16(q6s16, 7);
      d7u8 = vqrshrun_n_s16(q7s16, 7);
      d8u8 = vqrshrun_n_s16(q8s16, 7);
      d9u8 = vqrshrun_n_s16(q9s16, 7);

      q3u8 = vcombine_u8(d6u8, d7u8);
      q4u8 = vcombine_u8(d8u8, d9u8);
      vst1q_u8(dst, q3u8);
      dst += dst_pitch;
      vst1q_u8(dst, q4u8);
      dst += dst_pitch;
    }
    return;
  }

  src = src_ptr - 2 - src_pixels_per_line * 2;
  tmpp = tmp;
  for (i = 0; i < 7; ++i) {
    d6u8 = vld1_u8(src);
    d7u8 = vld1_u8(src + 8);
    d8u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d9u8 = vld1_u8(src);
    d10u8 = vld1_u8(src + 8);
    d11u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d12u8 = vld1_u8(src);
    d13u8 = vld1_u8(src + 8);
    d14u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;

    __builtin_prefetch(src);
    __builtin_prefetch(src + src_pixels_per_line);
    __builtin_prefetch(src + src_pixels_per_line * 2);

    q8u16 = vmull_u8(d6u8, d0u8);
    q9u16 = vmull_u8(d7u8, d0u8);
    q10u16 = vmull_u8(d9u8, d0u8);
    q11u16 = vmull_u8(d10u8, d0u8);
    q12u16 = vmull_u8(d12u8, d0u8);
    q13u16 = vmull_u8(d13u8, d0u8);

    d28u8 = vext_u8(d6u8, d7u8, 1);
    d29u8 = vext_u8(d9u8, d10u8, 1);
    d30u8 = vext_u8(d12u8, d13u8, 1);
    q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
    d28u8 = vext_u8(d7u8, d8u8, 1);
    d29u8 = vext_u8(d10u8, d11u8, 1);
    d30u8 = vext_u8(d13u8, d14u8, 1);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

    d28u8 = vext_u8(d6u8, d7u8, 4);
    d29u8 = vext_u8(d9u8, d10u8, 4);
    d30u8 = vext_u8(d12u8, d13u8, 4);
    q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
    d28u8 = vext_u8(d7u8, d8u8, 4);
    d29u8 = vext_u8(d10u8, d11u8, 4);
    d30u8 = vext_u8(d13u8, d14u8, 4);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

    d28u8 = vext_u8(d6u8, d7u8, 5);
    d29u8 = vext_u8(d9u8, d10u8, 5);
    d30u8 = vext_u8(d12u8, d13u8, 5);
    q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
    d28u8 = vext_u8(d7u8, d8u8, 5);
    d29u8 = vext_u8(d10u8, d11u8, 5);
    d30u8 = vext_u8(d13u8, d14u8, 5);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

    d28u8 = vext_u8(d6u8, d7u8, 2);
    d29u8 = vext_u8(d9u8, d10u8, 2);
    d30u8 = vext_u8(d12u8, d13u8, 2);
    q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
    d28u8 = vext_u8(d7u8, d8u8, 2);
    d29u8 = vext_u8(d10u8, d11u8, 2);
    d30u8 = vext_u8(d13u8, d14u8, 2);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

    d28u8 = vext_u8(d6u8, d7u8, 3);
    d29u8 = vext_u8(d9u8, d10u8, 3);
    d30u8 = vext_u8(d12u8, d13u8, 3);
    d15u8 = vext_u8(d7u8, d8u8, 3);
    d31u8 = vext_u8(d10u8, d11u8, 3);
    d6u8 = vext_u8(d13u8, d14u8, 3);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q12s16 = vqaddq_s16(q12s16, q6s16);

    q6u16 = vmull_u8(d15u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);
    q3u16 = vmull_u8(d6u8, d3u8);
    q3s16 = vreinterpretq_s16_u16(q3u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q13s16 = vreinterpretq_s16_u16(q13u16);
    q9s16 = vqaddq_s16(q9s16, q6s16);
    q11s16 = vqaddq_s16(q11s16, q7s16);
    q13s16 = vqaddq_s16(q13s16, q3s16);

    d6u8 = vqrshrun_n_s16(q8s16, 7);
    d7u8 = vqrshrun_n_s16(q9s16, 7);
    d8u8 = vqrshrun_n_s16(q10s16, 7);
    d9u8 = vqrshrun_n_s16(q11s16, 7);
    d10u8 = vqrshrun_n_s16(q12s16, 7);
    d11u8 = vqrshrun_n_s16(q13s16, 7);

    vst1_u8(tmpp, d6u8);
    tmpp += 8;
    vst1_u8(tmpp, d7u8);
    tmpp += 8;
    vst1_u8(tmpp, d8u8);
    tmpp += 8;
    vst1_u8(tmpp, d9u8);
    tmpp += 8;
    vst1_u8(tmpp, d10u8);
    tmpp += 8;
    vst1_u8(tmpp, d11u8);
    tmpp += 8;
  }

  // Second pass: 16x16
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  for (i = 0; i < 2; ++i) {
    dst = dst_ptr + 8 * i;
    tmpp = tmp + 8 * i;
    d18u8 = vld1_u8(tmpp);
    tmpp += 16;
    d19u8 = vld1_u8(tmpp);
    tmpp += 16;
    d20u8 = vld1_u8(tmpp);
    tmpp += 16;
    d21u8 = vld1_u8(tmpp);
    tmpp += 16;
    d22u8 = vld1_u8(tmpp);
    tmpp += 16;
    for (j = 0; j < 4; ++j) {
      d23u8 = vld1_u8(tmpp);
      tmpp += 16;
      d24u8 = vld1_u8(tmpp);
      tmpp += 16;
      d25u8 = vld1_u8(tmpp);
      tmpp += 16;
      d26u8 = vld1_u8(tmpp);
      tmpp += 16;

      q3u16 = vmull_u8(d18u8, d0u8);
      q4u16 = vmull_u8(d19u8, d0u8);
      q5u16 = vmull_u8(d20u8, d0u8);
      q6u16 = vmull_u8(d21u8, d0u8);

      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

      q7u16 = vmull_u8(d21u8, d3u8);
      q8u16 = vmull_u8(d22u8, d3u8);
      q9u16 = vmull_u8(d23u8, d3u8);
      q10u16 = vmull_u8(d24u8, d3u8);

      q3s16 = vreinterpretq_s16_u16(q3u16);
      q4s16 = vreinterpretq_s16_u16(q4u16);
      q5s16 = vreinterpretq_s16_u16(q5u16);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);

      q7s16 = vqaddq_s16(q7s16, q3s16);
      q8s16 = vqaddq_s16(q8s16, q4s16);
      q9s16 = vqaddq_s16(q9s16, q5s16);
      q10s16 = vqaddq_s16(q10s16, q6s16);

      d6u8 = vqrshrun_n_s16(q7s16, 7);
      d7u8 = vqrshrun_n_s16(q8s16, 7);
      d8u8 = vqrshrun_n_s16(q9s16, 7);
      d9u8 = vqrshrun_n_s16(q10s16, 7);

      d18u8 = d22u8;
      d19u8 = d23u8;
      d20u8 = d24u8;
      d21u8 = d25u8;
      d22u8 = d26u8;

      vst1_u8(dst, d6u8);
      dst += dst_pitch;
      vst1_u8(dst, d7u8);
      dst += dst_pitch;
      vst1_u8(dst, d8u8);
      dst += dst_pitch;
      vst1_u8(dst, d9u8);
      dst += dst_pitch;
    }
  }
  return;
}
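
// A minimal usage sketch, under the assumption that these functions back the
// vp8_sixtap_predict* hooks declared in vp8_rtcd.h and that xoffset/yoffset
// are the horizontal/vertical sub-pel positions in [0, 7] indexing the filter
// tables above. Callers typically handle the full-pel case with a plain copy
// instead of the filter; this helper is illustrative only and is not part of
// the file's real call graph.
static INLINE void sixtap_usage_sketch(unsigned char *ref, int ref_stride,
                                       int xoffset, int yoffset,
                                       unsigned char *dst, int dst_stride) {
  if (xoffset == 0 && yoffset == 0) {
    int r;
    for (r = 0; r < 8; ++r) {
      memcpy(dst + r * dst_stride, ref + r * ref_stride, 8);
    }
  } else {
    vp8_sixtap_predict8x8_neon(ref, ref_stride, xoffset, yoffset, dst,
                               dst_stride);
  }
}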