/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <string.h>
#include "./vpx_config.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_ports/mem.h"

static const int8_t vp8_sub_pel_filters[8][8] = {
  { 0, 0, 128, 0, 0, 0, 0, 0 },     /* note that 1/8 pel positions are */
  { 0, -6, 123, 12, -1, 0, 0, 0 },  /* just as per alpha -0.5 bicubic */
  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -9, 93, 50, -6, 0, 0, 0 },
  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
  { 0, -6, 50, 93, -9, 0, 0, 0 },
  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -1, 12, 123, -6, 0, 0, 0 },
};

// This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
// Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
// Elements 1 and 4 are either 0 or negative. The code accounts for this with
// multiply/accumulates which either add or subtract as needed. The other
// functions will be updated to use this table later.
// It is also expanded to 8 elements to allow loading into 64 bit neon
// registers.
static const uint8_t abs_filters[8][8] = {
  { 0, 0, 128, 0, 0, 0, 0, 0 },   { 0, 6, 123, 12, 1, 0, 0, 0 },
  { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
  { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
  { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
};
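
// For illustration only (added; not in the original source): a scalar sketch
// of what each NEON path below computes per output pixel, assuming VP8's
// usual rounding constant (+64) and shift (>> 7). The helper is hypothetical
// and unused; the vector code expresses the negative taps (elements 1 and 4)
// as vmlsl against abs_filters[] instead of signed multiplies. Note every
// filter row sums to 128 (e.g. 3 - 16 + 77 + 77 - 16 + 3), so flat regions
// pass through unchanged.
static INLINE unsigned char sixtap_single_pixel(const unsigned char *src,
                                                const int8_t *filter) {
  int sum = 0, i;
  // src points at the first tap, i.e. 2 pixels of context before the output
  // position.
  for (i = 0; i < 6; ++i) sum += src[i] * filter[i];
  sum = (sum + 64) >> 7;     // Round, then drop the filter weight of 128.
  if (sum < 0) sum = 0;      // Clamp to the 8 bit pixel range, matching
  if (sum > 255) sum = 255;  // what vqrshrun_n_s16(x, 7) does below.
  return (unsigned char)sum;
}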

static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
}
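
// Illustration (added): if memory holds bytes a0 a1 a2 a3 a4 a5 a6 a7, the
// left shift by 32 within the little-endian 64 bit lane produces
//   0 0 0 0 a0 a1 a2 a3
// so the 4 useful bytes of an even row land in the high half, where
// vext_u8(even, odd, 4) can splice the next row's first 4 bytes in after
// them.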

static INLINE void filter_add_accumulate(const uint8x16_t a,
                                         const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
  const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
                                       vreinterpret_u32_u8(vget_high_u8(a)));
  const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
                                       vreinterpret_u32_u8(vget_high_u8(b)));
  *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
  *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
}

static INLINE void filter_sub_accumulate(const uint8x16_t a,
                                         const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
  const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
                                       vreinterpret_u32_u8(vget_high_u8(a)));
  const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
                                       vreinterpret_u32_u8(vget_high_u8(b)));
  *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
  *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
}
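
// Added note on the two helpers above: vzip_u32 interleaves the u32 halves
// of each q register, so .val[0] packs the first 4 bytes of each 8 byte row
// into one d register (rows 0 and 1 for 'a', rows 2 and 3 for 'b'). A single
// vmlal/vmlsl therefore applies one tap to 8 output pixels: two 4-wide rows.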

static INLINE void yonly4x4(const unsigned char *src, int src_stride,
                            int filter_offset, unsigned char *dst,
                            int dst_stride) {
  uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
  uint16x8_t c0, c1, c2, c3;
  int16x8_t d0, d1;
  uint8x8_t e0, e1;

  const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
  const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
  const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
  const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
  const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
  const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
  const uint8x8_t filter5 = vdup_lane_u8(filter, 5);

  src -= src_stride * 2;
  // Shift the even rows to allow using 'vext' to combine the vectors. armv8
  // has vcopy_lane which would be interesting. This started as just a
  // horrible workaround for clang adding alignment hints to 32bit loads:
  // https://llvm.org/bugs/show_bug.cgi?id=24421
  // But it turns out it is almost identical to casting the loads.
  a0 = load_and_shift(src);
  src += src_stride;
  a1 = vld1_u8(src);
  src += src_stride;
  a2 = load_and_shift(src);
  src += src_stride;
  a3 = vld1_u8(src);
  src += src_stride;
  a4 = load_and_shift(src);
  src += src_stride;
  a5 = vld1_u8(src);
  src += src_stride;
  a6 = load_and_shift(src);
  src += src_stride;
  a7 = vld1_u8(src);
  src += src_stride;
  a8 = vld1_u8(src);

  // Combine the rows so we can operate on 8 at a time.
  b0 = vext_u8(a0, a1, 4);
  b2 = vext_u8(a2, a3, 4);
  b4 = vext_u8(a4, a5, 4);
  b6 = vext_u8(a6, a7, 4);
  b8 = a8;

  // To keep with the 8-at-a-time theme, combine *alternate* rows. This
  // allows combining the odd rows with the even.
  b1 = vext_u8(b0, b2, 4);
  b3 = vext_u8(b2, b4, 4);
  b5 = vext_u8(b4, b6, 4);
  b7 = vext_u8(b6, b8, 4);

  // Multiply and expand to 16 bits.
  c0 = vmull_u8(b0, filter0);
  c1 = vmull_u8(b2, filter0);
  c2 = vmull_u8(b5, filter5);
  c3 = vmull_u8(b7, filter5);

  // Multiply, subtract and accumulate for filters 1 and 4 (the negative
  // ones).
  c0 = vmlsl_u8(c0, b4, filter4);
  c1 = vmlsl_u8(c1, b6, filter4);
  c2 = vmlsl_u8(c2, b1, filter1);
  c3 = vmlsl_u8(c3, b3, filter1);

  // Add more positive ones. vmlal should really return a signed type.
  // It's doing signed math internally, as evidenced by the fact we can do
  // subtractions followed by more additions. Ideally we could use
  // vqmlal/sl but that instruction doesn't exist. Might be able to
  // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
  c0 = vmlal_u8(c0, b2, filter2);
  c1 = vmlal_u8(c1, b4, filter2);
  c2 = vmlal_u8(c2, b3, filter3);
  c3 = vmlal_u8(c3, b5, filter3);

  // Use signed saturation math because vmlsl may have left some negative
  // numbers in there.
  d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
  d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));

  // Use signed again because numbers like -200 need to be saturated to 0.
  e0 = vqrshrun_n_s16(d0, 7);
  e1 = vqrshrun_n_s16(d1, 7);

  store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
}
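
// Added commentary on yonly4x4() (the same scheme is reused for the second
// pass of vp8_sixtap_predict4x4_neon below): after the vext splices, each d
// register holds two stacked 4-wide rows,
//   b0 = rows 0,1   b1 = rows 1,2   ...   b7 = rows 7,8
// so the filter[k] tap for the row pair starting at row r always reads
// packed register b(r+k). c0/c2 gather the taps for output rows 0,1 and
// c1/c3 for rows 2,3, split across two accumulators so the final combine
// can saturate.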

void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr,
                                int src_pixels_per_line, int xoffset,
                                int yoffset, unsigned char *dst_ptr,
                                int dst_pitch) {
  uint8x16_t s0, s1, s2, s3, s4;
  uint64x2_t s01, s23;
  // Variables to hold src[] elements for the given filter[]
  uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
  uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
  uint8x16_t s01_f0, s23_f0;
  uint64x2_t s01_f3, s23_f3;
  uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
  // Accumulator variables.
  uint16x8_t d0123, d4567, d89;
  uint16x8_t d0123_a, d4567_a, d89_a;
  int16x8_t e0123, e4567, e89;
  // Second pass intermediates.
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
  uint16x8_t c0, c1, c2, c3;
  int16x8_t d0, d1;
  uint8x8_t e0, e1;
  uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;

  if (xoffset == 0) {  // Second pass only.
    yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
    return;
  }

  if (yoffset == 0) {  // First pass only.
    src_ptr -= 2;
  } else {  // Add context for the second pass. 2 extra lines on top.
    src_ptr -= 2 + (src_pixels_per_line * 2);
  }

  filter = vld1_u8(abs_filters[xoffset]);
  filter0 = vdup_lane_u8(filter, 0);
  filter1 = vdup_lane_u8(filter, 1);
  filter2 = vdup_lane_u8(filter, 2);
  filter3 = vdup_lane_u8(filter, 3);
  filter4 = vdup_lane_u8(filter, 4);
  filter5 = vdup_lane_u8(filter, 5);

  // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
  // garbage. So much effort for that last single bit.
  // The low values of each pair are for filter0.
  s0 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s1 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s2 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s3 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;

  // Shift to extract values for filter[5].
  // With loaded byte b holding pixel b - 2, this puts pixels
  //  3 4 5 6 7 8 9 10 in s0_f5.
  // Can't use vshr.u64 because it crosses the double word boundary.
  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);

  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));

  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);

  // Keep original src data as 64 bits to simplify shifting and extracting.
  s01 = vreinterpretq_u64_u8(s01_f0);
  s23 = vreinterpretq_u64_u8(s23_f0);

  // -2 -1 0 1 * filter0
  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);

  // Shift over one to use -1, 0, 1, 2 for filter1
  // -1 0 1 2 * filter1
  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
                        &d0123, &d4567);

  // 2 3 4 5 * filter4
  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
                        &d0123, &d4567);

  // 0 1 2 3 * filter2
  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
                        &d0123, &d4567);

  // 1 2 3 4 * filter3
  s01_f3 = vshrq_n_u64(s01, 24);
  s23_f3 = vshrq_n_u64(s23, 24);
  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
  // Accumulate into different registers so it can use saturated addition.
  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);

  e0123 =
      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
  e4567 =
      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));

  // Shift and narrow.
  b0 = vqrshrun_n_s16(e0123, 7);
  b2 = vqrshrun_n_s16(e4567, 7);

  if (yoffset == 0) {  // firstpass_filter4x4_only
    store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
    return;
  }
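
  // Added commentary: with loaded byte b holding pixel b - 2, the u64 shifts
  // above slide the 4 pixel window along one pixel at a time:
  //   no shift -> pixels -2..1 * filter0    >> 8   -> pixels -1..2 * filter1
  //   >> 16    -> pixels  0..3 * filter2    >> 24  -> pixels  1..4 * filter3
  //   >> 32    -> pixels  2..5 * filter4    vext 5 -> pixels  3..6 * filter5
  // The same windows are applied again to the extra rows loaded below.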

  // Load additional context when doing both filters.
  s0 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s1 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s2 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s3 = vld1q_u8(src_ptr);
  src_ptr += src_pixels_per_line;
  s4 = vld1q_u8(src_ptr);

  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
  s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);

  // -2 -1 0 1 * filter0
  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));

  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
  // But this time instead of 16 pixels to filter, there are 20. So an extra
  // run with a doubleword register.
  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
  d89 = vmull_u8(s4_f5, filter5);

  // Save a copy as u64 for shifting.
  s01 = vreinterpretq_u64_u8(s01_f0);
  s23 = vreinterpretq_u64_u8(s23_f0);

  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
  d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);

  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
                        &d0123, &d4567);
  s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
  d89 = vmlsl_u8(d89, s4_f1, filter1);

  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
                        &d0123, &d4567);
  s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
  d89 = vmlsl_u8(d89, s4_f4, filter4);

  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
                        &d0123, &d4567);
  s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
  d89 = vmlal_u8(d89, s4_f2, filter2);

  s01_f3 = vshrq_n_u64(s01, 24);
  s23_f3 = vshrq_n_u64(s23, 24);
  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
  s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
  d89_a = vmull_u8(s4_f3, filter3);

  e0123 =
      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
  e4567 =
      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
  e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));

  b4 = vqrshrun_n_s16(e0123, 7);
  b6 = vqrshrun_n_s16(e4567, 7);
  b8 = vqrshrun_n_s16(e89, 7);

  // Second pass: 4x4
  filter = vld1_u8(abs_filters[yoffset]);
  filter0 = vdup_lane_u8(filter, 0);
  filter1 = vdup_lane_u8(filter, 1);
  filter2 = vdup_lane_u8(filter, 2);
  filter3 = vdup_lane_u8(filter, 3);
  filter4 = vdup_lane_u8(filter, 4);
  filter5 = vdup_lane_u8(filter, 5);

  b1 = vext_u8(b0, b2, 4);
  b3 = vext_u8(b2, b4, 4);
  b5 = vext_u8(b4, b6, 4);
  b7 = vext_u8(b6, b8, 4);

  c0 = vmull_u8(b0, filter0);
  c1 = vmull_u8(b2, filter0);
  c2 = vmull_u8(b5, filter5);
  c3 = vmull_u8(b7, filter5);

  c0 = vmlsl_u8(c0, b4, filter4);
  c1 = vmlsl_u8(c1, b6, filter4);
  c2 = vmlsl_u8(c2, b1, filter1);
  c3 = vmlsl_u8(c3, b3, filter1);

  c0 = vmlal_u8(c0, b2, filter2);
  c1 = vmlal_u8(c1, b4, filter2);
  c2 = vmlal_u8(c2, b3, filter3);
  c3 = vmlal_u8(c3, b5, filter3);

  d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
  d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));

  e0 = vqrshrun_n_s16(d0, 7);
  e1 = vqrshrun_n_s16(d1, 7);

  store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
}
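
// Hypothetical usage sketch (added; not part of this file): callers such as
// vp8/common/reconinter.c derive the filter phases from the 1/8-pel motion
// vector components, roughly:
//   unsigned char *ptr = ref + (mv_row >> 3) * stride + (mv_col >> 3);
//   vp8_sixtap_predict4x4_neon(ptr, stride, mv_col & 7, mv_row & 7, dst,
//                              dst_pitch);
// An offset of 0 in either direction skips that pass entirely.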

void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr,
                                int src_pixels_per_line, int xoffset,
                                int yoffset, unsigned char *dst_ptr,
                                int dst_pitch) {
  unsigned char *src;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
  uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;

  if (xoffset == 0) {  // secondpass_filter8x4_only
    // load second_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // load src data
    src = src_ptr - src_pixels_per_line * 2;
    d22u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d23u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d24u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d25u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d26u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d27u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d28u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d29u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d30u8 = vld1_u8(src);

    q3u16 = vmull_u8(d22u8, d0u8);
    q4u16 = vmull_u8(d23u8, d0u8);
    q5u16 = vmull_u8(d24u8, d0u8);
    q6u16 = vmull_u8(d25u8, d0u8);

    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

    q7u16 = vmull_u8(d25u8, d3u8);
    q8u16 = vmull_u8(d26u8, d3u8);
    q9u16 = vmull_u8(d27u8, d3u8);
    q10u16 = vmull_u8(d28u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    return;
  }
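
  // Added note: unlike the 4x4 path, an 8-wide row does not fit in one 64
  // bit lane, so the first pass below builds each tap's window with vext_u8
  // on 16 byte loads instead of u64 shifts. It also subtracts taps 1 and 4
  // first, adds taps 2 and 5, and keeps the tap 3 product in a separate
  // accumulator so the final combine can use saturating vqaddq_s16.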

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (9x8)
  if (yoffset == 0)  // firstpass_filter8x4_only
    src = src_ptr - 2;
  else
    src = src_ptr - 2 - (src_pixels_per_line * 2);
  q3u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q4u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q5u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q6u8 = vld1q_u8(src);

  q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);

  q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
  q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
  q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
  q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);

  q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
  q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
  q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
  q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);

  q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
  q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
  q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
  q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

  q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
  q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
  q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
  q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);

  q3u16 = vmull_u8(d28u8, d3u8);
  q4u16 = vmull_u8(d29u8, d3u8);
  q5u16 = vmull_u8(d30u8, d3u8);
  q6u16 = vmull_u8(d31u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);

  q7s16 = vqaddq_s16(q7s16, q3s16);
  q8s16 = vqaddq_s16(q8s16, q4s16);
  q9s16 = vqaddq_s16(q9s16, q5s16);
  q10s16 = vqaddq_s16(q10s16, q6s16);

  d22u8 = vqrshrun_n_s16(q7s16, 7);
  d23u8 = vqrshrun_n_s16(q8s16, 7);
  d24u8 = vqrshrun_n_s16(q9s16, 7);
  d25u8 = vqrshrun_n_s16(q10s16, 7);

  if (yoffset == 0) {  // firstpass_filter8x4_only
    vst1_u8(dst_ptr, d22u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d23u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d24u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d25u8);
    return;
  }
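
  // Added note: the 4 filtered rows just produced stay live in d22-d25, and
  // the straight-line code below filters the 5 extra context rows into
  // d26-d30, so the vertical pass gets all 9 of its input rows from
  // registers without a scratch buffer.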

  // First pass on the remaining 5 lines of data.
  src += src_pixels_per_line;
  q3u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q4u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q5u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q6u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q7u8 = vld1q_u8(src);

  q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);

  q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);

  q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);

  q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);

  q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);

  q3u16 = vmull_u8(d27u8, d3u8);
  q4u16 = vmull_u8(d28u8, d3u8);
  q5u16 = vmull_u8(d29u8, d3u8);
  q6u16 = vmull_u8(d30u8, d3u8);
  q7u16 = vmull_u8(d31u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);
  q11s16 = vreinterpretq_s16_u16(q11u16);
  q12s16 = vreinterpretq_s16_u16(q12u16);

  q8s16 = vqaddq_s16(q8s16, q3s16);
  q9s16 = vqaddq_s16(q9s16, q4s16);
  q10s16 = vqaddq_s16(q10s16, q5s16);
  q11s16 = vqaddq_s16(q11s16, q6s16);
  q12s16 = vqaddq_s16(q12s16, q7s16);

  d26u8 = vqrshrun_n_s16(q8s16, 7);
  d27u8 = vqrshrun_n_s16(q9s16, 7);
  d28u8 = vqrshrun_n_s16(q10s16, 7);
  d29u8 = vqrshrun_n_s16(q11s16, 7);
  d30u8 = vqrshrun_n_s16(q12s16, 7);

  // Second pass: 8x4
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  q3u16 = vmull_u8(d22u8, d0u8);
  q4u16 = vmull_u8(d23u8, d0u8);
  q5u16 = vmull_u8(d24u8, d0u8);
  q6u16 = vmull_u8(d25u8, d0u8);

  q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
  q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
  q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
  q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

  q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
  q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
  q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
  q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

  q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
  q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
  q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
  q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

  q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
  q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
  q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
  q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

  q7u16 = vmull_u8(d25u8, d3u8);
  q8u16 = vmull_u8(d26u8, d3u8);
  q9u16 = vmull_u8(d27u8, d3u8);
  q10u16 = vmull_u8(d28u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);

  q7s16 = vqaddq_s16(q7s16, q3s16);
  q8s16 = vqaddq_s16(q8s16, q4s16);
  q9s16 = vqaddq_s16(q9s16, q5s16);
  q10s16 = vqaddq_s16(q10s16, q6s16);

  d6u8 = vqrshrun_n_s16(q7s16, 7);
  d7u8 = vqrshrun_n_s16(q8s16, 7);
  d8u8 = vqrshrun_n_s16(q9s16, 7);
  d9u8 = vqrshrun_n_s16(q10s16, 7);

  vst1_u8(dst_ptr, d6u8);
  dst_ptr += dst_pitch;
  vst1_u8(dst_ptr, d7u8);
  dst_ptr += dst_pitch;
  vst1_u8(dst_ptr, d8u8);
  dst_ptr += dst_pitch;
  vst1_u8(dst_ptr, d9u8);
  return;
}

void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr,
                                int src_pixels_per_line, int xoffset,
                                int yoffset, unsigned char *dst_ptr,
                                int dst_pitch) {
  unsigned char *src, *tmpp;
  unsigned char tmp[64];
  int i;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
  uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;

  if (xoffset == 0) {  // secondpass_filter8x8_only
    // load second_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // load src data
    src = src_ptr - src_pixels_per_line * 2;
    d18u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d19u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d20u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d21u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d22u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d23u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d24u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d25u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d26u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d27u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d28u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d29u8 = vld1_u8(src);
    src += src_pixels_per_line;
    d30u8 = vld1_u8(src);

    for (i = 2; i > 0; i--) {
      q3u16 = vmull_u8(d18u8, d0u8);
      q4u16 = vmull_u8(d19u8, d0u8);
      q5u16 = vmull_u8(d20u8, d0u8);
      q6u16 = vmull_u8(d21u8, d0u8);

      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

      q7u16 = vmull_u8(d21u8, d3u8);
      q8u16 = vmull_u8(d22u8, d3u8);
      q9u16 = vmull_u8(d23u8, d3u8);
      q10u16 = vmull_u8(d24u8, d3u8);

      q3s16 = vreinterpretq_s16_u16(q3u16);
      q4s16 = vreinterpretq_s16_u16(q4u16);
      q5s16 = vreinterpretq_s16_u16(q5u16);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);

      q7s16 = vqaddq_s16(q7s16, q3s16);
      q8s16 = vqaddq_s16(q8s16, q4s16);
      q9s16 = vqaddq_s16(q9s16, q5s16);
      q10s16 = vqaddq_s16(q10s16, q6s16);

      d6u8 = vqrshrun_n_s16(q7s16, 7);
      d7u8 = vqrshrun_n_s16(q8s16, 7);
      d8u8 = vqrshrun_n_s16(q9s16, 7);
      d9u8 = vqrshrun_n_s16(q10s16, 7);

      d18u8 = d22u8;
      d19u8 = d23u8;
      d20u8 = d24u8;
      d21u8 = d25u8;
      d22u8 = d26u8;
      d23u8 = d27u8;
      d24u8 = d28u8;
      d25u8 = d29u8;
      d26u8 = d30u8;

      vst1_u8(dst_ptr, d6u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d7u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d8u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d9u8);
      dst_ptr += dst_pitch;
    }
    return;
  }

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (13x8)
  if (yoffset == 0)  // firstpass_filter8x8_only
    src = src_ptr - 2;
  else
    src = src_ptr - 2 - (src_pixels_per_line * 2);

  tmpp = tmp;
  for (i = 2; i > 0; i--) {
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;

    __builtin_prefetch(src);
    __builtin_prefetch(src + src_pixels_per_line);
    __builtin_prefetch(src + src_pixels_per_line * 2);

    q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);

    q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);

    q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);

    q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);

    q3u16 = vmull_u8(d28u8, d3u8);
    q4u16 = vmull_u8(d29u8, d3u8);
    q5u16 = vmull_u8(d30u8, d3u8);
    q6u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d22u8 = vqrshrun_n_s16(q7s16, 7);
    d23u8 = vqrshrun_n_s16(q8s16, 7);
    d24u8 = vqrshrun_n_s16(q9s16, 7);
    d25u8 = vqrshrun_n_s16(q10s16, 7);

    if (yoffset == 0) {  // firstpass_filter8x8_only
      vst1_u8(dst_ptr, d22u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d23u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d24u8);
      dst_ptr += dst_pitch;
      vst1_u8(dst_ptr, d25u8);
      dst_ptr += dst_pitch;
    } else {
      vst1_u8(tmpp, d22u8);
      tmpp += 8;
      vst1_u8(tmpp, d23u8);
      tmpp += 8;
      vst1_u8(tmpp, d24u8);
      tmpp += 8;
      vst1_u8(tmpp, d25u8);
      tmpp += 8;
    }
  }
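
  // Added note (two-pass case): tmp[64] now holds the first 8 horizontally
  // filtered rows, 8 bytes each. The 5 remaining context rows are filtered
  // next and kept in d26-d30, so the second pass reads 13 rows in total:
  // 8 from tmp and 5 straight from registers.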

  if (yoffset == 0) return;

  // First pass on the remaining 5 lines of data.
  q3u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q4u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q5u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q6u8 = vld1q_u8(src);
  src += src_pixels_per_line;
  q7u8 = vld1q_u8(src);

  q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);

  q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);

  q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
  q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
  q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
  q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
  q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);

  q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);

  q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
  q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
  q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
  q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
  q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);

  q3u16 = vmull_u8(d27u8, d3u8);
  q4u16 = vmull_u8(d28u8, d3u8);
  q5u16 = vmull_u8(d29u8, d3u8);
  q6u16 = vmull_u8(d30u8, d3u8);
  q7u16 = vmull_u8(d31u8, d3u8);

  q3s16 = vreinterpretq_s16_u16(q3u16);
  q4s16 = vreinterpretq_s16_u16(q4u16);
  q5s16 = vreinterpretq_s16_u16(q5u16);
  q6s16 = vreinterpretq_s16_u16(q6u16);
  q7s16 = vreinterpretq_s16_u16(q7u16);
  q8s16 = vreinterpretq_s16_u16(q8u16);
  q9s16 = vreinterpretq_s16_u16(q9u16);
  q10s16 = vreinterpretq_s16_u16(q10u16);
  q11s16 = vreinterpretq_s16_u16(q11u16);
  q12s16 = vreinterpretq_s16_u16(q12u16);

  q8s16 = vqaddq_s16(q8s16, q3s16);
  q9s16 = vqaddq_s16(q9s16, q4s16);
  q10s16 = vqaddq_s16(q10s16, q5s16);
  q11s16 = vqaddq_s16(q11s16, q6s16);
  q12s16 = vqaddq_s16(q12s16, q7s16);

  d26u8 = vqrshrun_n_s16(q8s16, 7);
  d27u8 = vqrshrun_n_s16(q9s16, 7);
  d28u8 = vqrshrun_n_s16(q10s16, 7);
  d29u8 = vqrshrun_n_s16(q11s16, 7);
  d30u8 = vqrshrun_n_s16(q12s16, 7);

  // Second pass: 8x8
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  tmpp = tmp;
  q9u8 = vld1q_u8(tmpp);
  tmpp += 16;
  q10u8 = vld1q_u8(tmpp);
  tmpp += 16;
  q11u8 = vld1q_u8(tmpp);
  tmpp += 16;
  q12u8 = vld1q_u8(tmpp);

  d18u8 = vget_low_u8(q9u8);
  d19u8 = vget_high_u8(q9u8);
  d20u8 = vget_low_u8(q10u8);
  d21u8 = vget_high_u8(q10u8);
  d22u8 = vget_low_u8(q11u8);
  d23u8 = vget_high_u8(q11u8);
  d24u8 = vget_low_u8(q12u8);
  d25u8 = vget_high_u8(q12u8);

  for (i = 2; i > 0; i--) {
    q3u16 = vmull_u8(d18u8, d0u8);
    q4u16 = vmull_u8(d19u8, d0u8);
    q5u16 = vmull_u8(d20u8, d0u8);
    q6u16 = vmull_u8(d21u8, d0u8);

    q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

    q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

    q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

    q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

    q7u16 = vmull_u8(d21u8, d3u8);
    q8u16 = vmull_u8(d22u8, d3u8);
    q9u16 = vmull_u8(d23u8, d3u8);
    q10u16 = vmull_u8(d24u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    d18u8 = d22u8;
    d19u8 = d23u8;
    d20u8 = d24u8;
    d21u8 = d25u8;
    d22u8 = d26u8;
    d23u8 = d27u8;
    d24u8 = d28u8;
    d25u8 = d29u8;
    d26u8 = d30u8;

    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    dst_ptr += dst_pitch;
  }
  return;
}

void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
                                  int src_pixels_per_line, int xoffset,
                                  int yoffset, unsigned char *dst_ptr,
                                  int dst_pitch) {
  unsigned char *src, *src_tmp, *dst, *tmpp;
  unsigned char tmp[336];
  int i, j;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
  uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
  uint8x8_t d28u8, d29u8, d30u8, d31u8;
  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  uint8x16_t q3u8, q4u8;
  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
  uint16x8_t q11u16, q12u16, q13u16, q15u16;
  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
  int16x8_t q11s16, q12s16, q13s16, q15s16;

  if (xoffset == 0) {  // secondpass_filter16x16_only
    // load second_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // load src data
    src_tmp = src_ptr - src_pixels_per_line * 2;
    for (i = 0; i < 2; ++i) {
      src = src_tmp + i * 8;
      dst = dst_ptr + i * 8;
      d18u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d19u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d20u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d21u8 = vld1_u8(src);
      src += src_pixels_per_line;
      d22u8 = vld1_u8(src);
      src += src_pixels_per_line;
      for (j = 0; j < 4; ++j) {
        d23u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d24u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d25u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d26u8 = vld1_u8(src);
        src += src_pixels_per_line;

        q3u16 = vmull_u8(d18u8, d0u8);
        q4u16 = vmull_u8(d19u8, d0u8);
        q5u16 = vmull_u8(d20u8, d0u8);
        q6u16 = vmull_u8(d21u8, d0u8);

        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

        q7u16 = vmull_u8(d21u8, d3u8);
        q8u16 = vmull_u8(d22u8, d3u8);
        q9u16 = vmull_u8(d23u8, d3u8);
        q10u16 = vmull_u8(d24u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);

        d18u8 = d22u8;
        d19u8 = d23u8;
        d20u8 = d24u8;
        d21u8 = d25u8;
        d22u8 = d26u8;

        vst1_u8(dst, d6u8);
        dst += dst_pitch;
        vst1_u8(dst, d7u8);
        dst += dst_pitch;
        vst1_u8(dst, d8u8);
        dst += dst_pitch;
        vst1_u8(dst, d9u8);
        dst += dst_pitch;
      }
    }
    return;
  }
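
  // Added note: with both passes active, the first pass below filters
  // 16 + 5 = 21 input rows into 21 rows of 16 filtered pixels in tmp
  // (21 * 16 = 336 bytes); the second pass then consumes tmp in two
  // 8 column halves.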

  // load first_pass filter
  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  // First pass: output_height lines x output_width columns (21x16)
  if (yoffset == 0) {  // firstpass_filter16x16_only
    src = src_ptr - 2;
    dst = dst_ptr;
    for (i = 0; i < 8; ++i) {
      d6u8 = vld1_u8(src);
      d7u8 = vld1_u8(src + 8);
      d8u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;
      d9u8 = vld1_u8(src);
      d10u8 = vld1_u8(src + 8);
      d11u8 = vld1_u8(src + 16);
      src += src_pixels_per_line;

      __builtin_prefetch(src);
      __builtin_prefetch(src + src_pixels_per_line);

      q6u16 = vmull_u8(d6u8, d0u8);
      q7u16 = vmull_u8(d7u8, d0u8);
      q8u16 = vmull_u8(d9u8, d0u8);
      q9u16 = vmull_u8(d10u8, d0u8);

      d20u8 = vext_u8(d6u8, d7u8, 1);
      d21u8 = vext_u8(d9u8, d10u8, 1);
      d22u8 = vext_u8(d7u8, d8u8, 1);
      d23u8 = vext_u8(d10u8, d11u8, 1);
      d24u8 = vext_u8(d6u8, d7u8, 4);
      d25u8 = vext_u8(d9u8, d10u8, 4);
      d26u8 = vext_u8(d7u8, d8u8, 4);
      d27u8 = vext_u8(d10u8, d11u8, 4);
      d28u8 = vext_u8(d6u8, d7u8, 5);
      d29u8 = vext_u8(d9u8, d10u8, 5);

      q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
      q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
      q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
      q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
      q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
      q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
      q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
      q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
      q8u16 = vmlal_u8(q8u16, d29u8, d5u8);

      d20u8 = vext_u8(d7u8, d8u8, 5);
      d21u8 = vext_u8(d10u8, d11u8, 5);
      d22u8 = vext_u8(d6u8, d7u8, 2);
      d23u8 = vext_u8(d9u8, d10u8, 2);
      d24u8 = vext_u8(d7u8, d8u8, 2);
      d25u8 = vext_u8(d10u8, d11u8, 2);
      d26u8 = vext_u8(d6u8, d7u8, 3);
      d27u8 = vext_u8(d9u8, d10u8, 3);
      d28u8 = vext_u8(d7u8, d8u8, 3);
      d29u8 = vext_u8(d10u8, d11u8, 3);

      q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
      q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
      q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
      q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
      q9u16 = vmlal_u8(q9u16, d25u8, d2u8);

      q10u16 = vmull_u8(d26u8, d3u8);
      q11u16 = vmull_u8(d27u8, d3u8);
      q12u16 = vmull_u8(d28u8, d3u8);
      q15u16 = vmull_u8(d29u8, d3u8);

      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);
      q11s16 = vreinterpretq_s16_u16(q11u16);
      q12s16 = vreinterpretq_s16_u16(q12u16);
      q15s16 = vreinterpretq_s16_u16(q15u16);

      q6s16 = vqaddq_s16(q6s16, q10s16);
      q8s16 = vqaddq_s16(q8s16, q11s16);
      q7s16 = vqaddq_s16(q7s16, q12s16);
      q9s16 = vqaddq_s16(q9s16, q15s16);

      d6u8 = vqrshrun_n_s16(q6s16, 7);
      d7u8 = vqrshrun_n_s16(q7s16, 7);
      d8u8 = vqrshrun_n_s16(q8s16, 7);
      d9u8 = vqrshrun_n_s16(q9s16, 7);

      q3u8 = vcombine_u8(d6u8, d7u8);
      q4u8 = vcombine_u8(d8u8, d9u8);
      vst1q_u8(dst, q3u8);
      dst += dst_pitch;
      vst1q_u8(dst, q4u8);
      dst += dst_pitch;
    }
    return;
  }
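
  // Added note: each iteration of the loop below loads 3 source rows,
  // filters them into 3 rows of 16 output pixels (six d registers), and
  // appends 48 bytes to tmp; 7 iterations produce the 21 rows noted above.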

  src = src_ptr - 2 - src_pixels_per_line * 2;
  tmpp = tmp;
  for (i = 0; i < 7; ++i) {
    d6u8 = vld1_u8(src);
    d7u8 = vld1_u8(src + 8);
    d8u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d9u8 = vld1_u8(src);
    d10u8 = vld1_u8(src + 8);
    d11u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;
    d12u8 = vld1_u8(src);
    d13u8 = vld1_u8(src + 8);
    d14u8 = vld1_u8(src + 16);
    src += src_pixels_per_line;

    __builtin_prefetch(src);
    __builtin_prefetch(src + src_pixels_per_line);
    __builtin_prefetch(src + src_pixels_per_line * 2);

    q8u16 = vmull_u8(d6u8, d0u8);
    q9u16 = vmull_u8(d7u8, d0u8);
    q10u16 = vmull_u8(d9u8, d0u8);
    q11u16 = vmull_u8(d10u8, d0u8);
    q12u16 = vmull_u8(d12u8, d0u8);
    q13u16 = vmull_u8(d13u8, d0u8);

    d28u8 = vext_u8(d6u8, d7u8, 1);
    d29u8 = vext_u8(d9u8, d10u8, 1);
    d30u8 = vext_u8(d12u8, d13u8, 1);
    q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
    d28u8 = vext_u8(d7u8, d8u8, 1);
    d29u8 = vext_u8(d10u8, d11u8, 1);
    d30u8 = vext_u8(d13u8, d14u8, 1);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);

    d28u8 = vext_u8(d6u8, d7u8, 4);
    d29u8 = vext_u8(d9u8, d10u8, 4);
    d30u8 = vext_u8(d12u8, d13u8, 4);
    q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
    d28u8 = vext_u8(d7u8, d8u8, 4);
    d29u8 = vext_u8(d10u8, d11u8, 4);
    d30u8 = vext_u8(d13u8, d14u8, 4);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
    q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);

    d28u8 = vext_u8(d6u8, d7u8, 5);
    d29u8 = vext_u8(d9u8, d10u8, 5);
    d30u8 = vext_u8(d12u8, d13u8, 5);
    q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
    d28u8 = vext_u8(d7u8, d8u8, 5);
    d29u8 = vext_u8(d10u8, d11u8, 5);
    d30u8 = vext_u8(d13u8, d14u8, 5);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d5u8);

    d28u8 = vext_u8(d6u8, d7u8, 2);
    d29u8 = vext_u8(d9u8, d10u8, 2);
    d30u8 = vext_u8(d12u8, d13u8, 2);
    q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
    d28u8 = vext_u8(d7u8, d8u8, 2);
    d29u8 = vext_u8(d10u8, d11u8, 2);
    d30u8 = vext_u8(d13u8, d14u8, 2);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
    q13u16 = vmlal_u8(q13u16, d30u8, d2u8);

    d28u8 = vext_u8(d6u8, d7u8, 3);
    d29u8 = vext_u8(d9u8, d10u8, 3);
    d30u8 = vext_u8(d12u8, d13u8, 3);
    d15u8 = vext_u8(d7u8, d8u8, 3);
    d31u8 = vext_u8(d10u8, d11u8, 3);
    d6u8 = vext_u8(d13u8, d14u8, 3);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q12s16 = vqaddq_s16(q12s16, q6s16);

    q6u16 = vmull_u8(d15u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);
    q3u16 = vmull_u8(d6u8, d3u8);
    q3s16 = vreinterpretq_s16_u16(q3u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q13s16 = vreinterpretq_s16_u16(q13u16);
    q9s16 = vqaddq_s16(q9s16, q6s16);
    q11s16 = vqaddq_s16(q11s16, q7s16);
    q13s16 = vqaddq_s16(q13s16, q3s16);

    d6u8 = vqrshrun_n_s16(q8s16, 7);
    d7u8 = vqrshrun_n_s16(q9s16, 7);
    d8u8 = vqrshrun_n_s16(q10s16, 7);
    d9u8 = vqrshrun_n_s16(q11s16, 7);
    d10u8 = vqrshrun_n_s16(q12s16, 7);
    d11u8 = vqrshrun_n_s16(q13s16, 7);

    vst1_u8(tmpp, d6u8);
    tmpp += 8;
    vst1_u8(tmpp, d7u8);
    tmpp += 8;
    vst1_u8(tmpp, d8u8);
    tmpp += 8;
    vst1_u8(tmpp, d9u8);
    tmpp += 8;
    vst1_u8(tmpp, d10u8);
    tmpp += 8;
    vst1_u8(tmpp, d11u8);
    tmpp += 8;
  }

  // Second pass: 16x16
  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  d0s8 = vdup_lane_s8(dtmps8, 0);
  d1s8 = vdup_lane_s8(dtmps8, 1);
  d2s8 = vdup_lane_s8(dtmps8, 2);
  d3s8 = vdup_lane_s8(dtmps8, 3);
  d4s8 = vdup_lane_s8(dtmps8, 4);
  d5s8 = vdup_lane_s8(dtmps8, 5);
  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

  for (i = 0; i < 2; ++i) {
    dst = dst_ptr + 8 * i;
    tmpp = tmp + 8 * i;
    d18u8 = vld1_u8(tmpp);
    tmpp += 16;
    d19u8 = vld1_u8(tmpp);
    tmpp += 16;
    d20u8 = vld1_u8(tmpp);
    tmpp += 16;
    d21u8 = vld1_u8(tmpp);
    tmpp += 16;
    d22u8 = vld1_u8(tmpp);
    tmpp += 16;
    for (j = 0; j < 4; ++j) {
      d23u8 = vld1_u8(tmpp);
      tmpp += 16;
      d24u8 = vld1_u8(tmpp);
      tmpp += 16;
      d25u8 = vld1_u8(tmpp);
      tmpp += 16;
      d26u8 = vld1_u8(tmpp);
      tmpp += 16;

      q3u16 = vmull_u8(d18u8, d0u8);
      q4u16 = vmull_u8(d19u8, d0u8);
      q5u16 = vmull_u8(d20u8, d0u8);
      q6u16 = vmull_u8(d21u8, d0u8);

      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);

      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);

      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);

      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);

      q7u16 = vmull_u8(d21u8, d3u8);
      q8u16 = vmull_u8(d22u8, d3u8);
      q9u16 = vmull_u8(d23u8, d3u8);
      q10u16 = vmull_u8(d24u8, d3u8);

      q3s16 = vreinterpretq_s16_u16(q3u16);
      q4s16 = vreinterpretq_s16_u16(q4u16);
      q5s16 = vreinterpretq_s16_u16(q5u16);
      q6s16 = vreinterpretq_s16_u16(q6u16);
      q7s16 = vreinterpretq_s16_u16(q7u16);
      q8s16 = vreinterpretq_s16_u16(q8u16);
      q9s16 = vreinterpretq_s16_u16(q9u16);
      q10s16 = vreinterpretq_s16_u16(q10u16);

      q7s16 = vqaddq_s16(q7s16, q3s16);
      q8s16 = vqaddq_s16(q8s16, q4s16);
      q9s16 = vqaddq_s16(q9s16, q5s16);
      q10s16 = vqaddq_s16(q10s16, q6s16);

      d6u8 = vqrshrun_n_s16(q7s16, 7);
      d7u8 = vqrshrun_n_s16(q8s16, 7);
      d8u8 = vqrshrun_n_s16(q9s16, 7);
      d9u8 = vqrshrun_n_s16(q10s16, 7);

      d18u8 = d22u8;
      d19u8 = d23u8;
      d20u8 = d24u8;
      d21u8 = d25u8;
      d22u8 = d26u8;

      vst1_u8(dst, d6u8);
      dst += dst_pitch;
      vst1_u8(dst, d7u8);
      dst += dst_pitch;
      vst1_u8(dst, d8u8);
      dst += dst_pitch;
      vst1_u8(dst, d9u8);
      dst += dst_pitch;
    }
  }
  return;
}