1 /* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <arm_neon.h> 12 13 #ifdef _MSC_VER 14 #define __builtin_prefetch(x) 15 #endif 16 17 static const int8_t vp8_sub_pel_filters[8][8] = { 18 {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positionyys are */ 19 {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */ 20 {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ 21 {0, -9, 93, 50, -6, 0, 0, 0}, 22 {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ 23 {0, -6, 50, 93, -9, 0, 0, 0}, 24 {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ 25 {0, -1, 12, 123, -6, 0, 0, 0}, 26 }; 27 28 void vp8_sixtap_predict4x4_neon( 29 unsigned char *src_ptr, 30 int src_pixels_per_line, 31 int xoffset, 32 int yoffset, 33 unsigned char *dst_ptr, 34 int dst_pitch) { 35 unsigned char *src; 36 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8; 37 uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; 38 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; 39 uint32x2_t d27u32, d28u32, d29u32, d30u32, d31u32; 40 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; 41 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; 42 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; 43 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; 44 uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8; 45 uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64; 46 uint32x2x2_t d0u32x2, d1u32x2; 47 48 if (xoffset == 0) { // secondpass_filter4x4_only 49 // load second_pass filter 50 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 51 d0s8 = vdup_lane_s8(dtmps8, 0); 52 d1s8 = 
vdup_lane_s8(dtmps8, 1); 53 d2s8 = vdup_lane_s8(dtmps8, 2); 54 d3s8 = vdup_lane_s8(dtmps8, 3); 55 d4s8 = vdup_lane_s8(dtmps8, 4); 56 d5s8 = vdup_lane_s8(dtmps8, 5); 57 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 58 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 59 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 60 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 61 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 62 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 63 64 // load src data 65 src = src_ptr - src_pixels_per_line * 2; 66 d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0); 67 src += src_pixels_per_line; 68 d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1); 69 src += src_pixels_per_line; 70 d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0); 71 src += src_pixels_per_line; 72 d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1); 73 src += src_pixels_per_line; 74 d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0); 75 src += src_pixels_per_line; 76 d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1); 77 src += src_pixels_per_line; 78 d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0); 79 src += src_pixels_per_line; 80 d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1); 81 src += src_pixels_per_line; 82 d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0); 83 84 d27u8 = vreinterpret_u8_u32(d27u32); 85 d28u8 = vreinterpret_u8_u32(d28u32); 86 d29u8 = vreinterpret_u8_u32(d29u32); 87 d30u8 = vreinterpret_u8_u32(d30u32); 88 d31u8 = vreinterpret_u8_u32(d31u32); 89 90 d23u8 = vext_u8(d27u8, d28u8, 4); 91 d24u8 = vext_u8(d28u8, d29u8, 4); 92 d25u8 = vext_u8(d29u8, d30u8, 4); 93 d26u8 = vext_u8(d30u8, d31u8, 4); 94 95 q3u16 = vmull_u8(d27u8, d0u8); 96 q4u16 = vmull_u8(d28u8, d0u8); 97 q5u16 = vmull_u8(d25u8, d5u8); 98 q6u16 = vmull_u8(d26u8, d5u8); 99 100 q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); 101 q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); 102 q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); 103 q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); 104 105 q3u16 = vmlal_u8(q3u16, 
d28u8, d2u8); 106 q4u16 = vmlal_u8(q4u16, d29u8, d2u8); 107 q5u16 = vmlal_u8(q5u16, d24u8, d3u8); 108 q6u16 = vmlal_u8(q6u16, d25u8, d3u8); 109 110 q3s16 = vreinterpretq_s16_u16(q3u16); 111 q4s16 = vreinterpretq_s16_u16(q4u16); 112 q5s16 = vreinterpretq_s16_u16(q5u16); 113 q6s16 = vreinterpretq_s16_u16(q6u16); 114 115 q5s16 = vqaddq_s16(q5s16, q3s16); 116 q6s16 = vqaddq_s16(q6s16, q4s16); 117 118 d3u8 = vqrshrun_n_s16(q5s16, 7); 119 d4u8 = vqrshrun_n_s16(q6s16, 7); 120 121 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); 122 dst_ptr += dst_pitch; 123 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); 124 dst_ptr += dst_pitch; 125 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); 126 dst_ptr += dst_pitch; 127 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); 128 return; 129 } 130 131 // load first_pass filter 132 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); 133 d0s8 = vdup_lane_s8(dtmps8, 0); 134 d1s8 = vdup_lane_s8(dtmps8, 1); 135 d2s8 = vdup_lane_s8(dtmps8, 2); 136 d3s8 = vdup_lane_s8(dtmps8, 3); 137 d4s8 = vdup_lane_s8(dtmps8, 4); 138 d5s8 = vdup_lane_s8(dtmps8, 5); 139 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 140 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 141 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 142 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 143 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 144 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 145 146 // First pass: output_height lines x output_width columns (9x4) 147 148 if (yoffset == 0) // firstpass_filter4x4_only 149 src = src_ptr - 2; 150 else 151 src = src_ptr - 2 - (src_pixels_per_line * 2); 152 153 q3u8 = vld1q_u8(src); 154 src += src_pixels_per_line; 155 q4u8 = vld1q_u8(src); 156 src += src_pixels_per_line; 157 q5u8 = vld1q_u8(src); 158 src += src_pixels_per_line; 159 q6u8 = vld1q_u8(src); 160 src += src_pixels_per_line; 161 162 d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); 163 d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 
5); 164 d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); 165 d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); 166 167 // vswp here 168 q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); 169 q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); 170 171 d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 172 vreinterpret_u32_u8(d19u8)); 173 d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 174 vreinterpret_u32_u8(d21u8)); 175 q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); 176 q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); 177 178 // keep original src data in q4 q6 179 q4u64 = vreinterpretq_u64_u8(q3u8); 180 q6u64 = vreinterpretq_u64_u8(q5u8); 181 182 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 183 vreinterpret_u32_u8(vget_high_u8(q3u8))); 184 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 185 vreinterpret_u32_u8(vget_high_u8(q5u8))); 186 q9u64 = vshrq_n_u64(q4u64, 8); 187 q10u64 = vshrq_n_u64(q6u64, 8); 188 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); 189 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); 190 191 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 192 vreinterpret_u32_u64(vget_high_u64(q9u64))); 193 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 194 vreinterpret_u32_u64(vget_high_u64(q10u64))); 195 q3u64 = vshrq_n_u64(q4u64, 32); 196 q5u64 = vshrq_n_u64(q6u64, 32); 197 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); 198 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); 199 200 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 201 vreinterpret_u32_u64(vget_high_u64(q3u64))); 202 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 203 vreinterpret_u32_u64(vget_high_u64(q5u64))); 204 q9u64 = vshrq_n_u64(q4u64, 16); 205 q10u64 = vshrq_n_u64(q6u64, 16); 206 q7u16 = 
vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); 207 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); 208 209 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 210 vreinterpret_u32_u64(vget_high_u64(q9u64))); 211 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 212 vreinterpret_u32_u64(vget_high_u64(q10u64))); 213 q3u64 = vshrq_n_u64(q4u64, 24); 214 q5u64 = vshrq_n_u64(q6u64, 24); 215 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); 216 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); 217 218 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 219 vreinterpret_u32_u64(vget_high_u64(q3u64))); 220 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 221 vreinterpret_u32_u64(vget_high_u64(q5u64))); 222 q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); 223 q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); 224 225 q7s16 = vreinterpretq_s16_u16(q7u16); 226 q8s16 = vreinterpretq_s16_u16(q8u16); 227 q9s16 = vreinterpretq_s16_u16(q9u16); 228 q10s16 = vreinterpretq_s16_u16(q10u16); 229 q7s16 = vqaddq_s16(q7s16, q9s16); 230 q8s16 = vqaddq_s16(q8s16, q10s16); 231 232 d27u8 = vqrshrun_n_s16(q7s16, 7); 233 d28u8 = vqrshrun_n_s16(q8s16, 7); 234 235 if (yoffset == 0) { // firstpass_filter4x4_only 236 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0); 237 dst_ptr += dst_pitch; 238 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1); 239 dst_ptr += dst_pitch; 240 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); 241 dst_ptr += dst_pitch; 242 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); 243 return; 244 } 245 246 // First Pass on rest 5-line data 247 q3u8 = vld1q_u8(src); 248 src += src_pixels_per_line; 249 q4u8 = vld1q_u8(src); 250 src += src_pixels_per_line; 251 q5u8 = vld1q_u8(src); 252 src += src_pixels_per_line; 253 q6u8 = 
vld1q_u8(src); 254 src += src_pixels_per_line; 255 q11u8 = vld1q_u8(src); 256 257 d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); 258 d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); 259 d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); 260 d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); 261 262 // vswp here 263 q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); 264 q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); 265 266 d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 267 vreinterpret_u32_u8(d19u8)); 268 d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 269 vreinterpret_u32_u8(d21u8)); 270 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5); 271 q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); 272 q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); 273 q12u16 = vmull_u8(d31u8, d5u8); 274 275 q4u64 = vreinterpretq_u64_u8(q3u8); 276 q6u64 = vreinterpretq_u64_u8(q5u8); 277 278 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 279 vreinterpret_u32_u8(vget_high_u8(q3u8))); 280 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 281 vreinterpret_u32_u8(vget_high_u8(q5u8))); 282 q9u64 = vshrq_n_u64(q4u64, 8); 283 q10u64 = vshrq_n_u64(q6u64, 8); 284 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); 285 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); 286 q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8); 287 288 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 289 vreinterpret_u32_u64(vget_high_u64(q9u64))); 290 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 291 vreinterpret_u32_u64(vget_high_u64(q10u64))); 292 q3u64 = vshrq_n_u64(q4u64, 32); 293 q5u64 = vshrq_n_u64(q6u64, 32); 294 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1); 295 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); 296 q8u16 = vmlsl_u8(q8u16, 
vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); 297 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); 298 299 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 300 vreinterpret_u32_u64(vget_high_u64(q3u64))); 301 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 302 vreinterpret_u32_u64(vget_high_u64(q5u64))); 303 q9u64 = vshrq_n_u64(q4u64, 16); 304 q10u64 = vshrq_n_u64(q6u64, 16); 305 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4); 306 q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); 307 q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); 308 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); 309 310 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 311 vreinterpret_u32_u64(vget_high_u64(q9u64))); 312 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 313 vreinterpret_u32_u64(vget_high_u64(q10u64))); 314 q3u64 = vshrq_n_u64(q4u64, 24); 315 q5u64 = vshrq_n_u64(q6u64, 24); 316 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2); 317 q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); 318 q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); 319 q12u16 = vmlal_u8(q12u16, d31u8, d2u8); 320 321 d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 322 vreinterpret_u32_u64(vget_high_u64(q3u64))); 323 d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 324 vreinterpret_u32_u64(vget_high_u64(q5u64))); 325 d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3); 326 q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); 327 q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); 328 q11u16 = vmull_u8(d31u8, d3u8); 329 330 q7s16 = vreinterpretq_s16_u16(q7u16); 331 q8s16 = vreinterpretq_s16_u16(q8u16); 332 q9s16 = vreinterpretq_s16_u16(q9u16); 333 q10s16 = vreinterpretq_s16_u16(q10u16); 334 q11s16 = vreinterpretq_s16_u16(q11u16); 335 q12s16 = vreinterpretq_s16_u16(q12u16); 
336 q7s16 = vqaddq_s16(q7s16, q9s16); 337 q8s16 = vqaddq_s16(q8s16, q10s16); 338 q12s16 = vqaddq_s16(q12s16, q11s16); 339 340 d29u8 = vqrshrun_n_s16(q7s16, 7); 341 d30u8 = vqrshrun_n_s16(q8s16, 7); 342 d31u8 = vqrshrun_n_s16(q12s16, 7); 343 344 // Second pass: 4x4 345 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 346 d0s8 = vdup_lane_s8(dtmps8, 0); 347 d1s8 = vdup_lane_s8(dtmps8, 1); 348 d2s8 = vdup_lane_s8(dtmps8, 2); 349 d3s8 = vdup_lane_s8(dtmps8, 3); 350 d4s8 = vdup_lane_s8(dtmps8, 4); 351 d5s8 = vdup_lane_s8(dtmps8, 5); 352 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 353 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 354 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 355 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 356 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 357 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 358 359 d23u8 = vext_u8(d27u8, d28u8, 4); 360 d24u8 = vext_u8(d28u8, d29u8, 4); 361 d25u8 = vext_u8(d29u8, d30u8, 4); 362 d26u8 = vext_u8(d30u8, d31u8, 4); 363 364 q3u16 = vmull_u8(d27u8, d0u8); 365 q4u16 = vmull_u8(d28u8, d0u8); 366 q5u16 = vmull_u8(d25u8, d5u8); 367 q6u16 = vmull_u8(d26u8, d5u8); 368 369 q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); 370 q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); 371 q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); 372 q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); 373 374 q3u16 = vmlal_u8(q3u16, d28u8, d2u8); 375 q4u16 = vmlal_u8(q4u16, d29u8, d2u8); 376 q5u16 = vmlal_u8(q5u16, d24u8, d3u8); 377 q6u16 = vmlal_u8(q6u16, d25u8, d3u8); 378 379 q3s16 = vreinterpretq_s16_u16(q3u16); 380 q4s16 = vreinterpretq_s16_u16(q4u16); 381 q5s16 = vreinterpretq_s16_u16(q5u16); 382 q6s16 = vreinterpretq_s16_u16(q6u16); 383 384 q5s16 = vqaddq_s16(q5s16, q3s16); 385 q6s16 = vqaddq_s16(q6s16, q4s16); 386 387 d3u8 = vqrshrun_n_s16(q5s16, 7); 388 d4u8 = vqrshrun_n_s16(q6s16, 7); 389 390 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); 391 dst_ptr += dst_pitch; 392 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); 393 dst_ptr += dst_pitch; 394 
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); 395 dst_ptr += dst_pitch; 396 vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); 397 return; 398 } 399 400 void vp8_sixtap_predict8x4_neon( 401 unsigned char *src_ptr, 402 int src_pixels_per_line, 403 int xoffset, 404 int yoffset, 405 unsigned char *dst_ptr, 406 int dst_pitch) { 407 unsigned char *src; 408 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; 409 uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8; 410 uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8; 411 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; 412 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; 413 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; 414 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; 415 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; 416 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8; 417 418 if (xoffset == 0) { // secondpass_filter8x4_only 419 // load second_pass filter 420 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 421 d0s8 = vdup_lane_s8(dtmps8, 0); 422 d1s8 = vdup_lane_s8(dtmps8, 1); 423 d2s8 = vdup_lane_s8(dtmps8, 2); 424 d3s8 = vdup_lane_s8(dtmps8, 3); 425 d4s8 = vdup_lane_s8(dtmps8, 4); 426 d5s8 = vdup_lane_s8(dtmps8, 5); 427 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 428 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 429 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 430 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 431 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 432 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 433 434 // load src data 435 src = src_ptr - src_pixels_per_line * 2; 436 d22u8 = vld1_u8(src); 437 src += src_pixels_per_line; 438 d23u8 = vld1_u8(src); 439 src += src_pixels_per_line; 440 d24u8 = vld1_u8(src); 441 src += src_pixels_per_line; 442 d25u8 = vld1_u8(src); 443 src += src_pixels_per_line; 444 d26u8 = vld1_u8(src); 445 src += src_pixels_per_line; 446 d27u8 = vld1_u8(src); 447 src += src_pixels_per_line; 448 d28u8 = vld1_u8(src); 449 src += src_pixels_per_line; 450 d29u8 = vld1_u8(src); 
        src += src_pixels_per_line;
        d30u8 = vld1_u8(src);

        // vertical 6-tap across rows d22..d30; tap signs applied via
        // vmlal (positive) / vmlsl (negative) on the magnitude filters
        q3u16 = vmull_u8(d22u8, d0u8);
        q4u16 = vmull_u8(d23u8, d0u8);
        q5u16 = vmull_u8(d24u8, d0u8);
        q6u16 = vmull_u8(d25u8, d0u8);

        q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

        q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

        q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

        q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

        // center tap accumulated separately so the final add saturates
        q7u16 = vmull_u8(d25u8, d3u8);
        q8u16 = vmull_u8(d26u8, d3u8);
        q9u16 = vmull_u8(d27u8, d3u8);
        q10u16 = vmull_u8(d28u8, d3u8);

        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);

        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);

        // round/shift by 7 with unsigned saturation to bytes
        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);

        vst1_u8(dst_ptr, d6u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d7u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d8u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d9u8);
        return;
    }

    // load first_pass filter
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    // First pass: output_height lines x output_width columns (9x4)
    if (yoffset == 0)  // firstpass_filter8x4_only
        src = src_ptr - 2;
    else
        src = src_ptr - 2 - (src_pixels_per_line * 2);
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);

    // horizontal 6-tap: vext selects the column-shifted 8-byte windows
    q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);

    q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);

    q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);

    q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);

    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);

    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);

    // center tap in separate accumulators for the saturating final add
    q3u16 = vmull_u8(d28u8, d3u8);
    q4u16 = vmull_u8(d29u8, d3u8);
    q5u16 = vmull_u8(d30u8, d3u8);
    q6u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d22u8 = vqrshrun_n_s16(q7s16, 7);
    d23u8 = vqrshrun_n_s16(q8s16, 7);
    d24u8 = vqrshrun_n_s16(q9s16, 7);
    d25u8 = vqrshrun_n_s16(q10s16, 7);

    if (yoffset == 0) {  // firstpass_filter8x4_only
        vst1_u8(dst_ptr, d22u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d23u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d24u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d25u8);
        return;
    }

    // First Pass on rest 5-line data
    src += src_pixels_per_line;
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q7u8 = vld1q_u8(src);

    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);

    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);

    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);

    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);

    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);

    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);

    q3u16 = vmull_u8(d27u8, d3u8);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);

    q8s16 = vqaddq_s16(q8s16, q3s16);
    q9s16 = vqaddq_s16(q9s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q11s16 = vqaddq_s16(q11s16, q6s16);
    q12s16 = vqaddq_s16(q12s16, q7s16);

    d26u8 = vqrshrun_n_s16(q8s16, 7);
    d27u8 = vqrshrun_n_s16(q9s16, 7);
    d28u8 = vqrshrun_n_s16(q10s16, 7);
    d29u8 = vqrshrun_n_s16(q11s16, 7);
    d30u8 = vqrshrun_n_s16(q12s16, 7);

    // Second pass: 8x4 — vertical 6-tap over the 9 filtered rows d22..d30
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));

    q3u16 = vmull_u8(d22u8, d0u8);
    q4u16 = vmull_u8(d23u8, d0u8);
    q5u16 = vmull_u8(d24u8, d0u8);
    q6u16 = vmull_u8(d25u8, d0u8);

    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);

    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);

    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);

    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);

    q7u16 = vmull_u8(d25u8, d3u8);
    q8u16 = vmull_u8(d26u8, d3u8);
    q9u16 = vmull_u8(d27u8, d3u8);
    q10u16 = vmull_u8(d28u8, d3u8);

    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);

    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);

    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);

    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    return;
}

/* Six-tap sub-pixel prediction of an 8x8 block (definition continues
 * beyond this chunk). */
void vp8_sixtap_predict8x8_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src, *tmpp;
    unsigned char tmp[64];
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;

    if (xoffset == 0) {  // secondpass_filter8x8_only
        // load second_pass filter
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 =
vreinterpret_u8_s8(vabs_s8(d4s8)); 835 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 836 837 // load src data 838 src = src_ptr - src_pixels_per_line * 2; 839 d18u8 = vld1_u8(src); 840 src += src_pixels_per_line; 841 d19u8 = vld1_u8(src); 842 src += src_pixels_per_line; 843 d20u8 = vld1_u8(src); 844 src += src_pixels_per_line; 845 d21u8 = vld1_u8(src); 846 src += src_pixels_per_line; 847 d22u8 = vld1_u8(src); 848 src += src_pixels_per_line; 849 d23u8 = vld1_u8(src); 850 src += src_pixels_per_line; 851 d24u8 = vld1_u8(src); 852 src += src_pixels_per_line; 853 d25u8 = vld1_u8(src); 854 src += src_pixels_per_line; 855 d26u8 = vld1_u8(src); 856 src += src_pixels_per_line; 857 d27u8 = vld1_u8(src); 858 src += src_pixels_per_line; 859 d28u8 = vld1_u8(src); 860 src += src_pixels_per_line; 861 d29u8 = vld1_u8(src); 862 src += src_pixels_per_line; 863 d30u8 = vld1_u8(src); 864 865 for (i = 2; i > 0; i--) { 866 q3u16 = vmull_u8(d18u8, d0u8); 867 q4u16 = vmull_u8(d19u8, d0u8); 868 q5u16 = vmull_u8(d20u8, d0u8); 869 q6u16 = vmull_u8(d21u8, d0u8); 870 871 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); 872 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); 873 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); 874 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); 875 876 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); 877 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); 878 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); 879 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); 880 881 q3u16 = vmlal_u8(q3u16, d20u8, d2u8); 882 q4u16 = vmlal_u8(q4u16, d21u8, d2u8); 883 q5u16 = vmlal_u8(q5u16, d22u8, d2u8); 884 q6u16 = vmlal_u8(q6u16, d23u8, d2u8); 885 886 q3u16 = vmlal_u8(q3u16, d23u8, d5u8); 887 q4u16 = vmlal_u8(q4u16, d24u8, d5u8); 888 q5u16 = vmlal_u8(q5u16, d25u8, d5u8); 889 q6u16 = vmlal_u8(q6u16, d26u8, d5u8); 890 891 q7u16 = vmull_u8(d21u8, d3u8); 892 q8u16 = vmull_u8(d22u8, d3u8); 893 q9u16 = vmull_u8(d23u8, d3u8); 894 q10u16 = vmull_u8(d24u8, d3u8); 895 896 q3s16 = vreinterpretq_s16_u16(q3u16); 897 q4s16 = vreinterpretq_s16_u16(q4u16); 898 q5s16 = 
vreinterpretq_s16_u16(q5u16); 899 q6s16 = vreinterpretq_s16_u16(q6u16); 900 q7s16 = vreinterpretq_s16_u16(q7u16); 901 q8s16 = vreinterpretq_s16_u16(q8u16); 902 q9s16 = vreinterpretq_s16_u16(q9u16); 903 q10s16 = vreinterpretq_s16_u16(q10u16); 904 905 q7s16 = vqaddq_s16(q7s16, q3s16); 906 q8s16 = vqaddq_s16(q8s16, q4s16); 907 q9s16 = vqaddq_s16(q9s16, q5s16); 908 q10s16 = vqaddq_s16(q10s16, q6s16); 909 910 d6u8 = vqrshrun_n_s16(q7s16, 7); 911 d7u8 = vqrshrun_n_s16(q8s16, 7); 912 d8u8 = vqrshrun_n_s16(q9s16, 7); 913 d9u8 = vqrshrun_n_s16(q10s16, 7); 914 915 d18u8 = d22u8; 916 d19u8 = d23u8; 917 d20u8 = d24u8; 918 d21u8 = d25u8; 919 d22u8 = d26u8; 920 d23u8 = d27u8; 921 d24u8 = d28u8; 922 d25u8 = d29u8; 923 d26u8 = d30u8; 924 925 vst1_u8(dst_ptr, d6u8); 926 dst_ptr += dst_pitch; 927 vst1_u8(dst_ptr, d7u8); 928 dst_ptr += dst_pitch; 929 vst1_u8(dst_ptr, d8u8); 930 dst_ptr += dst_pitch; 931 vst1_u8(dst_ptr, d9u8); 932 dst_ptr += dst_pitch; 933 } 934 return; 935 } 936 937 // load first_pass filter 938 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); 939 d0s8 = vdup_lane_s8(dtmps8, 0); 940 d1s8 = vdup_lane_s8(dtmps8, 1); 941 d2s8 = vdup_lane_s8(dtmps8, 2); 942 d3s8 = vdup_lane_s8(dtmps8, 3); 943 d4s8 = vdup_lane_s8(dtmps8, 4); 944 d5s8 = vdup_lane_s8(dtmps8, 5); 945 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 946 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 947 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 948 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 949 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 950 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 951 952 // First pass: output_height lines x output_width columns (9x4) 953 if (yoffset == 0) // firstpass_filter4x4_only 954 src = src_ptr - 2; 955 else 956 src = src_ptr - 2 - (src_pixels_per_line * 2); 957 958 tmpp = tmp; 959 for (i = 2; i > 0; i--) { 960 q3u8 = vld1q_u8(src); 961 src += src_pixels_per_line; 962 q4u8 = vld1q_u8(src); 963 src += src_pixels_per_line; 964 q5u8 = vld1q_u8(src); 965 src += src_pixels_per_line; 966 q6u8 = 
vld1q_u8(src); 967 src += src_pixels_per_line; 968 969 __builtin_prefetch(src); 970 __builtin_prefetch(src + src_pixels_per_line); 971 __builtin_prefetch(src + src_pixels_per_line * 2); 972 973 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); 974 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); 975 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); 976 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); 977 978 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); 979 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); 980 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); 981 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); 982 983 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); 984 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); 985 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); 986 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); 987 988 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); 989 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); 990 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); 991 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); 992 993 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); 994 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); 995 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); 996 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); 997 998 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); 999 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); 1000 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); 1001 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); 1002 1003 q7u16 = vmlal_u8(q7u16, d28u8, d2u8); 1004 q8u16 = vmlal_u8(q8u16, d29u8, d2u8); 1005 q9u16 = vmlal_u8(q9u16, d30u8, d2u8); 1006 q10u16 = vmlal_u8(q10u16, d31u8, d2u8); 1007 1008 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); 1009 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); 1010 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); 1011 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); 1012 1013 q7u16 = vmlal_u8(q7u16, d28u8, d5u8); 1014 q8u16 = 
vmlal_u8(q8u16, d29u8, d5u8); 1015 q9u16 = vmlal_u8(q9u16, d30u8, d5u8); 1016 q10u16 = vmlal_u8(q10u16, d31u8, d5u8); 1017 1018 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); 1019 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); 1020 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); 1021 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); 1022 1023 q3u16 = vmull_u8(d28u8, d3u8); 1024 q4u16 = vmull_u8(d29u8, d3u8); 1025 q5u16 = vmull_u8(d30u8, d3u8); 1026 q6u16 = vmull_u8(d31u8, d3u8); 1027 1028 q3s16 = vreinterpretq_s16_u16(q3u16); 1029 q4s16 = vreinterpretq_s16_u16(q4u16); 1030 q5s16 = vreinterpretq_s16_u16(q5u16); 1031 q6s16 = vreinterpretq_s16_u16(q6u16); 1032 q7s16 = vreinterpretq_s16_u16(q7u16); 1033 q8s16 = vreinterpretq_s16_u16(q8u16); 1034 q9s16 = vreinterpretq_s16_u16(q9u16); 1035 q10s16 = vreinterpretq_s16_u16(q10u16); 1036 1037 q7s16 = vqaddq_s16(q7s16, q3s16); 1038 q8s16 = vqaddq_s16(q8s16, q4s16); 1039 q9s16 = vqaddq_s16(q9s16, q5s16); 1040 q10s16 = vqaddq_s16(q10s16, q6s16); 1041 1042 d22u8 = vqrshrun_n_s16(q7s16, 7); 1043 d23u8 = vqrshrun_n_s16(q8s16, 7); 1044 d24u8 = vqrshrun_n_s16(q9s16, 7); 1045 d25u8 = vqrshrun_n_s16(q10s16, 7); 1046 1047 if (yoffset == 0) { // firstpass_filter8x4_only 1048 vst1_u8(dst_ptr, d22u8); 1049 dst_ptr += dst_pitch; 1050 vst1_u8(dst_ptr, d23u8); 1051 dst_ptr += dst_pitch; 1052 vst1_u8(dst_ptr, d24u8); 1053 dst_ptr += dst_pitch; 1054 vst1_u8(dst_ptr, d25u8); 1055 dst_ptr += dst_pitch; 1056 } else { 1057 vst1_u8(tmpp, d22u8); 1058 tmpp += 8; 1059 vst1_u8(tmpp, d23u8); 1060 tmpp += 8; 1061 vst1_u8(tmpp, d24u8); 1062 tmpp += 8; 1063 vst1_u8(tmpp, d25u8); 1064 tmpp += 8; 1065 } 1066 } 1067 if (yoffset == 0) 1068 return; 1069 1070 // First Pass on rest 5-line data 1071 q3u8 = vld1q_u8(src); 1072 src += src_pixels_per_line; 1073 q4u8 = vld1q_u8(src); 1074 src += src_pixels_per_line; 1075 q5u8 = vld1q_u8(src); 1076 src += src_pixels_per_line; 1077 q6u8 = vld1q_u8(src); 1078 src += 
src_pixels_per_line; 1079 q7u8 = vld1q_u8(src); 1080 1081 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); 1082 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); 1083 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); 1084 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); 1085 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); 1086 1087 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); 1088 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); 1089 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); 1090 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); 1091 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); 1092 1093 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); 1094 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); 1095 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); 1096 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); 1097 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); 1098 1099 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); 1100 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); 1101 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); 1102 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); 1103 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); 1104 1105 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); 1106 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); 1107 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); 1108 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); 1109 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); 1110 1111 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); 1112 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); 1113 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); 1114 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); 1115 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); 1116 1117 q8u16 = vmlal_u8(q8u16, d27u8, d2u8); 1118 q9u16 = vmlal_u8(q9u16, d28u8, d2u8); 1119 q10u16 = vmlal_u8(q10u16, d29u8, d2u8); 1120 q11u16 = vmlal_u8(q11u16, d30u8, d2u8); 1121 q12u16 = vmlal_u8(q12u16, d31u8, d2u8); 1122 1123 d27u8 = vext_u8(vget_low_u8(q3u8), 
vget_high_u8(q3u8), 5); 1124 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); 1125 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); 1126 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); 1127 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); 1128 1129 q8u16 = vmlal_u8(q8u16, d27u8, d5u8); 1130 q9u16 = vmlal_u8(q9u16, d28u8, d5u8); 1131 q10u16 = vmlal_u8(q10u16, d29u8, d5u8); 1132 q11u16 = vmlal_u8(q11u16, d30u8, d5u8); 1133 q12u16 = vmlal_u8(q12u16, d31u8, d5u8); 1134 1135 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); 1136 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); 1137 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); 1138 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); 1139 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); 1140 1141 q3u16 = vmull_u8(d27u8, d3u8); 1142 q4u16 = vmull_u8(d28u8, d3u8); 1143 q5u16 = vmull_u8(d29u8, d3u8); 1144 q6u16 = vmull_u8(d30u8, d3u8); 1145 q7u16 = vmull_u8(d31u8, d3u8); 1146 1147 q3s16 = vreinterpretq_s16_u16(q3u16); 1148 q4s16 = vreinterpretq_s16_u16(q4u16); 1149 q5s16 = vreinterpretq_s16_u16(q5u16); 1150 q6s16 = vreinterpretq_s16_u16(q6u16); 1151 q7s16 = vreinterpretq_s16_u16(q7u16); 1152 q8s16 = vreinterpretq_s16_u16(q8u16); 1153 q9s16 = vreinterpretq_s16_u16(q9u16); 1154 q10s16 = vreinterpretq_s16_u16(q10u16); 1155 q11s16 = vreinterpretq_s16_u16(q11u16); 1156 q12s16 = vreinterpretq_s16_u16(q12u16); 1157 1158 q8s16 = vqaddq_s16(q8s16, q3s16); 1159 q9s16 = vqaddq_s16(q9s16, q4s16); 1160 q10s16 = vqaddq_s16(q10s16, q5s16); 1161 q11s16 = vqaddq_s16(q11s16, q6s16); 1162 q12s16 = vqaddq_s16(q12s16, q7s16); 1163 1164 d26u8 = vqrshrun_n_s16(q8s16, 7); 1165 d27u8 = vqrshrun_n_s16(q9s16, 7); 1166 d28u8 = vqrshrun_n_s16(q10s16, 7); 1167 d29u8 = vqrshrun_n_s16(q11s16, 7); 1168 d30u8 = vqrshrun_n_s16(q12s16, 7); 1169 1170 // Second pass: 8x8 1171 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 1172 d0s8 = vdup_lane_s8(dtmps8, 0); 1173 d1s8 = 
vdup_lane_s8(dtmps8, 1); 1174 d2s8 = vdup_lane_s8(dtmps8, 2); 1175 d3s8 = vdup_lane_s8(dtmps8, 3); 1176 d4s8 = vdup_lane_s8(dtmps8, 4); 1177 d5s8 = vdup_lane_s8(dtmps8, 5); 1178 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 1179 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 1180 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 1181 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 1182 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 1183 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 1184 1185 tmpp = tmp; 1186 q9u8 = vld1q_u8(tmpp); 1187 tmpp += 16; 1188 q10u8 = vld1q_u8(tmpp); 1189 tmpp += 16; 1190 q11u8 = vld1q_u8(tmpp); 1191 tmpp += 16; 1192 q12u8 = vld1q_u8(tmpp); 1193 1194 d18u8 = vget_low_u8(q9u8); 1195 d19u8 = vget_high_u8(q9u8); 1196 d20u8 = vget_low_u8(q10u8); 1197 d21u8 = vget_high_u8(q10u8); 1198 d22u8 = vget_low_u8(q11u8); 1199 d23u8 = vget_high_u8(q11u8); 1200 d24u8 = vget_low_u8(q12u8); 1201 d25u8 = vget_high_u8(q12u8); 1202 1203 for (i = 2; i > 0; i--) { 1204 q3u16 = vmull_u8(d18u8, d0u8); 1205 q4u16 = vmull_u8(d19u8, d0u8); 1206 q5u16 = vmull_u8(d20u8, d0u8); 1207 q6u16 = vmull_u8(d21u8, d0u8); 1208 1209 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); 1210 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); 1211 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); 1212 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); 1213 1214 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); 1215 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); 1216 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); 1217 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); 1218 1219 q3u16 = vmlal_u8(q3u16, d20u8, d2u8); 1220 q4u16 = vmlal_u8(q4u16, d21u8, d2u8); 1221 q5u16 = vmlal_u8(q5u16, d22u8, d2u8); 1222 q6u16 = vmlal_u8(q6u16, d23u8, d2u8); 1223 1224 q3u16 = vmlal_u8(q3u16, d23u8, d5u8); 1225 q4u16 = vmlal_u8(q4u16, d24u8, d5u8); 1226 q5u16 = vmlal_u8(q5u16, d25u8, d5u8); 1227 q6u16 = vmlal_u8(q6u16, d26u8, d5u8); 1228 1229 q7u16 = vmull_u8(d21u8, d3u8); 1230 q8u16 = vmull_u8(d22u8, d3u8); 1231 q9u16 = vmull_u8(d23u8, d3u8); 1232 q10u16 = vmull_u8(d24u8, d3u8); 1233 1234 q3s16 = vreinterpretq_s16_u16(q3u16); 
1235 q4s16 = vreinterpretq_s16_u16(q4u16); 1236 q5s16 = vreinterpretq_s16_u16(q5u16); 1237 q6s16 = vreinterpretq_s16_u16(q6u16); 1238 q7s16 = vreinterpretq_s16_u16(q7u16); 1239 q8s16 = vreinterpretq_s16_u16(q8u16); 1240 q9s16 = vreinterpretq_s16_u16(q9u16); 1241 q10s16 = vreinterpretq_s16_u16(q10u16); 1242 1243 q7s16 = vqaddq_s16(q7s16, q3s16); 1244 q8s16 = vqaddq_s16(q8s16, q4s16); 1245 q9s16 = vqaddq_s16(q9s16, q5s16); 1246 q10s16 = vqaddq_s16(q10s16, q6s16); 1247 1248 d6u8 = vqrshrun_n_s16(q7s16, 7); 1249 d7u8 = vqrshrun_n_s16(q8s16, 7); 1250 d8u8 = vqrshrun_n_s16(q9s16, 7); 1251 d9u8 = vqrshrun_n_s16(q10s16, 7); 1252 1253 d18u8 = d22u8; 1254 d19u8 = d23u8; 1255 d20u8 = d24u8; 1256 d21u8 = d25u8; 1257 d22u8 = d26u8; 1258 d23u8 = d27u8; 1259 d24u8 = d28u8; 1260 d25u8 = d29u8; 1261 d26u8 = d30u8; 1262 1263 vst1_u8(dst_ptr, d6u8); 1264 dst_ptr += dst_pitch; 1265 vst1_u8(dst_ptr, d7u8); 1266 dst_ptr += dst_pitch; 1267 vst1_u8(dst_ptr, d8u8); 1268 dst_ptr += dst_pitch; 1269 vst1_u8(dst_ptr, d9u8); 1270 dst_ptr += dst_pitch; 1271 } 1272 return; 1273 } 1274 1275 void vp8_sixtap_predict16x16_neon( 1276 unsigned char *src_ptr, 1277 int src_pixels_per_line, 1278 int xoffset, 1279 int yoffset, 1280 unsigned char *dst_ptr, 1281 int dst_pitch) { 1282 unsigned char *src, *src_tmp, *dst, *tmpp; 1283 unsigned char tmp[336]; 1284 int i, j; 1285 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; 1286 uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8; 1287 uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8; 1288 uint8x8_t d28u8, d29u8, d30u8, d31u8; 1289 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; 1290 uint8x16_t q3u8, q4u8; 1291 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16; 1292 uint16x8_t q11u16, q12u16, q13u16, q15u16; 1293 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16; 1294 int16x8_t q11s16, q12s16, q13s16, q15s16; 1295 1296 if (xoffset == 0) { // secondpass_filter8x8_only 1297 
// load second_pass filter 1298 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 1299 d0s8 = vdup_lane_s8(dtmps8, 0); 1300 d1s8 = vdup_lane_s8(dtmps8, 1); 1301 d2s8 = vdup_lane_s8(dtmps8, 2); 1302 d3s8 = vdup_lane_s8(dtmps8, 3); 1303 d4s8 = vdup_lane_s8(dtmps8, 4); 1304 d5s8 = vdup_lane_s8(dtmps8, 5); 1305 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 1306 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 1307 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 1308 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 1309 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 1310 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 1311 1312 // load src data 1313 src_tmp = src_ptr - src_pixels_per_line * 2; 1314 for (i = 0; i < 2; i++) { 1315 src = src_tmp + i * 8; 1316 dst = dst_ptr + i * 8; 1317 d18u8 = vld1_u8(src); 1318 src += src_pixels_per_line; 1319 d19u8 = vld1_u8(src); 1320 src += src_pixels_per_line; 1321 d20u8 = vld1_u8(src); 1322 src += src_pixels_per_line; 1323 d21u8 = vld1_u8(src); 1324 src += src_pixels_per_line; 1325 d22u8 = vld1_u8(src); 1326 src += src_pixels_per_line; 1327 for (j = 0; j < 4; j++) { 1328 d23u8 = vld1_u8(src); 1329 src += src_pixels_per_line; 1330 d24u8 = vld1_u8(src); 1331 src += src_pixels_per_line; 1332 d25u8 = vld1_u8(src); 1333 src += src_pixels_per_line; 1334 d26u8 = vld1_u8(src); 1335 src += src_pixels_per_line; 1336 1337 q3u16 = vmull_u8(d18u8, d0u8); 1338 q4u16 = vmull_u8(d19u8, d0u8); 1339 q5u16 = vmull_u8(d20u8, d0u8); 1340 q6u16 = vmull_u8(d21u8, d0u8); 1341 1342 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); 1343 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); 1344 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); 1345 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); 1346 1347 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); 1348 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); 1349 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); 1350 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); 1351 1352 q3u16 = vmlal_u8(q3u16, d20u8, d2u8); 1353 q4u16 = vmlal_u8(q4u16, d21u8, d2u8); 1354 q5u16 = vmlal_u8(q5u16, d22u8, d2u8); 1355 q6u16 = vmlal_u8(q6u16, d23u8, d2u8); 
1356 1357 q3u16 = vmlal_u8(q3u16, d23u8, d5u8); 1358 q4u16 = vmlal_u8(q4u16, d24u8, d5u8); 1359 q5u16 = vmlal_u8(q5u16, d25u8, d5u8); 1360 q6u16 = vmlal_u8(q6u16, d26u8, d5u8); 1361 1362 q7u16 = vmull_u8(d21u8, d3u8); 1363 q8u16 = vmull_u8(d22u8, d3u8); 1364 q9u16 = vmull_u8(d23u8, d3u8); 1365 q10u16 = vmull_u8(d24u8, d3u8); 1366 1367 q3s16 = vreinterpretq_s16_u16(q3u16); 1368 q4s16 = vreinterpretq_s16_u16(q4u16); 1369 q5s16 = vreinterpretq_s16_u16(q5u16); 1370 q6s16 = vreinterpretq_s16_u16(q6u16); 1371 q7s16 = vreinterpretq_s16_u16(q7u16); 1372 q8s16 = vreinterpretq_s16_u16(q8u16); 1373 q9s16 = vreinterpretq_s16_u16(q9u16); 1374 q10s16 = vreinterpretq_s16_u16(q10u16); 1375 1376 q7s16 = vqaddq_s16(q7s16, q3s16); 1377 q8s16 = vqaddq_s16(q8s16, q4s16); 1378 q9s16 = vqaddq_s16(q9s16, q5s16); 1379 q10s16 = vqaddq_s16(q10s16, q6s16); 1380 1381 d6u8 = vqrshrun_n_s16(q7s16, 7); 1382 d7u8 = vqrshrun_n_s16(q8s16, 7); 1383 d8u8 = vqrshrun_n_s16(q9s16, 7); 1384 d9u8 = vqrshrun_n_s16(q10s16, 7); 1385 1386 d18u8 = d22u8; 1387 d19u8 = d23u8; 1388 d20u8 = d24u8; 1389 d21u8 = d25u8; 1390 d22u8 = d26u8; 1391 1392 vst1_u8(dst, d6u8); 1393 dst += dst_pitch; 1394 vst1_u8(dst, d7u8); 1395 dst += dst_pitch; 1396 vst1_u8(dst, d8u8); 1397 dst += dst_pitch; 1398 vst1_u8(dst, d9u8); 1399 dst += dst_pitch; 1400 } 1401 } 1402 return; 1403 } 1404 1405 // load first_pass filter 1406 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); 1407 d0s8 = vdup_lane_s8(dtmps8, 0); 1408 d1s8 = vdup_lane_s8(dtmps8, 1); 1409 d2s8 = vdup_lane_s8(dtmps8, 2); 1410 d3s8 = vdup_lane_s8(dtmps8, 3); 1411 d4s8 = vdup_lane_s8(dtmps8, 4); 1412 d5s8 = vdup_lane_s8(dtmps8, 5); 1413 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); 1414 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 1415 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 1416 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 1417 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 1418 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 1419 1420 // First pass: output_height lines x output_width columns 
(9x4) 1421 if (yoffset == 0) { // firstpass_filter4x4_only 1422 src = src_ptr - 2; 1423 dst = dst_ptr; 1424 for (i = 0; i < 8; i++) { 1425 d6u8 = vld1_u8(src); 1426 d7u8 = vld1_u8(src + 8); 1427 d8u8 = vld1_u8(src + 16); 1428 src += src_pixels_per_line; 1429 d9u8 = vld1_u8(src); 1430 d10u8 = vld1_u8(src + 8); 1431 d11u8 = vld1_u8(src + 16); 1432 src += src_pixels_per_line; 1433 1434 __builtin_prefetch(src); 1435 __builtin_prefetch(src + src_pixels_per_line); 1436 1437 q6u16 = vmull_u8(d6u8, d0u8); 1438 q7u16 = vmull_u8(d7u8, d0u8); 1439 q8u16 = vmull_u8(d9u8, d0u8); 1440 q9u16 = vmull_u8(d10u8, d0u8); 1441 1442 d20u8 = vext_u8(d6u8, d7u8, 1); 1443 d21u8 = vext_u8(d9u8, d10u8, 1); 1444 d22u8 = vext_u8(d7u8, d8u8, 1); 1445 d23u8 = vext_u8(d10u8, d11u8, 1); 1446 d24u8 = vext_u8(d6u8, d7u8, 4); 1447 d25u8 = vext_u8(d9u8, d10u8, 4); 1448 d26u8 = vext_u8(d7u8, d8u8, 4); 1449 d27u8 = vext_u8(d10u8, d11u8, 4); 1450 d28u8 = vext_u8(d6u8, d7u8, 5); 1451 d29u8 = vext_u8(d9u8, d10u8, 5); 1452 1453 q6u16 = vmlsl_u8(q6u16, d20u8, d1u8); 1454 q8u16 = vmlsl_u8(q8u16, d21u8, d1u8); 1455 q7u16 = vmlsl_u8(q7u16, d22u8, d1u8); 1456 q9u16 = vmlsl_u8(q9u16, d23u8, d1u8); 1457 q6u16 = vmlsl_u8(q6u16, d24u8, d4u8); 1458 q8u16 = vmlsl_u8(q8u16, d25u8, d4u8); 1459 q7u16 = vmlsl_u8(q7u16, d26u8, d4u8); 1460 q9u16 = vmlsl_u8(q9u16, d27u8, d4u8); 1461 q6u16 = vmlal_u8(q6u16, d28u8, d5u8); 1462 q8u16 = vmlal_u8(q8u16, d29u8, d5u8); 1463 1464 d20u8 = vext_u8(d7u8, d8u8, 5); 1465 d21u8 = vext_u8(d10u8, d11u8, 5); 1466 d22u8 = vext_u8(d6u8, d7u8, 2); 1467 d23u8 = vext_u8(d9u8, d10u8, 2); 1468 d24u8 = vext_u8(d7u8, d8u8, 2); 1469 d25u8 = vext_u8(d10u8, d11u8, 2); 1470 d26u8 = vext_u8(d6u8, d7u8, 3); 1471 d27u8 = vext_u8(d9u8, d10u8, 3); 1472 d28u8 = vext_u8(d7u8, d8u8, 3); 1473 d29u8 = vext_u8(d10u8, d11u8, 3); 1474 1475 q7u16 = vmlal_u8(q7u16, d20u8, d5u8); 1476 q9u16 = vmlal_u8(q9u16, d21u8, d5u8); 1477 q6u16 = vmlal_u8(q6u16, d22u8, d2u8); 1478 q8u16 = vmlal_u8(q8u16, d23u8, d2u8); 1479 q7u16 = 
vmlal_u8(q7u16, d24u8, d2u8); 1480 q9u16 = vmlal_u8(q9u16, d25u8, d2u8); 1481 1482 q10u16 = vmull_u8(d26u8, d3u8); 1483 q11u16 = vmull_u8(d27u8, d3u8); 1484 q12u16 = vmull_u8(d28u8, d3u8); 1485 q15u16 = vmull_u8(d29u8, d3u8); 1486 1487 q6s16 = vreinterpretq_s16_u16(q6u16); 1488 q7s16 = vreinterpretq_s16_u16(q7u16); 1489 q8s16 = vreinterpretq_s16_u16(q8u16); 1490 q9s16 = vreinterpretq_s16_u16(q9u16); 1491 q10s16 = vreinterpretq_s16_u16(q10u16); 1492 q11s16 = vreinterpretq_s16_u16(q11u16); 1493 q12s16 = vreinterpretq_s16_u16(q12u16); 1494 q15s16 = vreinterpretq_s16_u16(q15u16); 1495 1496 q6s16 = vqaddq_s16(q6s16, q10s16); 1497 q8s16 = vqaddq_s16(q8s16, q11s16); 1498 q7s16 = vqaddq_s16(q7s16, q12s16); 1499 q9s16 = vqaddq_s16(q9s16, q15s16); 1500 1501 d6u8 = vqrshrun_n_s16(q6s16, 7); 1502 d7u8 = vqrshrun_n_s16(q7s16, 7); 1503 d8u8 = vqrshrun_n_s16(q8s16, 7); 1504 d9u8 = vqrshrun_n_s16(q9s16, 7); 1505 1506 q3u8 = vcombine_u8(d6u8, d7u8); 1507 q4u8 = vcombine_u8(d8u8, d9u8); 1508 vst1q_u8(dst, q3u8); 1509 dst += dst_pitch; 1510 vst1q_u8(dst, q4u8); 1511 dst += dst_pitch; 1512 } 1513 return; 1514 } 1515 1516 src = src_ptr - 2 - src_pixels_per_line * 2; 1517 tmpp = tmp; 1518 for (i = 0; i < 7; i++) { 1519 d6u8 = vld1_u8(src); 1520 d7u8 = vld1_u8(src + 8); 1521 d8u8 = vld1_u8(src + 16); 1522 src += src_pixels_per_line; 1523 d9u8 = vld1_u8(src); 1524 d10u8 = vld1_u8(src + 8); 1525 d11u8 = vld1_u8(src + 16); 1526 src += src_pixels_per_line; 1527 d12u8 = vld1_u8(src); 1528 d13u8 = vld1_u8(src + 8); 1529 d14u8 = vld1_u8(src + 16); 1530 src += src_pixels_per_line; 1531 1532 __builtin_prefetch(src); 1533 __builtin_prefetch(src + src_pixels_per_line); 1534 __builtin_prefetch(src + src_pixels_per_line * 2); 1535 1536 q8u16 = vmull_u8(d6u8, d0u8); 1537 q9u16 = vmull_u8(d7u8, d0u8); 1538 q10u16 = vmull_u8(d9u8, d0u8); 1539 q11u16 = vmull_u8(d10u8, d0u8); 1540 q12u16 = vmull_u8(d12u8, d0u8); 1541 q13u16 = vmull_u8(d13u8, d0u8); 1542 1543 d28u8 = vext_u8(d6u8, d7u8, 1); 1544 d29u8 = 
vext_u8(d9u8, d10u8, 1); 1545 d30u8 = vext_u8(d12u8, d13u8, 1); 1546 q8u16 = vmlsl_u8(q8u16, d28u8, d1u8); 1547 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); 1548 q12u16 = vmlsl_u8(q12u16, d30u8, d1u8); 1549 d28u8 = vext_u8(d7u8, d8u8, 1); 1550 d29u8 = vext_u8(d10u8, d11u8, 1); 1551 d30u8 = vext_u8(d13u8, d14u8, 1); 1552 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); 1553 q11u16 = vmlsl_u8(q11u16, d29u8, d1u8); 1554 q13u16 = vmlsl_u8(q13u16, d30u8, d1u8); 1555 1556 d28u8 = vext_u8(d6u8, d7u8, 4); 1557 d29u8 = vext_u8(d9u8, d10u8, 4); 1558 d30u8 = vext_u8(d12u8, d13u8, 4); 1559 q8u16 = vmlsl_u8(q8u16, d28u8, d4u8); 1560 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); 1561 q12u16 = vmlsl_u8(q12u16, d30u8, d4u8); 1562 d28u8 = vext_u8(d7u8, d8u8, 4); 1563 d29u8 = vext_u8(d10u8, d11u8, 4); 1564 d30u8 = vext_u8(d13u8, d14u8, 4); 1565 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); 1566 q11u16 = vmlsl_u8(q11u16, d29u8, d4u8); 1567 q13u16 = vmlsl_u8(q13u16, d30u8, d4u8); 1568 1569 d28u8 = vext_u8(d6u8, d7u8, 5); 1570 d29u8 = vext_u8(d9u8, d10u8, 5); 1571 d30u8 = vext_u8(d12u8, d13u8, 5); 1572 q8u16 = vmlal_u8(q8u16, d28u8, d5u8); 1573 q10u16 = vmlal_u8(q10u16, d29u8, d5u8); 1574 q12u16 = vmlal_u8(q12u16, d30u8, d5u8); 1575 d28u8 = vext_u8(d7u8, d8u8, 5); 1576 d29u8 = vext_u8(d10u8, d11u8, 5); 1577 d30u8 = vext_u8(d13u8, d14u8, 5); 1578 q9u16 = vmlal_u8(q9u16, d28u8, d5u8); 1579 q11u16 = vmlal_u8(q11u16, d29u8, d5u8); 1580 q13u16 = vmlal_u8(q13u16, d30u8, d5u8); 1581 1582 d28u8 = vext_u8(d6u8, d7u8, 2); 1583 d29u8 = vext_u8(d9u8, d10u8, 2); 1584 d30u8 = vext_u8(d12u8, d13u8, 2); 1585 q8u16 = vmlal_u8(q8u16, d28u8, d2u8); 1586 q10u16 = vmlal_u8(q10u16, d29u8, d2u8); 1587 q12u16 = vmlal_u8(q12u16, d30u8, d2u8); 1588 d28u8 = vext_u8(d7u8, d8u8, 2); 1589 d29u8 = vext_u8(d10u8, d11u8, 2); 1590 d30u8 = vext_u8(d13u8, d14u8, 2); 1591 q9u16 = vmlal_u8(q9u16, d28u8, d2u8); 1592 q11u16 = vmlal_u8(q11u16, d29u8, d2u8); 1593 q13u16 = vmlal_u8(q13u16, d30u8, d2u8); 1594 1595 d28u8 = vext_u8(d6u8, d7u8, 3); 1596 d29u8 = 
vext_u8(d9u8, d10u8, 3); 1597 d30u8 = vext_u8(d12u8, d13u8, 3); 1598 d15u8 = vext_u8(d7u8, d8u8, 3); 1599 d31u8 = vext_u8(d10u8, d11u8, 3); 1600 d6u8 = vext_u8(d13u8, d14u8, 3); 1601 q4u16 = vmull_u8(d28u8, d3u8); 1602 q5u16 = vmull_u8(d29u8, d3u8); 1603 q6u16 = vmull_u8(d30u8, d3u8); 1604 q4s16 = vreinterpretq_s16_u16(q4u16); 1605 q5s16 = vreinterpretq_s16_u16(q5u16); 1606 q6s16 = vreinterpretq_s16_u16(q6u16); 1607 q8s16 = vreinterpretq_s16_u16(q8u16); 1608 q10s16 = vreinterpretq_s16_u16(q10u16); 1609 q12s16 = vreinterpretq_s16_u16(q12u16); 1610 q8s16 = vqaddq_s16(q8s16, q4s16); 1611 q10s16 = vqaddq_s16(q10s16, q5s16); 1612 q12s16 = vqaddq_s16(q12s16, q6s16); 1613 1614 q6u16 = vmull_u8(d15u8, d3u8); 1615 q7u16 = vmull_u8(d31u8, d3u8); 1616 q3u16 = vmull_u8(d6u8, d3u8); 1617 q3s16 = vreinterpretq_s16_u16(q3u16); 1618 q6s16 = vreinterpretq_s16_u16(q6u16); 1619 q7s16 = vreinterpretq_s16_u16(q7u16); 1620 q9s16 = vreinterpretq_s16_u16(q9u16); 1621 q11s16 = vreinterpretq_s16_u16(q11u16); 1622 q13s16 = vreinterpretq_s16_u16(q13u16); 1623 q9s16 = vqaddq_s16(q9s16, q6s16); 1624 q11s16 = vqaddq_s16(q11s16, q7s16); 1625 q13s16 = vqaddq_s16(q13s16, q3s16); 1626 1627 d6u8 = vqrshrun_n_s16(q8s16, 7); 1628 d7u8 = vqrshrun_n_s16(q9s16, 7); 1629 d8u8 = vqrshrun_n_s16(q10s16, 7); 1630 d9u8 = vqrshrun_n_s16(q11s16, 7); 1631 d10u8 = vqrshrun_n_s16(q12s16, 7); 1632 d11u8 = vqrshrun_n_s16(q13s16, 7); 1633 1634 vst1_u8(tmpp, d6u8); 1635 tmpp += 8; 1636 vst1_u8(tmpp, d7u8); 1637 tmpp += 8; 1638 vst1_u8(tmpp, d8u8); 1639 tmpp += 8; 1640 vst1_u8(tmpp, d9u8); 1641 tmpp += 8; 1642 vst1_u8(tmpp, d10u8); 1643 tmpp += 8; 1644 vst1_u8(tmpp, d11u8); 1645 tmpp += 8; 1646 } 1647 1648 // Second pass: 16x16 1649 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); 1650 d0s8 = vdup_lane_s8(dtmps8, 0); 1651 d1s8 = vdup_lane_s8(dtmps8, 1); 1652 d2s8 = vdup_lane_s8(dtmps8, 2); 1653 d3s8 = vdup_lane_s8(dtmps8, 3); 1654 d4s8 = vdup_lane_s8(dtmps8, 4); 1655 d5s8 = vdup_lane_s8(dtmps8, 5); 1656 d0u8 = 
vreinterpret_u8_s8(vabs_s8(d0s8)); 1657 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); 1658 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); 1659 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); 1660 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); 1661 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); 1662 1663 for (i = 0; i < 2; i++) { 1664 dst = dst_ptr + 8 * i; 1665 tmpp = tmp + 8 * i; 1666 d18u8 = vld1_u8(tmpp); 1667 tmpp += 16; 1668 d19u8 = vld1_u8(tmpp); 1669 tmpp += 16; 1670 d20u8 = vld1_u8(tmpp); 1671 tmpp += 16; 1672 d21u8 = vld1_u8(tmpp); 1673 tmpp += 16; 1674 d22u8 = vld1_u8(tmpp); 1675 tmpp += 16; 1676 for (j = 0; j < 4; j++) { 1677 d23u8 = vld1_u8(tmpp); 1678 tmpp += 16; 1679 d24u8 = vld1_u8(tmpp); 1680 tmpp += 16; 1681 d25u8 = vld1_u8(tmpp); 1682 tmpp += 16; 1683 d26u8 = vld1_u8(tmpp); 1684 tmpp += 16; 1685 1686 q3u16 = vmull_u8(d18u8, d0u8); 1687 q4u16 = vmull_u8(d19u8, d0u8); 1688 q5u16 = vmull_u8(d20u8, d0u8); 1689 q6u16 = vmull_u8(d21u8, d0u8); 1690 1691 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); 1692 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); 1693 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); 1694 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); 1695 1696 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); 1697 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); 1698 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); 1699 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); 1700 1701 q3u16 = vmlal_u8(q3u16, d20u8, d2u8); 1702 q4u16 = vmlal_u8(q4u16, d21u8, d2u8); 1703 q5u16 = vmlal_u8(q5u16, d22u8, d2u8); 1704 q6u16 = vmlal_u8(q6u16, d23u8, d2u8); 1705 1706 q3u16 = vmlal_u8(q3u16, d23u8, d5u8); 1707 q4u16 = vmlal_u8(q4u16, d24u8, d5u8); 1708 q5u16 = vmlal_u8(q5u16, d25u8, d5u8); 1709 q6u16 = vmlal_u8(q6u16, d26u8, d5u8); 1710 1711 q7u16 = vmull_u8(d21u8, d3u8); 1712 q8u16 = vmull_u8(d22u8, d3u8); 1713 q9u16 = vmull_u8(d23u8, d3u8); 1714 q10u16 = vmull_u8(d24u8, d3u8); 1715 1716 q3s16 = vreinterpretq_s16_u16(q3u16); 1717 q4s16 = vreinterpretq_s16_u16(q4u16); 1718 q5s16 = vreinterpretq_s16_u16(q5u16); 1719 q6s16 = vreinterpretq_s16_u16(q6u16); 1720 q7s16 = 
vreinterpretq_s16_u16(q7u16); 1721 q8s16 = vreinterpretq_s16_u16(q8u16); 1722 q9s16 = vreinterpretq_s16_u16(q9u16); 1723 q10s16 = vreinterpretq_s16_u16(q10u16); 1724 1725 q7s16 = vqaddq_s16(q7s16, q3s16); 1726 q8s16 = vqaddq_s16(q8s16, q4s16); 1727 q9s16 = vqaddq_s16(q9s16, q5s16); 1728 q10s16 = vqaddq_s16(q10s16, q6s16); 1729 1730 d6u8 = vqrshrun_n_s16(q7s16, 7); 1731 d7u8 = vqrshrun_n_s16(q8s16, 7); 1732 d8u8 = vqrshrun_n_s16(q9s16, 7); 1733 d9u8 = vqrshrun_n_s16(q10s16, 7); 1734 1735 d18u8 = d22u8; 1736 d19u8 = d23u8; 1737 d20u8 = d24u8; 1738 d21u8 = d25u8; 1739 d22u8 = d26u8; 1740 1741 vst1_u8(dst, d6u8); 1742 dst += dst_pitch; 1743 vst1_u8(dst, d7u8); 1744 dst += dst_pitch; 1745 vst1_u8(dst, d8u8); 1746 dst += dst_pitch; 1747 vst1_u8(dst, d9u8); 1748 dst += dst_pitch; 1749 } 1750 } 1751 return; 1752 } 1753