/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/transpose_neon.h"

// For all the static inline functions, the functions ending with '_8' process
// 8 samples in a bunch, and the functions ending with '_16' process 16 samples
// in a bunch.

// Broadcast the three scalar filter thresholds (blimit, limit, thresh) into
// NEON vectors of width w lanes. The r parameter selects the plain ('_') or
// quad ('q_') intrinsic spelling.
#define FUN_LOAD_THRESH(w, r)                                             \
  static INLINE void load_thresh_##w(                                     \
      const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \
      uint8x##w##_t *blimit_vec, uint8x##w##_t *limit_vec,                \
      uint8x##w##_t *thresh_vec) {                                        \
    *blimit_vec = vld1##r##dup_u8(blimit);                                \
    *limit_vec = vld1##r##dup_u8(limit);                                  \
    *thresh_vec = vld1##r##dup_u8(thresh);                                \
  }

FUN_LOAD_THRESH(8, _)    // load_thresh_8
FUN_LOAD_THRESH(16, q_)  // load_thresh_16
#undef FUN_LOAD_THRESH

// Load two independent threshold sets (for two adjacent 8-pixel edges) into
// the low and high halves of 128-bit vectors, so the "dual" filters can
// process both edges in a single pass.
static INLINE void load_thresh_8_dual(
    const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
    const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,
    uint8x16_t *blimit_vec, uint8x16_t *limit_vec, uint8x16_t *thresh_vec) {
  *blimit_vec = vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1));
  *limit_vec = vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1));
  *thresh_vec = vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1));
}

// Here flat is 64-bit long, with each 8-bit (or 4-bit) chunk being a mask of a
// pixel. When used to control filter branches, we only detect whether it is all
// 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status.
// flat equals 0 if and only if flat_status equals 0.
// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true
// because each mask occupies more than 1 bit.)
static INLINE uint32_t calc_flat_status_8(uint8x8_t flat) {
  return vget_lane_u32(
      vreinterpret_u32_u64(vpaddl_u32(vreinterpret_u32_u8(flat))), 0);
}

// Here flat is 128-bit long, with each 8-bit chunk being a mask of a pixel.
// When used to control filter branches, we only detect whether it is all 0s or
// all 1s. We narrowing shift right each 16-bit chunk by 4 arithmetically, so
// we get a 64-bit long number, with each 4-bit chunk being a mask of a pixel.
// Then we pairwise add flat to a 32-bit long number flat_status.
// flat equals 0 if and only if flat_status equals 0.
// flat equals -1 (all 1s) if and only if flat_status equals -2. (This is true
// because each mask occupies more than 1 bit.)
static INLINE uint32_t calc_flat_status_16(uint8x16_t flat) {
  const uint8x8_t flat_4bit =
      vreinterpret_u8_s8(vshrn_n_s16(vreinterpretq_s16_u8(flat), 4));
  return calc_flat_status_8(flat_4bit);
}

// Compute the high-edge-variance mask (*hev: max(|p1-p0|, |q1-q0|) > thresh)
// and the filter on/off mask (*mask: inner deltas <= limit AND
// 2*|p0-q0| + |p1-q1|/2 <= blimit, with saturating adds). Returns
// max(|p1-p0|, |q1-q0|) so callers can reuse it when building the flat mask.
#define FUN_FILTER_HEV_MASK4(w, r)                                            \
  static INLINE uint8x##w##_t filter_hev_mask4_##w(                           \
      const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *hev, uint8x##w##_t *mask) {      \
    uint8x##w##_t max, t0, t1;                                                \
                                                                              \
    max = vabd##r##u8(p1, p0);                                                \
    max = vmax##r##u8(max, vabd##r##u8(q1, q0));                              \
    *hev = vcgt##r##u8(max, thresh);                                          \
    *mask = vmax##r##u8(max, vabd##r##u8(p3, p2));                            \
    *mask = vmax##r##u8(*mask, vabd##r##u8(p2, p1));                          \
    *mask = vmax##r##u8(*mask, vabd##r##u8(q2, q1));                          \
    *mask = vmax##r##u8(*mask, vabd##r##u8(q3, q2));                          \
    t0 = vabd##r##u8(p0, q0);                                                 \
    t1 = vabd##r##u8(p1, q1);                                                 \
    t0 = vqadd##r##u8(t0, t0);                                                \
    t1 = vshr##r##n_u8(t1, 1);                                                \
    t0 = vqadd##r##u8(t0, t1);                                                \
    *mask = vcle##r##u8(*mask, limit);                                        \
    t0 = vcle##r##u8(t0, blimit);                                             \
    *mask = vand##r##u8(*mask, t0);                                          \
                                                                              \
    return max;                                                               \
  }

FUN_FILTER_HEV_MASK4(8, _)    // filter_hev_mask4_8
FUN_FILTER_HEV_MASK4(16, q_)  // filter_hev_mask4_16
#undef FUN_FILTER_HEV_MASK4

// Extends filter_hev_mask4_##w with flat_mask4(): *flat marks pixels whose
// p3..q3 neighborhood deviates from p0/q0 by at most 1 (candidates for the
// 7-tap filter), ANDed with the filter mask. *flat_status summarizes *flat
// (see calc_flat_status_*). Returns the filter mask.
#define FUN_FILTER_FLAT_HEV_MASK(w, r)                                        \
  static INLINE uint8x##w##_t filter_flat_hev_mask_##w(                       \
      const uint8x##w##_t limit, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t thresh, const uint8x##w##_t p3,                     \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *flat, uint32_t *flat_status,     \
      uint8x##w##_t *hev) {                                                   \
    uint8x##w##_t max, mask;                                                  \
                                                                              \
    max = filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, \
                               q2, q3, hev, &mask);                           \
    *flat = vmax##r##u8(max, vabd##r##u8(p2, p0));                            \
    *flat = vmax##r##u8(*flat, vabd##r##u8(q2, q0));                          \
    *flat = vmax##r##u8(*flat, vabd##r##u8(p3, p0));                          \
    *flat = vmax##r##u8(*flat, vabd##r##u8(q3, q0));                          \
    *flat = vcle##r##u8(*flat, vdup##r##n_u8(1)); /* flat_mask4() */          \
    *flat = vand##r##u8(*flat, mask);                                         \
    *flat_status = calc_flat_status_##w(*flat);                               \
                                                                              \
    return mask;                                                              \
  }

FUN_FILTER_FLAT_HEV_MASK(8, _)    // filter_flat_hev_mask_8
FUN_FILTER_FLAT_HEV_MASK(16, q_)  // filter_flat_hev_mask_16
#undef FUN_FILTER_FLAT_HEV_MASK

// flat_mask5(): marks pixels whose wider p4..q4 neighborhood deviates from
// p0/q0 by at most 1 (candidates for the 15-tap filter), ANDed with the
// narrower flat mask. *flat2_status summarizes the result.
#define FUN_FLAT_MASK5(w, r)                                                  \
  static INLINE uint8x##w##_t flat_mask5_##w(                                 \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \
      const uint8x##w##_t q4, const uint8x##w##_t flat,                       \
      uint32_t *flat2_status) {                                               \
    uint8x##w##_t flat2 = vabd##r##u8(p4, p0);                                \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(p3, p0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(p2, p0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(p1, p0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q1, q0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q2, q0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q3, q0));                          \
    flat2 = vmax##r##u8(flat2, vabd##r##u8(q4, q0));                          \
    flat2 = vcle##r##u8(flat2, vdup##r##n_u8(1));                             \
    flat2 = vand##r##u8(flat2, flat);                                         \
    *flat2_status = calc_flat_status_##w(flat2);                              \
                                                                              \
    return flat2;                                                             \
  }

FUN_FLAT_MASK5(8, _)    // flat_mask5_8
FUN_FLAT_MASK5(16, q_)  // flat_mask5_16
#undef FUN_FLAT_MASK5

// Convert unsigned pixels to signed values by XORing the sign bit, i.e.
// mapping [0, 255] to [-128, 127] for the signed filter arithmetic.
#define FUN_FLIP_SIGN(w, r)                                           \
  static INLINE int8x##w##_t flip_sign_##w(const uint8x##w##_t v) {   \
    const uint8x##w##_t sign_bit = vdup##r##n_u8(0x80);               \
    return vreinterpret##r##s8_u8(veor##r##u8(v, sign_bit));          \
  }

FUN_FLIP_SIGN(8, _)    // flip_sign_8
FUN_FLIP_SIGN(16, q_)  // flip_sign_16
#undef FUN_FLIP_SIGN

// Inverse of flip_sign_##w: map signed filter results back to unsigned pixels.
#define FUN_FLIP_SIGN_BACK(w, r)                                         \
  static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \
    const int8x##w##_t sign_bit = vdup##r##n_s8(0x80);                   \
    return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit));             \
  }

FUN_FLIP_SIGN_BACK(8, _)    // flip_sign_back_8
FUN_FLIP_SIGN_BACK(16, q_)  // flip_sign_back_16
#undef FUN_FLIP_SIGN_BACK

// Slide the running filter window: drop two old taps (sub0, sub1) from the
// widened sum and add two new taps (add0, add1).
static INLINE void filter_update_8(const uint8x8_t sub0, const uint8x8_t sub1,
                                   const uint8x8_t add0, const uint8x8_t add1,
                                   uint16x8_t *sum) {
  *sum = vsubw_u8(*sum, sub0);
  *sum = vsubw_u8(*sum, sub1);
  *sum = vaddw_u8(*sum, add0);
  *sum = vaddw_u8(*sum, add1);
}

// 16-lane variant of filter_update_8; the widened sum is kept in two
// uint16x8_t halves (sum0 = low 8 lanes, sum1 = high 8 lanes).
static INLINE void filter_update_16(const uint8x16_t sub0,
                                    const uint8x16_t sub1,
                                    const uint8x16_t add0,
                                    const uint8x16_t add1, uint16x8_t *sum0,
                                    uint16x8_t *sum1) {
  *sum0 = vsubw_u8(*sum0, vget_low_u8(sub0));
  *sum1 = vsubw_u8(*sum1, vget_high_u8(sub0));
  *sum0 = vsubw_u8(*sum0, vget_low_u8(sub1));
  *sum1 = vsubw_u8(*sum1, vget_high_u8(sub1));
  *sum0 = vaddw_u8(*sum0, vget_low_u8(add0));
  *sum1 = vaddw_u8(*sum1, vget_high_u8(add0));
  *sum0 = vaddw_u8(*sum0, vget_low_u8(add1));
  *sum1 = vaddw_u8(*sum1, vget_high_u8(add1));
}

// Advance the 7-tap running sum and narrow with rounding shift by 3
// (i.e. (sum + 4) >> 3).
static INLINE uint8x8_t calc_7_tap_filter_8_kernel(const uint8x8_t sub0,
                                                   const uint8x8_t sub1,
                                                   const uint8x8_t add0,
                                                   const uint8x8_t add1,
                                                   uint16x8_t *sum) {
  filter_update_8(sub0, sub1, add0, add1, sum);
  return vrshrn_n_u16(*sum, 3);
}

// 16-lane variant of calc_7_tap_filter_8_kernel.
static INLINE uint8x16_t calc_7_tap_filter_16_kernel(
    const uint8x16_t sub0, const uint8x16_t sub1, const uint8x16_t add0,
    const uint8x16_t add1, uint16x8_t *sum0, uint16x8_t *sum1) {
  filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
  return vcombine_u8(vrshrn_n_u16(*sum0, 3), vrshrn_n_u16(*sum1, 3));
}

// Advance the 15-tap running sum, narrow with rounding shift by 4, and keep
// the filtered value only in lanes where flat is set (else keep 'in').
static INLINE uint8x8_t apply_15_tap_filter_8_kernel(
    const uint8x8_t flat, const uint8x8_t sub0, const uint8x8_t sub1,
    const uint8x8_t add0, const uint8x8_t add1, const uint8x8_t in,
    uint16x8_t *sum) {
  filter_update_8(sub0, sub1, add0, add1, sum);
  return vbsl_u8(flat, vrshrn_n_u16(*sum, 4), in);
}

// 16-lane variant of apply_15_tap_filter_8_kernel.
static INLINE uint8x16_t apply_15_tap_filter_16_kernel(
    const uint8x16_t flat, const uint8x16_t sub0, const uint8x16_t sub1,
    const uint8x16_t add0, const uint8x16_t add1, const uint8x16_t in,
    uint16x8_t *sum0, uint16x8_t *sum1) {
  uint8x16_t t;
  filter_update_16(sub0, sub1, add0, add1, sum0, sum1);
  t = vcombine_u8(vrshrn_n_u16(*sum0, 4), vrshrn_n_u16(*sum1, 4));
  return vbslq_u8(flat, t, in);
}

// 7-tap filter [1, 1, 1, 2, 1, 1, 1]
// The initial sum is built once for op2; each following output reuses it as a
// sliding window via calc_7_tap_filter_8_kernel.
static INLINE void calc_7_tap_filter_8(const uint8x8_t p3, const uint8x8_t p2,
                                       const uint8x8_t p1, const uint8x8_t p0,
                                       const uint8x8_t q0, const uint8x8_t q1,
                                       const uint8x8_t q2, const uint8x8_t q3,
                                       uint8x8_t *op2, uint8x8_t *op1,
                                       uint8x8_t *op0, uint8x8_t *oq0,
                                       uint8x8_t *oq1, uint8x8_t *oq2) {
  uint16x8_t sum;
  sum = vaddl_u8(p3, p3);   // 2*p3
  sum = vaddw_u8(sum, p3);  // 3*p3
  sum = vaddw_u8(sum, p2);  // 3*p3+p2
  sum = vaddw_u8(sum, p2);  // 3*p3+2*p2
  sum = vaddw_u8(sum, p1);  // 3*p3+2*p2+p1
  sum = vaddw_u8(sum, p0);  // 3*p3+2*p2+p1+p0
  sum = vaddw_u8(sum, q0);  // 3*p3+2*p2+p1+p0+q0
  *op2 = vrshrn_n_u16(sum, 3);
  *op1 = calc_7_tap_filter_8_kernel(p3, p2, p1, q1, &sum);
  *op0 = calc_7_tap_filter_8_kernel(p3, p1, p0, q2, &sum);
  *oq0 = calc_7_tap_filter_8_kernel(p3, p0, q0, q3, &sum);
  *oq1 = calc_7_tap_filter_8_kernel(p2, q0, q1, q3, &sum);
  *oq2 = calc_7_tap_filter_8_kernel(p1, q1, q2, q3, &sum);
}

// 16-lane variant of calc_7_tap_filter_8; the widened sum is split into two
// uint16x8_t halves.
static INLINE void calc_7_tap_filter_16(
    const uint8x16_t p3, const uint8x16_t p2, const uint8x16_t p1,
    const uint8x16_t p0, const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t q2, const uint8x16_t q3, uint8x16_t *op2, uint8x16_t *op1,
    uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2) {
  uint16x8_t sum0, sum1;
  sum0 = vaddl_u8(vget_low_u8(p3), vget_low_u8(p3));    // 2*p3
  sum1 = vaddl_u8(vget_high_u8(p3), vget_high_u8(p3));  // 2*p3
  sum0 = vaddw_u8(sum0, vget_low_u8(p3));               // 3*p3
  sum1 = vaddw_u8(sum1, vget_high_u8(p3));              // 3*p3
  sum0 = vaddw_u8(sum0, vget_low_u8(p2));               // 3*p3+p2
  sum1 = vaddw_u8(sum1, vget_high_u8(p2));              // 3*p3+p2
  sum0 = vaddw_u8(sum0, vget_low_u8(p2));               // 3*p3+2*p2
  sum1 = vaddw_u8(sum1, vget_high_u8(p2));              // 3*p3+2*p2
  sum0 = vaddw_u8(sum0, vget_low_u8(p1));               // 3*p3+2*p2+p1
  sum1 = vaddw_u8(sum1, vget_high_u8(p1));              // 3*p3+2*p2+p1
  sum0 = vaddw_u8(sum0, vget_low_u8(p0));               // 3*p3+2*p2+p1+p0
  sum1 = vaddw_u8(sum1, vget_high_u8(p0));              // 3*p3+2*p2+p1+p0
  sum0 = vaddw_u8(sum0, vget_low_u8(q0));               // 3*p3+2*p2+p1+p0+q0
  sum1 = vaddw_u8(sum1, vget_high_u8(q0));              // 3*p3+2*p2+p1+p0+q0
  *op2 = vcombine_u8(vrshrn_n_u16(sum0, 3), vrshrn_n_u16(sum1, 3));
  *op1 = calc_7_tap_filter_16_kernel(p3, p2, p1, q1, &sum0, &sum1);
  *op0 = calc_7_tap_filter_16_kernel(p3, p1, p0, q2, &sum0, &sum1);
  *oq0 = calc_7_tap_filter_16_kernel(p3, p0, q0, q3, &sum0, &sum1);
  *oq1 = calc_7_tap_filter_16_kernel(p2, q0, q1, q3, &sum0, &sum1);
  *oq2 = calc_7_tap_filter_16_kernel(p1, q1, q2, q3, &sum0, &sum1);
}

// Run the 7-tap filter and blend: output lanes take the 7-tap result where
// flat is set, otherwise they keep the incoming values (p2/q2 for op2/oq2,
// the previously filtered op1/op0/oq0/oq1 elsewhere).
#define FUN_APPLY_7_TAP_FILTER(w, r)                                          \
  static INLINE void apply_7_tap_filter_##w(                                  \
      const uint8x##w##_t flat, const uint8x##w##_t p3,                       \
      const uint8x##w##_t p2, const uint8x##w##_t p1, const uint8x##w##_t p0, \
      const uint8x##w##_t q0, const uint8x##w##_t q1, const uint8x##w##_t q2, \
      const uint8x##w##_t q3, uint8x##w##_t *op2, uint8x##w##_t *op1,         \
      uint8x##w##_t *op0, uint8x##w##_t *oq0, uint8x##w##_t *oq1,             \
      uint8x##w##_t *oq2) {                                                   \
    uint8x##w##_t tp1, tp0, tq0, tq1;                                         \
    calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0,    \
                          &tq0, &tq1, oq2);                                   \
    *op2 = vbsl##r##u8(flat, *op2, p2);                                       \
    *op1 = vbsl##r##u8(flat, tp1, *op1);                                      \
    *op0 = vbsl##r##u8(flat, tp0, *op0);                                      \
    *oq0 = vbsl##r##u8(flat, tq0, *oq0);                                      \
    *oq1 = vbsl##r##u8(flat, tq1, *oq1);                                      \
    *oq2 = vbsl##r##u8(flat, *oq2, q2);                                       \
  }

FUN_APPLY_7_TAP_FILTER(8, _)    // apply_7_tap_filter_8
FUN_APPLY_7_TAP_FILTER(16, q_)  // apply_7_tap_filter_16
#undef FUN_APPLY_7_TAP_FILTER

// 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
// The initial sum is built once for op6; each following output reuses it as a
// sliding window via apply_15_tap_filter_8_kernel, which also blends against
// flat2.
static INLINE void apply_15_tap_filter_8(
    const uint8x8_t flat2, const uint8x8_t p7, const uint8x8_t p6,
    const uint8x8_t p5, const uint8x8_t p4, const uint8x8_t p3,
    const uint8x8_t p2, const uint8x8_t p1, const uint8x8_t p0,
    const uint8x8_t q0, const uint8x8_t q1, const uint8x8_t q2,
    const uint8x8_t q3, const uint8x8_t q4, const uint8x8_t q5,
    const uint8x8_t q6, const uint8x8_t q7, uint8x8_t *op6, uint8x8_t *op5,
    uint8x8_t *op4, uint8x8_t *op3, uint8x8_t *op2, uint8x8_t *op1,
    uint8x8_t *op0, uint8x8_t *oq0, uint8x8_t *oq1, uint8x8_t *oq2,
    uint8x8_t *oq3, uint8x8_t *oq4, uint8x8_t *oq5, uint8x8_t *oq6) {
  uint16x8_t sum;
  sum = vshll_n_u8(p7, 3);  // 8*p7
  sum = vsubw_u8(sum, p7);  // 7*p7
  sum = vaddw_u8(sum, p6);  // 7*p7+p6
  sum = vaddw_u8(sum, p6);  // 7*p7+2*p6
  sum = vaddw_u8(sum, p5);  // 7*p7+2*p6+p5
  sum = vaddw_u8(sum, p4);  // 7*p7+2*p6+p5+p4
  sum = vaddw_u8(sum, p3);  // 7*p7+2*p6+p5+p4+p3
  sum = vaddw_u8(sum, p2);  // 7*p7+2*p6+p5+p4+p3+p2
  sum = vaddw_u8(sum, p1);  // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum = vaddw_u8(sum, p0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum = vaddw_u8(sum, q0);  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  *op6 = vbsl_u8(flat2, vrshrn_n_u16(sum, 4), p6);
  *op5 = apply_15_tap_filter_8_kernel(flat2, p7, p6, p5, q1, p5, &sum);
  *op4 = apply_15_tap_filter_8_kernel(flat2, p7, p5, p4, q2, p4, &sum);
  *op3 = apply_15_tap_filter_8_kernel(flat2, p7, p4, p3, q3, p3, &sum);
  *op2 = apply_15_tap_filter_8_kernel(flat2, p7, p3, p2, q4, *op2, &sum);
  *op1 = apply_15_tap_filter_8_kernel(flat2, p7, p2, p1, q5, *op1, &sum);
  *op0 = apply_15_tap_filter_8_kernel(flat2, p7, p1, p0, q6, *op0, &sum);
  *oq0 = apply_15_tap_filter_8_kernel(flat2, p7, p0, q0, q7, *oq0, &sum);
  *oq1 = apply_15_tap_filter_8_kernel(flat2, p6, q0, q1, q7, *oq1, &sum);
  *oq2 = apply_15_tap_filter_8_kernel(flat2, p5, q1, q2, q7, *oq2, &sum);
  *oq3 = apply_15_tap_filter_8_kernel(flat2, p4, q2, q3, q7, q3, &sum);
  *oq4 = apply_15_tap_filter_8_kernel(flat2, p3, q3, q4, q7, q4, &sum);
  *oq5 = apply_15_tap_filter_8_kernel(flat2, p2, q4, q5, q7, q5, &sum);
  *oq6 = apply_15_tap_filter_8_kernel(flat2, p1, q5, q6, q7, q6, &sum);
}

// 16-lane variant of apply_15_tap_filter_8; the widened sum is split into two
// uint16x8_t halves.
static INLINE void apply_15_tap_filter_16(
    const uint8x16_t flat2, const uint8x16_t p7, const uint8x16_t p6,
    const uint8x16_t p5, const uint8x16_t p4, const uint8x16_t p3,
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t q3, const uint8x16_t q4, const uint8x16_t q5,
    const uint8x16_t q6, const uint8x16_t q7, uint8x16_t *op6, uint8x16_t *op5,
    uint8x16_t *op4, uint8x16_t *op3, uint8x16_t *op2, uint8x16_t *op1,
    uint8x16_t *op0, uint8x16_t *oq0, uint8x16_t *oq1, uint8x16_t *oq2,
    uint8x16_t *oq3, uint8x16_t *oq4, uint8x16_t *oq5, uint8x16_t *oq6) {
  uint16x8_t sum0, sum1;
  uint8x16_t t;
  sum0 = vshll_n_u8(vget_low_u8(p7), 3);    // 8*p7
  sum1 = vshll_n_u8(vget_high_u8(p7), 3);   // 8*p7
  sum0 = vsubw_u8(sum0, vget_low_u8(p7));   // 7*p7
  sum1 = vsubw_u8(sum1, vget_high_u8(p7));  // 7*p7
  sum0 = vaddw_u8(sum0, vget_low_u8(p6));   // 7*p7+p6
  sum1 = vaddw_u8(sum1, vget_high_u8(p6));  // 7*p7+p6
  sum0 = vaddw_u8(sum0, vget_low_u8(p6));   // 7*p7+2*p6
  sum1 = vaddw_u8(sum1, vget_high_u8(p6));  // 7*p7+2*p6
  sum0 = vaddw_u8(sum0, vget_low_u8(p5));   // 7*p7+2*p6+p5
  sum1 = vaddw_u8(sum1, vget_high_u8(p5));  // 7*p7+2*p6+p5
  sum0 = vaddw_u8(sum0, vget_low_u8(p4));   // 7*p7+2*p6+p5+p4
  sum1 = vaddw_u8(sum1, vget_high_u8(p4));  // 7*p7+2*p6+p5+p4
  sum0 = vaddw_u8(sum0, vget_low_u8(p3));   // 7*p7+2*p6+p5+p4+p3
  sum1 = vaddw_u8(sum1, vget_high_u8(p3));  // 7*p7+2*p6+p5+p4+p3
  sum0 = vaddw_u8(sum0, vget_low_u8(p2));   // 7*p7+2*p6+p5+p4+p3+p2
  sum1 = vaddw_u8(sum1, vget_high_u8(p2));  // 7*p7+2*p6+p5+p4+p3+p2
  sum0 = vaddw_u8(sum0, vget_low_u8(p1));   // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum1 = vaddw_u8(sum1, vget_high_u8(p1));  // 7*p7+2*p6+p5+p4+p3+p2+p1
  sum0 = vaddw_u8(sum0, vget_low_u8(p0));   // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum1 = vaddw_u8(sum1, vget_high_u8(p0));  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0
  sum0 = vaddw_u8(sum0, vget_low_u8(q0));   // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  sum1 = vaddw_u8(sum1, vget_high_u8(q0));  // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0
  t = vcombine_u8(vrshrn_n_u16(sum0, 4), vrshrn_n_u16(sum1, 4));
  *op6 = vbslq_u8(flat2, t, p6);
  *op5 = apply_15_tap_filter_16_kernel(flat2, p7, p6, p5, q1, p5, &sum0, &sum1);
  *op4 = apply_15_tap_filter_16_kernel(flat2, p7, p5, p4, q2, p4, &sum0, &sum1);
  *op3 = apply_15_tap_filter_16_kernel(flat2, p7, p4, p3, q3, p3, &sum0, &sum1);
  *op2 =
      apply_15_tap_filter_16_kernel(flat2, p7, p3, p2, q4, *op2, &sum0, &sum1);
  *op1 =
      apply_15_tap_filter_16_kernel(flat2, p7, p2, p1, q5, *op1, &sum0, &sum1);
  *op0 =
      apply_15_tap_filter_16_kernel(flat2, p7, p1, p0, q6, *op0, &sum0, &sum1);
  *oq0 =
      apply_15_tap_filter_16_kernel(flat2, p7, p0, q0, q7, *oq0, &sum0, &sum1);
  *oq1 =
      apply_15_tap_filter_16_kernel(flat2, p6, q0, q1, q7, *oq1, &sum0, &sum1);
  *oq2 =
      apply_15_tap_filter_16_kernel(flat2, p5, q1, q2, q7, *oq2, &sum0, &sum1);
  *oq3 = apply_15_tap_filter_16_kernel(flat2, p4, q2, q3, q7, q3, &sum0, &sum1);
  *oq4 = apply_15_tap_filter_16_kernel(flat2, p3, q3, q4, q7, q4, &sum0, &sum1);
  *oq5 = apply_15_tap_filter_16_kernel(flat2, p2, q4, q5, q7, q5, &sum0, &sum1);
  *oq6 = apply_15_tap_filter_16_kernel(flat2, p1, q5, q6, q7, q6, &sum0, &sum1);
}

// The narrow 4-tap filter: operates on sign-flipped (biased) values with
// saturating arithmetic, adjusting p1/p0/q0/q1 under the filter mask, with
// the outer taps gated by the hev mask.
#define FUN_FILTER4(w, r)                                                     \
  static INLINE void filter4_##w(                                             \
      const uint8x##w##_t mask, const uint8x##w##_t hev,                      \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \
      const uint8x##w##_t q1, uint8x##w##_t *op1, uint8x##w##_t *op0,         \
      uint8x##w##_t *oq0, uint8x##w##_t *oq1) {                               \
    int8x##w##_t filter, filter1, filter2, t;                                 \
    int8x##w##_t ps1 = flip_sign_##w(p1);                                     \
    int8x##w##_t ps0 = flip_sign_##w(p0);                                     \
    int8x##w##_t qs0 = flip_sign_##w(q0);                                     \
    int8x##w##_t qs1 = flip_sign_##w(q1);                                     \
                                                                              \
    /* add outer taps if we have high edge variance */                        \
    filter = vqsub##r##s8(ps1, qs1);                                          \
    filter = vand##r##s8(filter, vreinterpret##r##s8_u8(hev));                \
    t = vqsub##r##s8(qs0, ps0);                                               \
                                                                              \
    /* inner taps */                                                          \
    filter = vqadd##r##s8(filter, t);                                         \
    filter = vqadd##r##s8(filter, t);                                         \
    filter = vqadd##r##s8(filter, t);                                         \
    filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask));               \
                                                                              \
    /* save bottom 3 bits so that we round one side +4 and the other +3 */    \
    /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \
    /* we'd round it by 3 the other way */                                    \
    filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3);       \
    filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3);       \
                                                                              \
    qs0 = vqsub##r##s8(qs0, filter1);                                         \
    ps0 = vqadd##r##s8(ps0, filter2);                                         \
    *oq0 = flip_sign_back_##w(qs0);                                           \
    *op0 = flip_sign_back_##w(ps0);                                           \
                                                                              \
    /* outer tap adjustments */                                               \
    filter = vrshr##r##n_s8(filter1, 1);                                      \
    filter = vbic##r##s8(filter, vreinterpret##r##s8_u8(hev));                \
                                                                              \
    qs1 = vqsub##r##s8(qs1, filter);                                          \
    ps1 = vqadd##r##s8(ps1, filter);                                          \
    *oq1 = flip_sign_back_##w(qs1);                                           \
    *op1 = flip_sign_back_##w(ps1);                                           \
  }

FUN_FILTER4(8, _)    // filter4_8
FUN_FILTER4(16, q_)  // filter4_16
#undef FUN_FILTER4

// 8-wide filter dispatch. flat_status == (uint32_t)-2 means every lane is
// flat (see calc_flat_status_*), so filter4 is skipped and the 7-tap filter
// runs unconditionally; otherwise filter4 runs and the 7-tap results are
// blended in only where flat is set.
#define FUN_FILTER8(w)                                                        \
  static INLINE void filter8_##w(                                             \
      const uint8x##w##_t mask, const uint8x##w##_t flat,                     \
      const uint32_t flat_status, const uint8x##w##_t hev,                    \
      const uint8x##w##_t p3, const uint8x##w##_t p2, const uint8x##w##_t p1, \
      const uint8x##w##_t p0, const uint8x##w##_t q0, const uint8x##w##_t q1, \
      const uint8x##w##_t q2, const uint8x##w##_t q3, uint8x##w##_t *op2,     \
      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,             \
      uint8x##w##_t *oq1, uint8x##w##_t *oq2) {                               \
    if (flat_status != (uint32_t)-2) {                                        \
      filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);             \
      *op2 = p2;                                                              \
      *oq2 = q2;                                                              \
      if (flat_status) {                                                      \
        apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2,     \
                               op1, op0, oq0, oq1, oq2);                      \
      }                                                                       \
    } else {                                                                  \
      calc_7_tap_filter_##w(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0,    \
                            oq0, oq1, oq2);                                   \
    }                                                                         \
  }

FUN_FILTER8(8)  // filter8_8
FUN_FILTER8(16)  // filter8_16
#undef FUN_FILTER8

// 16-wide filter dispatch. Chooses between filter4, the 7-tap filter and the
// 15-tap filter per lane. flat_status/flat2_status are the summaries from
// calc_flat_status_* (0 = no lane set, (uint32_t)-2 = all lanes set): when
// all lanes are flat, filter4 is skipped; when all flat lanes are also flat2,
// the 7-tap pass is skipped because the 15-tap pass would overwrite it.
#define FUN_FILTER16(w)                                                        \
  static INLINE void filter16_##w(                                             \
      const uint8x##w##_t mask, const uint8x##w##_t flat,                      \
      const uint32_t flat_status, const uint8x##w##_t flat2,                   \
      const uint32_t flat2_status, const uint8x##w##_t hev,                    \
      const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5,  \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
      const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5,          \
      uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2,              \
      uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0,              \
      uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3,              \
      uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6) {            \
    if (flat_status != (uint32_t)-2) {                                         \
      filter4_##w(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1);              \
    }                                                                          \
                                                                               \
    if (flat_status) {                                                         \
      *op2 = p2;                                                               \
      *oq2 = q2;                                                               \
      if (flat2_status != (uint32_t)-2) {                                      \
        apply_7_tap_filter_##w(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2,      \
                               op1, op0, oq0, oq1, oq2);                       \
      }                                                                        \
      if (flat2_status) {                                                      \
        apply_15_tap_filter_##w(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, \
                                q2, q3, q4, q5, q6, q7, op6, op5, op4, op3,    \
                                op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5,   \
                                oq6);                                          \
      }                                                                        \
    }                                                                          \
  }

FUN_FILTER16(8)   // filter16_8
FUN_FILTER16(16)  // filter16_16
#undef FUN_FILTER16

// Load 8 consecutive rows (stride p) into the p3..q3 vectors.
#define FUN_LOAD8(w, r)                                                       \
  static INLINE void load_##w##x8(                                            \
      const uint8_t *s, const int p, uint8x##w##_t *p3, uint8x##w##_t *p2,    \
      uint8x##w##_t *p1, uint8x##w##_t *p0, uint8x##w##_t *q0,                \
      uint8x##w##_t *q1, uint8x##w##_t *q2, uint8x##w##_t *q3) {              \
    *p3 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *p2 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *p1 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *p0 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *q0 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *q1 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *q2 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *q3 = vld1##r##u8(s);                                                     \
  }

FUN_LOAD8(8, _)    // load_8x8
FUN_LOAD8(16, q_)  // load_16x8
#undef FUN_LOAD8

// Load 16 consecutive rows (stride p) into s0..s15, e.g. as transpose input
// for the vertical filters.
#define FUN_LOAD16(w, r)                                                      \
  static INLINE void load_##w##x16(                                           \
      const uint8_t *s, const int p, uint8x##w##_t *s0, uint8x##w##_t *s1,    \
      uint8x##w##_t *s2, uint8x##w##_t *s3, uint8x##w##_t *s4,                \
      uint8x##w##_t *s5, uint8x##w##_t *s6, uint8x##w##_t *s7,                \
      uint8x##w##_t *s8, uint8x##w##_t *s9, uint8x##w##_t *s10,               \
      uint8x##w##_t *s11, uint8x##w##_t *s12, uint8x##w##_t *s13,             \
      uint8x##w##_t *s14, uint8x##w##_t *s15) {                               \
    *s0 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s1 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s2 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s3 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s4 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s5 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s6 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s7 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s8 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s9 = vld1##r##u8(s);                                                     \
    s += p;                                                                   \
    *s10 = vld1##r##u8(s);                                                    \
    s += p;                                                                   \
    *s11 = vld1##r##u8(s);                                                    \
    s += p;                                                                   \
    *s12 = vld1##r##u8(s);                                                    \
    s += p;                                                                   \
    *s13 = vld1##r##u8(s);                                                    \
    s += p;                                                                   \
    *s14 = vld1##r##u8(s);                                                    \
    s += p;                                                                   \
    *s15 = vld1##r##u8(s);                                                    \
  }

FUN_LOAD16(8, _)    // load_8x16
FUN_LOAD16(16, q_)  // load_16x16
#undef FUN_LOAD16

// Store 4 consecutive rows (stride p) from s0..s3.
#define FUN_STORE4(w, r)                                                       \
  static INLINE void store_##w##x4(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3) {                        \
    vst1##r##u8(s, s0);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s1);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s2);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s3);                                                        \
  }

FUN_STORE4(8, _)    // store_8x4
FUN_STORE4(16, q_)  // store_16x4
#undef FUN_STORE4

// Store 6 consecutive rows (stride p) from s0..s5.
#define FUN_STORE6(w, r)                                                       \
  static INLINE void store_##w##x6(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
      const uint8x##w##_t s5) {                                                \
    vst1##r##u8(s, s0);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s1);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s2);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s3);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s4);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s5);                                                        \
  }

FUN_STORE6(8, _)    // store_8x6
FUN_STORE6(16, q_)  // store_16x6
#undef FUN_STORE6

// Transposed store for the vertical 4-tap filter: writes the 4 filtered
// columns across 8 rows using interleaving vst4 lane stores.
static INLINE void store_4x8(uint8_t *s, const int p, const uint8x8_t p1,
                             const uint8x8_t p0, const uint8x8_t q0,
                             const uint8x8_t q1) {
  uint8x8x4_t o;

  o.val[0] = p1;
  o.val[1] = p0;
  o.val[2] = q0;
  o.val[3] = q1;
  vst4_lane_u8(s, o, 0);
  s += p;
  vst4_lane_u8(s, o, 1);
  s += p;
  vst4_lane_u8(s, o, 2);
  s += p;
  vst4_lane_u8(s, o, 3);
  s += p;
  vst4_lane_u8(s, o, 4);
  s += p;
  vst4_lane_u8(s, o, 5);
  s += p;
  vst4_lane_u8(s, o, 6);
  s += p;
  vst4_lane_u8(s, o, 7);
}

// Transposed store of 6 columns across 8 rows: 3 columns (s0..s2) left of the
// edge at s - 3 and 3 columns (s3..s5) starting at s, via vst3 lane stores.
static INLINE void store_6x8(uint8_t *s, const int p, const uint8x8_t s0,
                             const uint8x8_t s1, const uint8x8_t s2,
                             const uint8x8_t s3, const uint8x8_t s4,
                             const uint8x8_t s5) {
  uint8x8x3_t o0, o1;

  o0.val[0] = s0;
  o0.val[1] = s1;
  o0.val[2] = s2;
  o1.val[0] = s3;
  o1.val[1] = s4;
  o1.val[2] = s5;
  vst3_lane_u8(s - 3, o0, 0);
  vst3_lane_u8(s + 0, o1, 0);
  s += p;
  vst3_lane_u8(s - 3, o0, 1);
  vst3_lane_u8(s + 0, o1, 1);
  s += p;
  vst3_lane_u8(s - 3, o0, 2);
  vst3_lane_u8(s + 0, o1, 2);
  s += p;
  vst3_lane_u8(s - 3, o0, 3);
  vst3_lane_u8(s + 0, o1, 3);
  s += p;
  vst3_lane_u8(s - 3, o0, 4);
  vst3_lane_u8(s + 0, o1, 4);
  s += p;
  vst3_lane_u8(s - 3, o0, 5);
  vst3_lane_u8(s + 0, o1, 5);
  s += p;
  vst3_lane_u8(s - 3, o0, 6);
  vst3_lane_u8(s + 0, o1, 6);
  s += p;
  vst3_lane_u8(s - 3, o0, 7);
  vst3_lane_u8(s + 0, o1, 7);
}

// Store 8 consecutive rows (stride p) from s0..s7.
#define FUN_STORE8(w, r)                                                       \
  static INLINE void store_##w##x8(                                            \
      uint8_t *s, const int p, const uint8x##w##_t s0, const uint8x##w##_t s1, \
      const uint8x##w##_t s2, const uint8x##w##_t s3, const uint8x##w##_t s4,  \
      const uint8x##w##_t s5, const uint8x##w##_t s6,                          \
      const uint8x##w##_t s7) {                                                \
    vst1##r##u8(s, s0);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s1);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s2);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s3);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s4);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s5);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s6);                                                        \
    s += p;                                                                    \
    vst1##r##u8(s, s7);                                                        \
  }

FUN_STORE8(8, _)    // store_8x8
FUN_STORE8(16, q_)  // store_16x8
#undef FUN_STORE8

// Store up to 14 rows around the edge at s, writing only the rows the filter
// actually changed: p1..q1 always, p2/q2 when any lane was flat, and
// p6..p3/q3..q6 when any lane was flat2.
#define FUN_STORE14(w, r)                                                      \
  static INLINE void store_##w##x14(                                           \
      uint8_t *s, const int p, const uint8x##w##_t p6, const uint8x##w##_t p5, \
      const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2,  \
      const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0,  \
      const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3,  \
      const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6,  \
      const uint32_t flat_status, const uint32_t flat2_status) {               \
    if (flat_status) {                                                         \
      if (flat2_status) {                                                      \
        vst1##r##u8(s - 7 * p, p6);                                            \
        vst1##r##u8(s - 6 * p, p5);                                            \
        vst1##r##u8(s - 5 * p, p4);                                            \
        vst1##r##u8(s - 4 * p, p3);                                            \
        vst1##r##u8(s + 3 * p, q3);                                            \
        vst1##r##u8(s + 4 * p, q4);                                            \
        vst1##r##u8(s + 5 * p, q5);                                            \
        vst1##r##u8(s + 6 * p, q6);                                            \
      }                                                                        \
      vst1##r##u8(s - 3 * p, p2);                                              \
      vst1##r##u8(s + 2 * p, q2);                                              \
    }                                                                          \
    vst1##r##u8(s - 2 * p, p1);                                                \
    vst1##r##u8(s - 1 * p, p0);                                                \
    vst1##r##u8(s + 0 * p, q0);                                                \
    vst1##r##u8(s + 1 * p, q1);                                                \
  }

FUN_STORE14(8, _)    // store_8x14
FUN_STORE14(16, q_)  // store_16x14
#undef FUN_STORE14

// Store 16 consecutive rows (stride p) from s0..s15.
static INLINE void store_16x16(uint8_t *s, const int p, const uint8x16_t s0,
                               const uint8x16_t s1, const uint8x16_t s2,
                               const uint8x16_t s3, const uint8x16_t s4,
                               const uint8x16_t s5, const uint8x16_t s6,
                               const uint8x16_t s7, const uint8x16_t s8,
                               const uint8x16_t s9, const uint8x16_t s10,
                               const uint8x16_t s11, const uint8x16_t s12,
                               const uint8x16_t s13, const uint8x16_t s14,
                               const uint8x16_t s15) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
  s += p;
  vst1q_u8(s, s4);
  s += p;
  vst1q_u8(s, s5);
  s += p;
  vst1q_u8(s, s6);
  s += p;
  vst1q_u8(s, s7);
  s += p;
  vst1q_u8(s, s8);
  s += p;
  vst1q_u8(s, s9);
  s += p;
  vst1q_u8(s, s10);
  s += p;
  vst1q_u8(s, s11);
  s += p;
  vst1q_u8(s, s12);
  s += p;
  vst1q_u8(s, s13);
  s += p;
  vst1q_u8(s, s14);
  s += p;
  vst1q_u8(s, s15);
}

// Shared body of the horizontal 4-tap filters: load 8 rows around the edge,
// build the hev/filter masks, run filter4, and store the 4 changed rows.
#define FUN_HOR_4_KERNEL(name, w)                                           \
  static INLINE void lpf_horizontal_4##name##kernel(                        \
      uint8_t *s, const int p, const uint8x##w##_t blimit,                  \
      const uint8x##w##_t limit, const uint8x##w##_t thresh) {              \
    uint8x##w##_t p3, p2, p1, p0, q0, q1, q2, q3, mask, hev;                \
                                                                            \
    load_##w##x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);     \
    filter_hev_mask4_##w(limit, blimit, thresh, p3, p2, p1, p0, q0, q1, q2, \
                         q3, &hev, &mask);                                  \
    filter4_##w(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);             \
    store_##w##x4(s - 2 * p, p, p1, p0, q0, q1);                            \
  }

FUN_HOR_4_KERNEL(_, 8)        // lpf_horizontal_4_kernel
FUN_HOR_4_KERNEL(_dual_, 16)  // lpf_horizontal_4_dual_kernel
#undef FUN_HOR_4_KERNEL

// Horizontal 4-tap loop filter on one 8-pixel edge.
void vpx_lpf_horizontal_4_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec;
  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  lpf_horizontal_4_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
}

// Horizontal 4-tap loop filter on two adjacent 8-pixel edges with independent
// thresholds, processed together as 16 lanes.
void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec;
  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  lpf_horizontal_4_dual_kernel(s, p, blimit_vec, limit_vec, thresh_vec);
}

// Vertical 4-tap loop filter: load an 8x8 tile straddling the edge, transpose
// so columns become vectors, filter, then store the 4 changed columns.
void vpx_lpf_vertical_4_neon(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;
  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_hev_mask4_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                     q2, q3, &hev, &mask);
  filter4_8(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
  store_4x8(s - 2, p, p1, p0, q0, q1);
}

// Vertical 4-tap loop filter on two stacked 8-row edges with independent
// thresholds: load and transpose an 8x16 tile, filter 16 lanes at once, then
// store each 8-row half separately.
void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      mask, hev;
  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
      s15;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
            &s11, &s12, &s13, &s14, &s15);
  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_hev_mask4_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1,
                      q2, q3, &hev, &mask);
  filter4_16(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1);
  s -= 2;
  store_4x8(s, p, vget_low_u8(p1), vget_low_u8(p0), vget_low_u8(q0),
            vget_low_u8(q1));
  store_4x8(s + 8 * p, p, vget_high_u8(p1), vget_high_u8(p0), vget_high_u8(q0),
            vget_high_u8(q1));
}

// Horizontal 8-tap-class loop filter (filter4 / 7-tap selected per lane by
// the flat mask) on one 8-pixel edge.
void vpx_lpf_horizontal_8_neon(uint8_t *s, int p, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
            &op1, &op0, &oq0, &oq1, &oq2);
  store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
}

// Dual-edge variant of vpx_lpf_horizontal_8_neon: two adjacent 8-pixel edges
// with independent thresholds, processed as 16 lanes.
void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                 p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
             &op1, &op0, &oq0, &oq1, &oq2);
  store_16x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2);
}

// Vertical 8-tap-class loop filter: load and transpose an 8x8 tile, filter,
// transpose back and store the whole tile.
void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint32_t flat_status;

  load_thresh_8(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec);
  load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  transpose_u8_8x8(&p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_8(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                p0, q0, q1, q2, q3, &flat, &flat_status, &hev);
  filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2,
            &op1, &op0, &oq0, &oq1, &oq2);
  // Note: transpose + store_8x8() is faster than store_6x8().
  transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3);
  store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3);
}

void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8x16_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3,
      op2, op1, op0, oq0, oq1, oq2, mask, flat, hev;
  uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
      s15;
  uint32_t flat_status;

  load_thresh_8_dual(blimit0, limit0, thresh0, blimit1, limit1, thresh1,
                     &blimit_vec, &limit_vec, &thresh_vec);
  load_8x16(s - 4, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10,
            &s11, &s12, &s13, &s14, &s15);
  transpose_u8_8x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13,
                    s14, s15, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  mask = filter_flat_hev_mask_16(limit_vec, blimit_vec, thresh_vec, p3, p2, p1,
                                 p0, q0, q1, q2, q3, &flat,
&flat_status, &hev); 935 filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, 936 &op1, &op0, &oq0, &oq1, &oq2); 937 // Note: store_6x8() twice is faster than transpose + store_8x16(). 938 store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), 939 vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); 940 store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), 941 vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), 942 vget_high_u8(oq2)); 943 } 944 945 #define FUN_LPF_16_KERNEL(name, w) \ 946 static INLINE void lpf_16##name##kernel( \ 947 const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, \ 948 const uint8x##w##_t p7, const uint8x##w##_t p6, const uint8x##w##_t p5, \ 949 const uint8x##w##_t p4, const uint8x##w##_t p3, const uint8x##w##_t p2, \ 950 const uint8x##w##_t p1, const uint8x##w##_t p0, const uint8x##w##_t q0, \ 951 const uint8x##w##_t q1, const uint8x##w##_t q2, const uint8x##w##_t q3, \ 952 const uint8x##w##_t q4, const uint8x##w##_t q5, const uint8x##w##_t q6, \ 953 const uint8x##w##_t q7, uint8x##w##_t *op6, uint8x##w##_t *op5, \ 954 uint8x##w##_t *op4, uint8x##w##_t *op3, uint8x##w##_t *op2, \ 955 uint8x##w##_t *op1, uint8x##w##_t *op0, uint8x##w##_t *oq0, \ 956 uint8x##w##_t *oq1, uint8x##w##_t *oq2, uint8x##w##_t *oq3, \ 957 uint8x##w##_t *oq4, uint8x##w##_t *oq5, uint8x##w##_t *oq6, \ 958 uint32_t *flat_status, uint32_t *flat2_status) { \ 959 uint8x##w##_t blimit_vec, limit_vec, thresh_vec, mask, flat, flat2, hev; \ 960 \ 961 load_thresh_##w(blimit, limit, thresh, &blimit_vec, &limit_vec, \ 962 &thresh_vec); \ 963 mask = filter_flat_hev_mask_##w(limit_vec, blimit_vec, thresh_vec, p3, p2, \ 964 p1, p0, q0, q1, q2, q3, &flat, \ 965 flat_status, &hev); \ 966 flat2 = flat_mask5_##w(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, \ 967 flat2_status); \ 968 filter16_##w(mask, flat, *flat_status, flat2, *flat2_status, hev, p7, p6, \ 969 p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, 
q7, op6, \ 970 op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, \ 971 oq6); \ 972 } 973 974 FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel 975 FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel 976 #undef FUN_LPF_16_KERNEL 977 978 void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit, 979 const uint8_t *limit, const uint8_t *thresh) { 980 uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, 981 op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; 982 uint32_t flat_status, flat2_status; 983 984 load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, 985 &q3, &q4, &q5, &q6, &q7); 986 lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, 987 q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, 988 &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, 989 &flat2_status); 990 store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, 991 oq5, oq6, flat_status, flat2_status); 992 } 993 994 void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, 995 const uint8_t *limit, 996 const uint8_t *thresh) { 997 uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, 998 op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; 999 uint32_t flat_status, flat2_status; 1000 1001 load_16x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); 1002 p7 = vld1q_u8(s - 8 * p); 1003 p6 = vld1q_u8(s - 7 * p); 1004 p5 = vld1q_u8(s - 6 * p); 1005 p4 = vld1q_u8(s - 5 * p); 1006 q4 = vld1q_u8(s + 4 * p); 1007 q5 = vld1q_u8(s + 5 * p); 1008 q6 = vld1q_u8(s + 6 * p); 1009 q7 = vld1q_u8(s + 7 * p); 1010 lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, 1011 q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, 1012 &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, 1013 &flat_status, &flat2_status); 1014 store_16x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, 
oq2, oq3, oq4, 1015 oq5, oq6, flat_status, flat2_status); 1016 } 1017 1018 void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, 1019 const uint8_t *limit, const uint8_t *thresh) { 1020 uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, 1021 op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; 1022 uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; 1023 uint32_t flat_status, flat2_status; 1024 1025 s -= 8; 1026 load_16x8(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); 1027 transpose_u8_16x8(s0, s1, s2, s3, s4, s5, s6, s7, &p7, &p6, &p5, &p4, &p3, 1028 &p2, &p1, &p0, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); 1029 lpf_16_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, 1030 q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, &op1, 1031 &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, &flat_status, 1032 &flat2_status); 1033 if (flat_status) { 1034 if (flat2_status) { 1035 transpose_u8_8x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, 1036 oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, 1037 &s6, &s7); 1038 store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); 1039 } else { 1040 // Note: transpose + store_8x8() is faster than store_6x8(). 
1041 transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); 1042 store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); 1043 } 1044 } else { 1045 store_4x8(s + 6, p, op1, op0, oq0, oq1); 1046 } 1047 } 1048 1049 void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, 1050 const uint8_t *limit, 1051 const uint8_t *thresh) { 1052 uint8x16_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, 1053 op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6; 1054 uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, 1055 s15; 1056 uint32_t flat_status, flat2_status; 1057 1058 s -= 8; 1059 load_16x16(s, p, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, &s9, &s10, &s11, 1060 &s12, &s13, &s14, &s15); 1061 transpose_u8_16x16(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, 1062 s14, s15, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, 1063 &q2, &q3, &q4, &q5, &q6, &q7); 1064 lpf_16_dual_kernel(blimit, limit, thresh, p7, p6, p5, p4, p3, p2, p1, p0, q0, 1065 q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, &op3, &op2, 1066 &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, 1067 &flat_status, &flat2_status); 1068 if (flat_status) { 1069 if (flat2_status) { 1070 transpose_u8_16x16(p7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, 1071 oq3, oq4, oq5, oq6, q7, &s0, &s1, &s2, &s3, &s4, &s5, 1072 &s6, &s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14, 1073 &s15); 1074 store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, 1075 s13, s14, s15); 1076 } else { 1077 // Note: store_6x8() twice is faster than transpose + store_8x16(). 
1078 s += 8; 1079 store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), 1080 vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); 1081 store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), 1082 vget_high_u8(op0), vget_high_u8(oq0), vget_high_u8(oq1), 1083 vget_high_u8(oq2)); 1084 } 1085 } else { 1086 s += 6; 1087 store_4x8(s, p, vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0), 1088 vget_low_u8(oq1)); 1089 store_4x8(s + 8 * p, p, vget_high_u8(op1), vget_high_u8(op0), 1090 vget_high_u8(oq0), vget_high_u8(oq1)); 1091 } 1092 } 1093