/* Minimal stand-ins for the <stdint.h> types used below (32-bit ARM). */
typedef short int16_t;
typedef int int32_t;
typedef unsigned char uint8_t;
typedef unsigned int uintptr_t;

/* NEON vector types, declared over GCC's internal __builtin_neon_* element
   types exactly as <arm_neon.h> does. */
typedef __builtin_neon_hi int16x4_t __attribute__ ((__vector_size__ (8)));
typedef __builtin_neon_uqi uint8x8_t __attribute__ ((__vector_size__ (8)));
typedef __builtin_neon_uhi uint16x8_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_si int32x4_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_hi int16x8_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_qi int8x8_t __attribute__ ((__vector_size__ (8)));
typedef __builtin_neon_si int32x2_t __attribute__ ((__vector_size__ (8)));

typedef struct uint8x8x2_t {
  uint8x8_t val[2];
} uint8x8x2_t;

typedef struct uint8x8x4_t {
  uint8x8_t val[4];
} uint8x8x4_t;

/* Local definitions of the <arm_neon.h> intrinsics this file needs, written
   directly in terms of the GCC NEON builtins. */
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vaddl_s16 (int16x4_t __a, int16x4_t __b)
{
  return (int32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 1);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vaddlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vaddwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
{
  return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vsubl_s16 (int16x4_t __a, int16x4_t __b)
{
  return (int32x4_t)__builtin_neon_vsublv4hi (__a, __b, 1);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vsublv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vshrn_n_u16 (uint16x8_t __a, const int __b)
{
  return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 0);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vrshrn_n_s32 (int32x4_t __a, const int __b)
{
  return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 5);
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vshlq_n_s16 (int16x8_t __a, const int __b)
{
  return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vshll_n_s16 (int16x4_t __a, const int __b)
{
  return (int32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 1);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vshll_n_u8 (uint8x8_t __a, const int __b)
{
  return (uint16x8_t)__builtin_neon_vshll_nv8qi ((int8x8_t) __a, __b, 0);
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vmov_n_s32 (int32_t __a)
{
  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vmov_n_u8 (uint8_t __a)
{
  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vcombine_s16 (int16x4_t __a, int16x4_t __b)
{
  return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vget_high_s16 (int16x8_t __a)
{
  return (int16x4_t)__builtin_neon_vget_highv8hi (__a);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vget_low_s16 (int16x8_t __a)
{
  return (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vqmovun_s16 (int16x8_t __a)
{
  return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmovl_s16 (int16x4_t __a)
{
  return (int32x4_t)__builtin_neon_vmovlv4hi (__a, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
{
  return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
{
  return (int32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
{
  return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d, 1);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vld1_s16 (const int16_t * __a)
{
  return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vld1_u8 (const uint8_t * __a)
{
  return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
{
  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
{
  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vreinterpretq_s16_u16 (uint16x8_t __a)
{
  return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
}

/* BT.601 YUV->RGB coefficients in 16.16 fixed point, pre-divided by the 1.164
   luma scale (which is re-applied as the final multiply by 76283 below). The
   extra /4 and /2 keep the values within int16_t range and are compensated by
   the implicit doubling of vqdmlal and the pre-doubled ud/vd operands. */
static const int16_t coef[4] = { 89858 / 4, 22014, 45773 / 2, 113618 / 4 };

/* Fancy-upsamples one pair of rows: two luma rows plus two half-resolution
   chroma rows in, two RGBA rows out. */
void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y,
                              const uint8_t *top_u, const uint8_t *top_v,
                              const uint8_t *cur_u, const uint8_t *cur_v,
                              uint8_t *top_dst, uint8_t *bottom_dst, int len)
{
  int block;
  /* 16-byte-aligned scratch that caches the reconstructed U and V samples. */
  uint8_t uv_buf[2 * 32 + 15];
  uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);
  const int uv_len = (len + 1) >> 1;
  /* Each block consumes 8 chroma samples (9 must be readable) and emits
     16 output pixels per row. */
  const int num_blocks = (uv_len - 1) >> 3;
  const int leftover = uv_len - num_blocks * 8;
  const int last_pos = 1 + 16 * num_blocks;
  /* Diagonal averages used when handling the very first pixel of each row. */
  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;
  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;
  const int16x4_t cf16 = vld1_s16(coef);
  const int32x2_t cf32 = vmov_n_s32(76283);  /* ~1.164 * 65536: luma scale */
  const uint8x8_t u16 = vmov_n_u8(16);
  const uint8x8_t u128 = vmov_n_u8(128);
  for (block = 0; block < num_blocks; ++block) {
    /* Upsample 8 U samples with the 2x2 diagonal filter over (top, cur). */
    {
      uint8x8_t a = vld1_u8(top_u + 8 * block);
      uint8x8_t b = vld1_u8(top_u + 8 * block + 1);
      uint8x8_t c = vld1_u8(cur_u + 8 * block);
      uint8x8_t d = vld1_u8(cur_u + 8 * block + 1);
      /* al..dl = 2a..2d, widened to 16 bits. */
      uint16x8_t al = vshll_n_u8(a, 1);
      uint16x8_t bl = vshll_n_u8(b, 1);
      uint16x8_t cl = vshll_n_u8(c, 1);
      uint16x8_t dl = vshll_n_u8(d, 1);
      uint8x8_t diag1, diag2;
      uint16x8_t sl;
      /* sl = a + b + c + d */
      sl = vaddl_u8(a, b);
      sl = vaddw_u8(sl, c);
      sl = vaddw_u8(sl, d);
      al = vaddq_u16(sl, al);       /* 3a + b + c + d  */
      bl = vaddq_u16(sl, bl);       /* a + 3b + c + d  */
      al = vaddq_u16(al, dl);       /* 3a + b + c + 3d */
      bl = vaddq_u16(bl, cl);       /* a + 3b + 3c + d */
      diag2 = vshrn_n_u16(al, 3);
      diag1 = vshrn_n_u16(bl, 3);
      /* Rounded average against the nearest source sample; e.g.
         a' ~ (9a + 3b + 3c + d + 8) >> 4. */
      a = vrhadd_u8(a, diag1);
      b = vrhadd_u8(b, diag2);
      c = vrhadd_u8(c, diag2);
      d = vrhadd_u8(d, diag1);
      {
        const uint8x8x2_t a_b = {{ a, b }};
        const uint8x8x2_t c_d = {{ c, d }};
        vst2_u8(r_uv, a_b);         /* interleaved top-row U  */
        vst2_u8(r_uv + 32, c_d);    /* interleaved bottom-row U */
      }
    }
    /* Same filter for the V plane. */
    {
      uint8x8_t a = vld1_u8(top_v + 8 * block);
      uint8x8_t b = vld1_u8(top_v + 8 * block + 1);
      uint8x8_t c = vld1_u8(cur_v + 8 * block);
      uint8x8_t d = vld1_u8(cur_v + 8 * block + 1);
      uint16x8_t al = vshll_n_u8(a, 1);
      uint16x8_t bl = vshll_n_u8(b, 1);
      uint16x8_t cl = vshll_n_u8(c, 1);
      uint16x8_t dl = vshll_n_u8(d, 1);
      uint8x8_t diag1, diag2;
      uint16x8_t sl;
      sl = vaddl_u8(a, b);
      sl = vaddw_u8(sl, c);
      sl = vaddw_u8(sl, d);
      al = vaddq_u16(sl, al);
      bl = vaddq_u16(sl, bl);
      al = vaddq_u16(al, dl);
      bl = vaddq_u16(bl, cl);
      diag2 = vshrn_n_u16(al, 3);
      diag1 = vshrn_n_u16(bl, 3);
      a = vrhadd_u8(a, diag1);
      b = vrhadd_u8(b, diag2);
      c = vrhadd_u8(c, diag2);
      d = vrhadd_u8(d, diag1);
      {
        const uint8x8x2_t a_b = {{ a, b }};
        const uint8x8x2_t c_d = {{ c, d }};
        vst2_u8(r_uv + 16, a_b);        /* interleaved top-row V  */
        vst2_u8(r_uv + 16 + 32, c_d);   /* interleaved bottom-row V */
      }
    }
    /* Convert 16 pixels of the top row to RGBA. */
    if (top_y) {
      int i;
      for (i = 0; i < 16; i += 8) {
        /* +1: pixel 0 of each row is handled separately (edge case). */
        int off = ((16 * block + 1) + i) * 4;
        uint8x8_t y = vld1_u8(top_y + (16 * block + 1) + i);
        uint8x8_t u = vld1_u8(r_uv + i);
        uint8x8_t v = vld1_u8(r_uv + i + 16);
        /* Center the inputs: yy = y - 16, uu = u - 128, vv = v - 128. */
        int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
        int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
        int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
        int16x8_t ud = vshlq_n_s16(uu, 1);
        int16x8_t vd = vshlq_n_s16(vv, 1);
        /* vr ~ 1.371 * vv: V contribution to red (before the luma scale). */
        int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1),
                                         vget_low_s16(vd), cf16, 0);
        int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1),
                                         vget_high_s16(vd), cf16, 0);
        int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
        int32x4_t vl = vmovl_s16(vget_low_s16(vv));
        int32x4_t vh = vmovl_s16(vget_high_s16(vv));
        /* gc ~ 0.336 * uu + 0.698 * vv: chroma contribution to green. */
        int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
        int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
        int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
        int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
        int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
        /* ub ~ 1.734 * uu: U contribution to blue. */
        int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1),
                                         vget_low_s16(ud), cf16, 3);
        int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1),
                                         vget_high_s16(ud), cf16, 3);
        int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
        /* R = yy + vr, G = yy - gc, B = yy + ub; then scale by 1.164 and
           saturate to 8 bits. */
        int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
        int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
        int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
        int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
        int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
        int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
        rl = vmulq_lane_s32(rl, cf32, 0);
        rh = vmulq_lane_s32(rh, cf32, 0);
        gl = vmulq_lane_s32(gl, cf32, 0);
        gh = vmulq_lane_s32(gh, cf32, 0);
        bl = vmulq_lane_s32(bl, cf32, 0);
        bh = vmulq_lane_s32(bh, cf32, 0);
        /* y/u/v now hold the saturated R/G/B bytes. */
        y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
        u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
        v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
        {
          const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
          vst4_u8(top_dst + off, r_g_b_v255);
        }
      }
    }
    /* Same conversion for the bottom row, using the cached bottom U/V. */
    if (bottom_y) {
      int i;
      for (i = 0; i < 16; i += 8) {
        int off = ((16 * block + 1) + i) * 4;
        uint8x8_t y = vld1_u8(bottom_y + (16 * block + 1) + i);
        uint8x8_t u = vld1_u8(r_uv + 32 + i);
        uint8x8_t v = vld1_u8(r_uv + 32 + i + 16);
        int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
        int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
        int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
        int16x8_t ud = vshlq_n_s16(uu, 1);
        int16x8_t vd = vshlq_n_s16(vv, 1);
        int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1),
                                         vget_low_s16(vd), cf16, 0);
        int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1),
                                         vget_high_s16(vd), cf16, 0);
        int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
        int32x4_t vl = vmovl_s16(vget_low_s16(vv));
        int32x4_t vh = vmovl_s16(vget_high_s16(vv));
        int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
        int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
        int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
        int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
        int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
        int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1),
                                         vget_low_s16(ud), cf16, 3);
        int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1),
                                         vget_high_s16(ud), cf16, 3);
        int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
        int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
        int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
        int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
        int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
        int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
        int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
        rl = vmulq_lane_s32(rl, cf32, 0);
        rh = vmulq_lane_s32(rh, cf32, 0);
        gl = vmulq_lane_s32(gl, cf32, 0);
        gh = vmulq_lane_s32(gh, cf32, 0);
        bl = vmulq_lane_s32(bl, cf32, 0);
        bh = vmulq_lane_s32(bh, cf32, 0);
        y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
        u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
        v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
        {
          const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
          vst4_u8(bottom_dst + off, r_g_b_v255);
        }
      }
    }
  }
}
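
/*
 * Scalar reference (a sketch, not from the original source) of the 2x2 "fancy"
 * upsampling filter the blocks above vectorize. For the output pixel nearest
 * to chroma sample a, with horizontal neighbour b, vertical neighbour c and
 * diagonal d, the target value is (9a + 3b + 3c + d + 8) >> 4. The NEON code
 * reaches this, up to rounding, in two steps: diag = (a + 3b + 3c + d) >> 3
 * via vshrn_n_u16, then the rounded average vrhadd_u8(a, diag).
 */
static uint8_t FancyUpsampleOne(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
}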
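
/*
 * Scalar sketch (assumed equivalent up to small rounding differences) of the
 * fixed-point conversion above: BT.601 YUV->RGB with 16.16 coefficients,
 * i.e. R = 1.164*(Y-16) + 1.596*(V-128), G = 1.164*(Y-16) - 0.391*(U-128)
 * - 0.813*(V-128), B = 1.164*(Y-16) + 2.018*(U-128). The NEON path folds the
 * 1.164 luma scale out of the chroma coefficients and re-applies it as the
 * final multiply by 76283.
 */
static uint8_t Clip255(int v) {
  return (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
}

static void YuvToRgbScalar(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t *r, uint8_t *g, uint8_t *b) {
  const int yy = y - 16, uu = u - 128, vv = v - 128;
  *r = Clip255((76283 * (yy + ((vv * 89858 + 32768) >> 16)) + 32768) >> 16);
  *g = Clip255((76283 * (yy - ((uu * 22014 + vv * 45773 + 32768) >> 16)) + 32768) >> 16);
  *b = Clip255((76283 * (yy + ((uu * 113618 + 32768) >> 16)) + 32768) >> 16);
}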
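
/*
 * Hypothetical driver (every name and the row-pairing scheme here is an
 * assumption for illustration, not part of the original source): walks a
 * YUV420 frame two luma rows at a time, feeding each pair plus the two
 * surrounding chroma rows to the routine above. Real decoders overlap
 * consecutive chroma rows and special-case the first and last image rows;
 * this sketch simply clamps at the bottom edge.
 */
static void UpsampleFrameRgba(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                              int y_stride, int uv_stride,
                              uint8_t *rgba, int rgba_stride,
                              int width, int height) {
  const int uv_rows = (height + 1) >> 1;
  int j;
  for (j = 0; j + 1 < height; j += 2) {
    const int k = j >> 1;                          /* current chroma row */
    const int k1 = (k + 1 < uv_rows) ? k + 1 : k;  /* clamp at the edge  */
    UpsampleRgbaLinePairNEON(y + j * y_stride, y + (j + 1) * y_stride,
                             u + k * uv_stride, v + k * uv_stride,
                             u + k1 * uv_stride, v + k1 * uv_stride,
                             rgba + j * rgba_stride, rgba + (j + 1) * rgba_stride,
                             width);
  }
}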