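/*
 * NEON "fancy upsampling" line-pair kernel. This appears to be a reduced,
 * standalone copy of libwebp's UpsampleRgbaLinePairNEON: instead of
 * including arm_neon.h, the intrinsics it needs are re-declared below as
 * thin wrappers over GCC's __builtin_neon_* primitives, so the file only
 * compiles with GCC targeting ARM NEON.
 */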
typedef short int16_t;
typedef int int32_t;
typedef unsigned char uint8_t;
typedef unsigned int uintptr_t;

/* GCC-internal NEON element types; arm_neon.h normally supplies these. */
typedef __builtin_neon_hi int16x4_t __attribute__ ((__vector_size__ (8)));
typedef __builtin_neon_uqi uint8x8_t __attribute__ ((__vector_size__ (8)));
typedef __builtin_neon_uhi uint16x8_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_si int32x4_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_hi int16x8_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_qi int8x8_t __attribute__ ((__vector_size__ (8)));
typedef __builtin_neon_si int32x2_t __attribute__ ((__vector_size__ (8)));

/* Two- and four-register operand structs for vst2_u8/vst4_u8. */
typedef struct uint8x8x2_t
{
  uint8x8_t val[2];
} uint8x8x2_t;
typedef struct uint8x8x4_t
{
  uint8x8_t val[4];
} uint8x8x4_t;

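/*
 * Hand-expanded copies of the arm_neon.h intrinsics used by the kernel.
 * The trailing integer argument of each builtin is GCC's internal magic
 * selector, which appears to encode signedness and rounding (0 = unsigned,
 * 1 = signed, 4/5 = the rounding variants); the bodies below match what
 * the GCC arm_neon.h of that era emits for the same intrinsics.
 */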
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vaddl_s16 (int16x4_t __a, int16x4_t __b)
{
  return (int32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 1);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vaddlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vaddwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
{
  return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vsubl_s16 (int16x4_t __a, int16x4_t __b)
{
  return (int32x4_t)__builtin_neon_vsublv4hi (__a, __b, 1);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vsublv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vshrn_n_u16 (uint16x8_t __a, const int __b)
{
  return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 0);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vrshrn_n_s32 (int32x4_t __a, const int __b)
{
  return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 5);
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vshlq_n_s16 (int16x8_t __a, const int __b)
{
  return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vshll_n_s16 (int16x4_t __a, const int __b)
{
  return (int32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 1);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vshll_n_u8 (uint8x8_t __a, const int __b)
{
  return (uint16x8_t)__builtin_neon_vshll_nv8qi ((int8x8_t) __a, __b, 0);
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vmov_n_s32 (int32_t __a)
{
  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vmov_n_u8 (uint8_t __a)
{
  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vcombine_s16 (int16x4_t __a, int16x4_t __b)
{
  return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vget_high_s16 (int16x8_t __a)
{
  return (int16x4_t)__builtin_neon_vget_highv8hi (__a);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vget_low_s16 (int16x8_t __a)
{
  return (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vqmovun_s16 (int16x8_t __a)
{
  return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmovl_s16 (int16x4_t __a)
{
  return (int32x4_t)__builtin_neon_vmovlv4hi (__a, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
{
  return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
{
  return (int32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 1);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
{
  return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d, 1);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vld1_s16 (const int16_t * __a)
{
  return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vld1_u8 (const uint8_t * __a)
{
  return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
{
  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
}
__extension__ static __inline void __attribute__ ((__always_inline__))
vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
{
  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vreinterpretq_s16_u16 (uint16x8_t __a)
{
  return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
}

/*
 * Fixed-point YUV -> RGB constants scaled by 2^16: 76283 multiplies
 * (Y - 16); 89858, 22014, 45773 and 113618 are the V->R, U->G, V->G and
 * U->B weights.  The /4 and /2 pre-scalings compensate for the doublings
 * done later by vqdmlal_lane_s16 and the shift-by-1 steps; the extra
 * additions of the operand itself (via vshll/vmovl) restore exactly the
 * bits lost to these truncating divisions, so each product comes out at
 * full weight.
 */
static const int16_t coef[4] = { 89858 / 4, 22014, 45773 / 2, 113618 / 4 };

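/*
 * Hedged scalar reference (not in the original file) for one pixel of the
 * conversion done below, using the same constants and 16-bit rounding
 * shifts; the helper names are illustrative only.
 */
__attribute__((unused)) static uint8_t ref_clip8(int v)
{
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}
__attribute__((unused)) static void ref_yuv_to_rgb(int y, int u, int v,
                                                   uint8_t *r, uint8_t *g,
                                                   uint8_t *b)
{
  const int yy = y - 16, uu = u - 128, vv = v - 128;
  const int vr = (89858 * vv + 32768) >> 16;               /* V -> R       */
  const int gc = (22014 * uu + 45773 * vv + 32768) >> 16;  /* UV -> -G     */
  const int ub = (113618 * uu + 32768) >> 16;              /* U -> B       */
  *r = ref_clip8((76283 * (yy + vr) + 32768) >> 16);
  *g = ref_clip8((76283 * (yy - gc) + 32768) >> 16);
  *b = ref_clip8((76283 * (yy + ub) + 32768) >> 16);
}
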
void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y,
                              const uint8_t *top_u, const uint8_t *top_v,
                              const uint8_t *cur_u, const uint8_t *cur_v,
                              uint8_t *top_dst, uint8_t *bottom_dst, int len)
{
    int block;
    /* 16-byte aligned scratch: top row {16 U, 16 V} at r_uv, bottom row
       {16 U, 16 V} at r_uv + 32. */
    uint8_t uv_buf[2 * 32 + 15];
    uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);
    const int uv_len = (len + 1) >> 1;
    const int num_blocks = (uv_len - 1) >> 3;
    /* Computed but unused here: the scalar handling of pixel 0 and of the
       leftover tail block is omitted in this version. */
    const int leftover = uv_len - num_blocks * 8;
    const int last_pos = 1 + 16 * num_blocks;
    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;
    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;
    const int16x4_t cf16 = vld1_s16(coef);
    const int32x2_t cf32 = vmov_n_s32(76283);  /* (Y - 16) scale, 2^16 fixed point */
    const uint8x8_t u16 = vmov_n_u8(16);       /* luma offset */
    const uint8x8_t u128 = vmov_n_u8(128);     /* chroma bias */
    for (block = 0; block < num_blocks; ++block) {
        /* Upsample 8 U samples from each chroma row into 16, with the
           (9*nearest + 3 + 3 + 1*diagonal)/16 weighting.  The chroma source
           advances by 8 samples per block, in step with the 16-pixel luma
           stride below. */
        {
            uint8x8_t a = vld1_u8(top_u + 8 * block);
            uint8x8_t b = vld1_u8(top_u + 8 * block + 1);
            uint8x8_t c = vld1_u8(cur_u + 8 * block);
            uint8x8_t d = vld1_u8(cur_u + 8 * block + 1);
            uint16x8_t al = vshll_n_u8(a, 1);   /* 2a */
            uint16x8_t bl = vshll_n_u8(b, 1);   /* 2b */
            uint16x8_t cl = vshll_n_u8(c, 1);   /* 2c */
            uint16x8_t dl = vshll_n_u8(d, 1);   /* 2d */
            uint8x8_t diag1, diag2;
            uint16x8_t sl;
            sl = vaddl_u8(a, b);                /* a + b         */
            sl = vaddw_u8(sl, c);               /* a + b + c     */
            sl = vaddw_u8(sl, d);               /* a + b + c + d */
            al = vaddq_u16(sl, al);             /* 3a +  b +  c +  d */
            bl = vaddq_u16(sl, bl);             /*  a + 3b +  c +  d */
            al = vaddq_u16(al, dl);             /* 3a +  b +  c + 3d */
            bl = vaddq_u16(bl, cl);             /*  a + 3b + 3c +  d */
            diag2 = vshrn_n_u16(al, 3);
            diag1 = vshrn_n_u16(bl, 3);
            a = vrhadd_u8(a, diag1);            /* (9a + 3b + 3c +  d) / 16 */
            b = vrhadd_u8(b, diag2);            /* (3a + 9b +  c + 3d) / 16 */
            c = vrhadd_u8(c, diag2);            /* (3a +  b + 9c + 3d) / 16 */
            d = vrhadd_u8(d, diag1);            /* ( a + 3b + 3c + 9d) / 16 */
            {
                /* Interleave into the scratch row: top U at r_uv, bottom U
                   at r_uv + 32. */
                const uint8x8x2_t a_b = {{ a, b }};
                const uint8x8x2_t c_d = {{ c, d }};
                vst2_u8(r_uv, a_b);
                vst2_u8(r_uv + 32, c_d);
            }
        }
        /* Same 2x2 upsampling for the V plane; the results land next to the
           U bytes: top V at r_uv + 16, bottom V at r_uv + 48. */
        {
            uint8x8_t a = vld1_u8(top_v + 8 * block);
            uint8x8_t b = vld1_u8(top_v + 8 * block + 1);
            uint8x8_t c = vld1_u8(cur_v + 8 * block);
            uint8x8_t d = vld1_u8(cur_v + 8 * block + 1);
            uint16x8_t al = vshll_n_u8(a, 1);
            uint16x8_t bl = vshll_n_u8(b, 1);
            uint16x8_t cl = vshll_n_u8(c, 1);
            uint16x8_t dl = vshll_n_u8(d, 1);
            uint8x8_t diag1, diag2;
            uint16x8_t sl;
            sl = vaddl_u8(a, b);
            sl = vaddw_u8(sl, c);
            sl = vaddw_u8(sl, d);
            al = vaddq_u16(sl, al);
            bl = vaddq_u16(sl, bl);
            al = vaddq_u16(al, dl);
            bl = vaddq_u16(bl, cl);
            diag2 = vshrn_n_u16(al, 3);
            diag1 = vshrn_n_u16(bl, 3);
            a = vrhadd_u8(a, diag1);
            b = vrhadd_u8(b, diag2);
            c = vrhadd_u8(c, diag2);
            d = vrhadd_u8(d, diag1);
            {
                const uint8x8x2_t a_b = {{ a, b }};
                const uint8x8x2_t c_d = {{ c, d }};
                vst2_u8(r_uv + 16, a_b);
                vst2_u8(r_uv + 16 + 32, c_d);
            }
        }
        /* Convert 16 top-row pixels (starting at pixel 16*block + 1) to RGBA. */
        if (top_y) {
            int i;
            for (i = 0; i < 16; i += 8) {
                int off = ((16 * block + 1) + i) * 4;
                uint8x8_t y = vld1_u8(top_y + (16 * block + 1) + i);
                uint8x8_t u = vld1_u8((r_uv) + i);
                uint8x8_t v = vld1_u8((r_uv) + i + 16);
                /* Center the inputs: yy = Y - 16, uu = U - 128, vv = V - 128. */
                int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
                int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
                int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
                int16x8_t ud = vshlq_n_s16(uu, 1);
                int16x8_t vd = vshlq_n_s16(vv, 1);
                /* vr = (89858 * vv) >> 16, rounded: V's contribution to R. */
                int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
                int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
                int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
                /* gc = (22014 * uu + 45773 * vv) >> 16: UV's contribution to G. */
                int32x4_t vl = vmovl_s16(vget_low_s16(vv));
                int32x4_t vh = vmovl_s16(vget_high_s16(vv));
                int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
                int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
                int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
                int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
                int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
                /* ub = (113618 * uu) >> 16, rounded: U's contribution to B. */
                int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
                int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
                int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
                /* R = yy + vr, G = yy - gc, B = yy + ub, widened to 32 bits. */
                int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
                int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
                int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
                int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
                int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
                int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
                /* Scale by 76283/65536 with rounding, then saturate to bytes.
                   y/u/v are reused below to hold the final R/G/B vectors. */
                rl = vmulq_lane_s32(rl, cf32, 0);
                rh = vmulq_lane_s32(rh, cf32, 0);
                gl = vmulq_lane_s32(gl, cf32, 0);
                gh = vmulq_lane_s32(gh, cf32, 0);
                bl = vmulq_lane_s32(bl, cf32, 0);
                bh = vmulq_lane_s32(bh, cf32, 0);
                y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
                u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
                v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
                {
                    /* Interleaved RGBA store with alpha = 255. */
                    const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
                    vst4_u8(top_dst + off, r_g_b_v255);
                }
            }
        }
        /* Same conversion for the bottom row, using the bottom half of the
           scratch UV buffer (r_uv + 32). */
        if (bottom_y) {
            int i;
            for (i = 0; i < 16; i += 8) {
                int off = ((16 * block + 1) + i) * 4;
                uint8x8_t y = vld1_u8(bottom_y + (16 * block + 1) + i);
                uint8x8_t u = vld1_u8(((r_uv) + 32) + i);
                uint8x8_t v = vld1_u8(((r_uv) + 32) + i + 16);
                int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
                int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
                int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
                int16x8_t ud = vshlq_n_s16(uu, 1);
                int16x8_t vd = vshlq_n_s16(vv, 1);
                int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
                int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
                int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
                int32x4_t vl = vmovl_s16(vget_low_s16(vv));
                int32x4_t vh = vmovl_s16(vget_high_s16(vv));
                int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
                int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
                int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
                int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
                int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
                int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
                int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
                int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
                int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
                int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
                int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
                int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
                int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
                int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
                rl = vmulq_lane_s32(rl, cf32, 0);
                rh = vmulq_lane_s32(rh, cf32, 0);
                gl = vmulq_lane_s32(gl, cf32, 0);
                gh = vmulq_lane_s32(gh, cf32, 0);
                bl = vmulq_lane_s32(bl, cf32, 0);
                bh = vmulq_lane_s32(bh, cf32, 0);
                y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
                u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
                v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
                {
                    const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
                    vst4_u8(bottom_dst + off, r_g_b_v255);
                }
            }
        }
    }
}
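
/*
 * Hedged usage sketch (not in the original file): drives one 32-pixel line
 * pair through the kernel.  Buffer sizes follow from len = 32: each luma
 * row holds 32 samples, each chroma row (len + 1) / 2 = 16, and each RGBA
 * row 32 * 4 bytes.  As noted above, pixel 0 and the tail are left
 * unwritten because this version omits the scalar edge handling.
 */
void example_upsample_line_pair(void)
{
    uint8_t top_y[32], bottom_y[32];
    uint8_t top_u[16], top_v[16], cur_u[16], cur_v[16];
    uint8_t top_dst[32 * 4], bottom_dst[32 * 4];
    int i;
    for (i = 0; i < 32; ++i) {      /* a simple luma ramp */
        top_y[i] = (uint8_t)(16 + 6 * i);
        bottom_y[i] = (uint8_t)(16 + 6 * i);
    }
    for (i = 0; i < 16; ++i) {      /* mid-grey chroma */
        top_u[i] = cur_u[i] = 128;
        top_v[i] = cur_v[i] = 128;
    }
    UpsampleRgbaLinePairNEON(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
                             top_dst, bottom_dst, 32);
}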