/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

/* __builtin_prefetch is a GCC/Clang extension; compile the hints away under MSVC. */
#ifdef _MSC_VER
#define __builtin_prefetch(x)
#endif

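/*
 * Each kernel below accumulates, over the whole block, the sum of the
 * (src - ref) differences and the sum of their squares, then returns
 * variance = sse - sum * sum / N, where N is the pixel count of the block.
 * *sse receives the raw sum of squared differences.
 *
 * vp8_variance16x16_neon: 16x16 block, N = 256, so the squared sum is
 * divided by a right shift of 8.
 */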
unsigned int vp8_variance16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

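    /*
     * Two 16-pixel rows per iteration: vsubl_u8 widens the differences to
     * 16 bits, vpadalq_s16 adds them into the sum accumulator q8s32, and
     * vmlal_s16 accumulates their squares into q9s32/q10s32.
     */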
    for (i = 0; i < 8; i++) {
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

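    /*
     * Reduce: merge the two SSE accumulators, then pairwise-add the sum and
     * SSE vectors down to single 64-bit lanes (d0s64 = sum, d1s64 = sse).
     */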
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

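    /* variance = sse - sum * sum / 256; the shift by 8 divides the squared
     * sum by the pixel count of the 16x16 block. */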
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

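/*
 * vp8_variance16x8_neon: 16x8 block, N = 128; the squared sum is divided
 * by a right shift of 7.
 */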
unsigned int vp8_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

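/*
 * vp8_variance8x16_neon: 8x16 block, N = 128; the squared sum is divided
 * by a right shift of 7.  Rows are 8 pixels wide, so each iteration loads
 * two 64-bit rows.
 */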
unsigned int vp8_variance8x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d2u8, d4u8, d6u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d2u8, d6u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

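/*
 * vp8_variance8x8_neon: 8x8 block, N = 64; the squared sum is divided by
 * a right shift of 6.  Each iteration processes four 8-pixel rows.
 */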
unsigned int vp8_variance8x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d1u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d3u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d5u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d7u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d1u8, d5u8);
        q13u16 = vsubl_u8(d2u8, d6u8);
        q14u16 = vsubl_u8(d3u8, d7u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}