/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "vpx_ports/mem.h"
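
/*
 * Each function below computes the variance of a W x H block as
 *
 *     variance = SSE - (sum * sum) / (W * H)
 *
 * where sum accumulates the pixel differences (src - ref) and SSE
 * accumulates their squares. W * H is always a power of two here, so the
 * division is a right shift: >> 8 for 16x16 (N = 256), >> 7 for 16x8 and
 * 8x16 (N = 128), and >> 6 for 8x8 (N = 64). The SSE is also returned
 * through *sse.
 */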
unsigned int vp8_variance16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);   // sum of differences
    q9s32 = vdupq_n_s32(0);   // sum of squared differences, low halves
    q10s32 = vdupq_n_s32(0);  // sum of squared differences, high halves

    for (i = 0; i < 8; i++) {  // variance16x16_neon_loop
        // Load two 16-pixel rows from the source and the reference.
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        // Widening subtract gives 16-bit differences for each 8-pixel
        // half-row; the u16 results wrap modulo 2^16, so reinterpreting
        // them as s16 below recovers negative differences.
        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        // For each half-row: pairwise-accumulate the differences into
        // q8s32 and multiply-accumulate their squares into q9s32/q10s32.
        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    // Horizontal reduction: fold the four 32-bit lanes of the sum and of
    // the combined squared sum down to single 64-bit values.
    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    // Square the sum of differences and write out the SSE.
    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    // variance = SSE - (sum * sum) / 256
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
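
/*
 * For reference, a minimal scalar sketch of the same computation. This
 * helper is illustrative only; its name and existence are assumptions for
 * this example, not part of libvpx.
 */
static unsigned int variance_ref_c(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
    int i, j;
    int sum = 0;          /* sum of pixel differences */
    unsigned int sq = 0;  /* sum of squared differences (SSE) */

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            const int d = src[j] - ref[j];
            sum += d;
            sq += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
    }

    *sse = sq;
    /* The NEON code replaces this division with a shift, since w * h is
     * always a power of two for these block sizes. */
    return sq - (unsigned int)(((long long)sum * sum) / (w * h));
}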

unsigned int vp8_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    // Same scheme as 16x16, but only 8 rows: 4 iterations of 2 rows each.
    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    // variance = SSE - (sum * sum) / 128
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d2u8, d4u8, d6u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    // Rows are only 8 pixels wide, so 64-bit loads (vld1_u8) replace the
    // 128-bit loads; each iteration still handles 2 rows.
    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d2u8, d6u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    // variance = SSE - (sum * sum) / 128
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    // 8 rows of 8 pixels: each iteration loads 4 rows, so 2 iterations.
    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d1u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d3u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d5u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d7u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d1u8, d5u8);
        q13u16 = vsubl_u8(d2u8, d6u8);
        q14u16 = vsubl_u8(d3u8, d7u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    // variance = SSE - (sum * sum) / 64
    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
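
/*
 * Usage sketch (illustrative; the buffers and fill step here are
 * assumptions, not part of this file):
 *
 *     unsigned char src[8 * 8], ref[8 * 8];
 *     unsigned int sse, var;
 *     // ... fill src and ref with pixel rows, stride 8 ...
 *     var = vp8_variance8x8_neon(src, 8, ref, 8, &sse);
 *     // sse receives the sum of squared differences; var is
 *     // sse - (sum of differences)^2 / 64.
 */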