/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

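/* NEON SAD (sum of absolute differences) kernels for VP8 block matching.
 * Each function follows the same pattern: widen the first row's absolute
 * differences with vabdl_u8, accumulate the remaining rows with vabal_u8,
 * and reduce the 16-bit partial sums to a scalar with pairwise widening
 * adds. */
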
/* SAD for an 8x8 block: 8-byte rows, 8 rows. */
unsigned int vp8_sad8x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    /* First row: widen the absolute differences to 16 bits. */
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    /* Remaining 7 rows: accumulate into the same 16-bit lanes. */
    for (i = 0; i < 7; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    /* Reduce the 8 partial sums to a scalar: two pairwise widening adds,
     * then fold the two 64-bit halves together. The total fits in 32
     * bits, so lane 0 holds the SAD. */
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

/* SAD for an 8x16 block: 8-byte rows, 16 rows. */
unsigned int vp8_sad8x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    /* First row: widen the absolute differences to 16 bits. */
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    /* Remaining 15 rows: accumulate into the same 16-bit lanes. */
    for (i = 0; i < 15; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    /* Reduce to a scalar as in the 8x8 kernel. */
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

/* SAD for a 4x4 block. Each vld1_u8 reads 8 bytes, i.e. 4 bytes past
 * the 4-pixel row; the extra lanes are harmless because only the low
 * half of the widened differences is summed below. */
unsigned int vp8_sad4x4_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x2_t d1;
    uint64x1_t d3;
    int i;

    /* First row: widen the absolute differences to 16 bits. */
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    /* Remaining 3 rows: accumulate into the same 16-bit lanes. */
    for (i = 0; i < 3; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    /* Sum only the 4 lanes that belong to the block. */
    d1 = vpaddl_u16(vget_low_u16(q12));
    d3 = vpaddl_u32(d1);

    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

/* SAD for a 16x16 block: 16-byte rows, 16 rows. Two accumulators keep
 * the low and high 8 bytes of each row separate until the final fold. */
unsigned int vp8_sad16x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    /* First row: widen the absolute differences to 16 bits. */
    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    /* Remaining 15 rows: accumulate into the same 16-bit lanes. */
    for (i = 0; i < 15; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    /* Combine the two accumulators, then reduce as in the 8-wide
     * kernels. */
    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

/* SAD for a 16x8 block: 16-byte rows, 8 rows. */
unsigned int vp8_sad16x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    /* First row: widen the absolute differences to 16 bits. */
    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    /* Remaining 7 rows: accumulate into the same 16-bit lanes. */
    for (i = 0; i < 7; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    /* Combine the two accumulators, then reduce as in the 8-wide
     * kernels. */
    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

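/* A minimal scalar reference for cross-checking the NEON kernels above.
 * This is an illustrative sketch, not part of the libvpx API: the name
 * sad_reference_c and its generic width/height parameters are assumptions
 * made here for testing purposes. */
static unsigned int sad_reference_c(
        const unsigned char *src_ptr,
        int src_stride,
        const unsigned char *ref_ptr,
        int ref_stride,
        int width,
        int height) {
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++) {
            int diff = (int)src_ptr[c] - (int)ref_ptr[c];
            sad += (unsigned int)(diff < 0 ? -diff : diff);
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    }

    return sad;
}

/* Hypothetical cross-check, e.g. in a unit test:
 *     assert(vp8_sad8x8_neon(src, src_stride, ref, ref_stride) ==
 *            sad_reference_c(src, src_stride, ref, ref_stride, 8, 8));
 */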