/* ARM NEON intrinsics for the CELT pitch inner-product functions. */
      1 /***********************************************************************
      2 Copyright (c) 2017 Google Inc.
      3 Redistribution and use in source and binary forms, with or without
      4 modification, are permitted provided that the following conditions
      5 are met:
      6 - Redistributions of source code must retain the above copyright notice,
      7 this list of conditions and the following disclaimer.
      8 - Redistributions in binary form must reproduce the above copyright
      9 notice, this list of conditions and the following disclaimer in the
     10 documentation and/or other materials provided with the distribution.
     11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
     12 names of specific contributors, may be used to endorse or promote
     13 products derived from this software without specific prior written
     14 permission.
     15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     25 POSSIBILITY OF SUCH DAMAGE.
     26 ***********************************************************************/
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include <arm_neon.h>
     33 #include "pitch.h"
     34 
     35 #ifdef FIXED_POINT
     36 
/* Fixed-point inner product: returns sum_{i<N} x[i]*y[i] accumulated in
 * 32 bits.  The OPUS_CHECK_ASM assert below requires the result to be
 * bit-exact with celt_inner_prod_c(), which holds because 32-bit integer
 * addition is associative modulo 2^32 regardless of lane ordering. */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    int16x8_t x_s16x8, y_s16x8;
    int32x4_t xy_s32x4 = vdupq_n_s32(0);  /* four parallel 32-bit partial sums */
    int64x2_t xy_s64x2;
    int64x1_t xy_s64x1;

    /* Main loop: 8 products per iteration via two widening
     * multiply-accumulates (s16 x s16 -> s32). */
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8  = vld1q_s16(&x[i]);
        y_s16x8  = vld1q_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
    }

    /* One 4-wide step if at least 4 elements remain. */
    if (N - i >= 4) {
        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
        i += 4;
    }

    /* Horizontal reduction: widen lane pairs to 64 bits, add the two
     * halves, then truncate back to 32 bits (low word of the s64 sum). */
    xy_s64x2 = vpaddlq_s32(xy_s32x4);
    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);

    /* Scalar tail for the last 0..3 elements. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(celt_inner_prod_c(x, y, N) == xy);
#endif

    return xy;
}
     74 
/* Fixed-point dual inner product: computes *xy1 = sum x[i]*y01[i] and
 * *xy2 = sum x[i]*y02[i] in one pass, sharing the loads of x[].  Results
 * are bit-exact with dual_inner_prod_c() (asserted under OPUS_CHECK_ASM),
 * since 32-bit accumulation is order-independent modulo 2^32. */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
    int32x4_t xy01_s32x4 = vdupq_n_s32(0);  /* partial sums for x . y01 */
    int32x4_t xy02_s32x4 = vdupq_n_s32(0);  /* partial sums for x . y02 */
    int64x2_t xy01_s64x2, xy02_s64x2;
    int64x1_t xy01_s64x1, xy02_s64x1;

    /* Main loop: 8 elements per iteration, two widening MACs per product. */
    for (i = 0; i < N - 7; i += 8) {
        x_s16x8    = vld1q_s16(&x[i]);
        y01_s16x8  = vld1q_s16(&y01[i]);
        y02_s16x8  = vld1q_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
    }

    /* One 4-wide step if at least 4 elements remain. */
    if (N - i >= 4) {
        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
        i += 4;
    }

    /* Horizontal reduction of both accumulators; truncate to 32 bits. */
    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);

    /* Scalar tail for the last 0..3 elements. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(xy1_c == *xy1);
        celt_assert(xy2_c == *xy2);
    }
#endif
}
    128 
    129 #else /* !FIXED_POINT */
    130 
    131 /* ========================================================================== */
    132 
    133 #ifdef OPUS_CHECK_ASM
    134 
    135 /* This part of code simulates floating-point NEON operations. */
    136 
    137 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
    138 /* operations of celt_inner_prod_neon(), and both functions should have bit */
    139 /* exact output.                                                            */
    140 static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
    141 {
    142    int i;
    143    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
    144    for (i = 0; i < N - 3; i += 4) {
    145       xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
    146       xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
    147       xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
    148       xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
    149    }
    150    xy0 += xy2;
    151    xy1 += xy3;
    152    xy = xy0 + xy1;
    153    for (; i < N; i++) {
    154       xy = MAC16_16(xy, x[i], y[i]);
    155    }
    156    return xy;
    157 }
    158 
    159 /* dual_inner_prod_neon_float_c_simulation() simulates the floating-point   */
    160 /* operations of dual_inner_prod_neon(), and both functions should have bit */
    161 /* exact output.                                                            */
    162 static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
    163       int N, opus_val32 *xy1, opus_val32 *xy2)
    164 {
    165    int i;
    166    opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
    167    for (i = 0; i < N - 3; i += 4) {
    168       xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
    169       xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
    170       xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
    171       xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
    172       xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
    173       xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
    174       xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
    175       xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
    176    }
    177    xy01_0 += xy01_2;
    178    xy02_0 += xy02_2;
    179    xy01_1 += xy01_3;
    180    xy02_1 += xy02_3;
    181    xy01 = xy01_0 + xy01_1;
    182    xy02 = xy02_0 + xy02_1;
    183    for (; i < N; i++) {
    184       xy01 = MAC16_16(xy01, x[i], y01[i]);
    185       xy02 = MAC16_16(xy02, x[i], y02[i]);
    186    }
    187    *xy1 = xy01;
    188    *xy2 = xy02;
    189 }
    190 
    191 #endif /* OPUS_CHECK_ASM */
    192 
    193 /* ========================================================================== */
    194 
/* Floating-point inner product: returns sum_{i<N} x[i]*y[i].  Float
 * addition is not associative, so the lane-parallel summation order here
 * differs from a plain sequential sum; under OPUS_CHECK_ASM the result is
 * compared (within VERY_SMALL) against the bit-exact C simulation above. */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
    int i;
    opus_val32 xy;
    float32x4_t xy_f32x4 = vdupq_n_f32(0);  /* four parallel partial sums */
    float32x2_t xy_f32x2;

    /* Main loop: 8 products per iteration (two 4-wide fused steps into
     * the same accumulator vector). */
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y_f32x4;
        x_f32x4  = vld1q_f32(&x[i]);
        y_f32x4  = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        x_f32x4  = vld1q_f32(&x[i + 4]);
        y_f32x4  = vld1q_f32(&y[i + 4]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
    }

    /* One 4-wide step if at least 4 elements remain. */
    if (N - i >= 4) {
        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
        i += 4;
    }

    /* Horizontal reduction: (lane0+lane2) + (lane1+lane3). */
    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
    xy       = vget_lane_f32(xy_f32x2, 0);

    /* Scalar tail for the last 0..3 elements. */
    for (; i < N; i++) {
        xy = MAC16_16(xy, x[i], y[i]);
    }

#ifdef OPUS_CHECK_ASM
    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
#endif

    return xy;
}
    233 
/* Floating-point dual inner product: computes *xy1 = sum x[i]*y01[i] and
 * *xy2 = sum x[i]*y02[i] in one pass, sharing the loads of x[].  The
 * lane-parallel float summation order is checked (within VERY_SMALL)
 * against the bit-exact C simulation above under OPUS_CHECK_ASM. */
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
        int N, opus_val32 *xy1, opus_val32 *xy2)
{
    int i;
    opus_val32 xy01, xy02;
    float32x4_t xy01_f32x4 = vdupq_n_f32(0);  /* partial sums for x . y01 */
    float32x4_t xy02_f32x4 = vdupq_n_f32(0);  /* partial sums for x . y02 */
    float32x2_t xy01_f32x2, xy02_f32x2;

    /* Main loop: 8 elements per iteration, two 4-wide fused steps per
     * accumulator. */
    for (i = 0; i < N - 7; i += 8) {
        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
        x_f32x4    = vld1q_f32(&x[i]);
        y01_f32x4  = vld1q_f32(&y01[i]);
        y02_f32x4  = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        x_f32x4    = vld1q_f32(&x[i + 4]);
        y01_f32x4  = vld1q_f32(&y01[i + 4]);
        y02_f32x4  = vld1q_f32(&y02[i + 4]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
    }

    /* One 4-wide step if at least 4 elements remain. */
    if (N - i >= 4) {
        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
        i += 4;
    }

    /* Horizontal reductions: (lane0+lane2) + (lane1+lane3) for each. */
    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
    xy01       = vget_lane_f32(xy01_f32x2, 0);
    xy02       = vget_lane_f32(xy02_f32x2, 0);

    /* Scalar tail for the last 0..3 elements. */
    for (; i < N; i++) {
        xy01 = MAC16_16(xy01, x[i], y01[i]);
        xy02 = MAC16_16(xy02, x[i], y02[i]);
    }
    *xy1 = xy01;
    *xy2 = xy02;

#ifdef OPUS_CHECK_ASM
    {
        opus_val32 xy1_c, xy2_c;
        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
    }
#endif
}
    289 
    290 #endif /* FIXED_POINT */
    291