Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 /* This file contains WebRtcIsacfix_MatrixProduct1Neon() and
     12  * WebRtcIsacfix_MatrixProduct2Neon() for ARM Neon platform. API's are in
     13  * entropy_coding.c. Results are bit exact with the c code for
     14  * generic platforms.
     15  */
     16 
     17 #include "entropy_coding.h"
     18 
     19 #include <arm_neon.h>
     20 #include <assert.h>
     21 #include <stddef.h>
     22 
     23 #include "signal_processing_library.h"
     24 
/* Matrix product over SUBFRAMES subframes: matrix_product = matrix0 * matrix1.
 *
 * Each scalar term is (matrix0[i] * (matrix1[j] << shift)) >> 16 with
 * saturation, accumulated with saturating adds; see the scalar fallback
 * branch (WEBRTC_SPL_MUL_16_32_RSFT16), which is the reference this NEON
 * code must stay bit exact with.
 *
 * Index parameters (as used by the scalar fallback; the NEON branches are
 * specializations of particular parameter combinations):
 *   matrix1_index_factor1 / matrix0_index_factor1: multipliers applied to a
 *     loop index to form the start offset of each inner loop.
 *   matrix1_index_init_case: when non-zero, matrix1's start offset is driven
 *     by the mid-loop index k and matrix0's by the outer index j; when zero,
 *     the roles are swapped (implemented via the two pointers below).
 *   matrix1_index_step / matrix0_index_step: per-iteration strides inside
 *     the inner loop.
 *   inner_loop_count / mid_loop_count: trip counts; both must be even so the
 *     2-wide remainder handling below is sufficient.
 *   shift: left pre-shift applied to each matrix1 value before multiplying.
 */
void WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix1_index_factor1,
                                      const int matrix0_index_factor1,
                                      const int matrix1_index_init_case,
                                      const int matrix1_index_step,
                                      const int matrix0_index_step,
                                      const int inner_loop_count,
                                      const int mid_loop_count,
                                      const int shift) {
  int j = 0, k = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  /* Pointer indirection selects which loop index (j or k) seeds each
   * matrix's start offset; only the scalar fallback branch reads these. */
  int* matrix1_index_factor2 = &j;
  int* matrix0_index_factor2 = &k;
  if (matrix1_index_init_case != 0) {
    matrix1_index_factor2 = &k;
    matrix0_index_factor2 = &j;
  }
  int32x4_t shift32x4 = vdupq_n_s32(shift);
  int32x2_t shift32x2 = vdup_n_s32(shift);
  int32x4_t sum_32x4 =  vdupq_n_s32(0);
  int32x2_t sum_32x2 =  vdup_n_s32(0);

  /* Even trip counts guarantee the leftover after 4-wide vectorization is
   * exactly 0 or 2 elements, handled by the 2-lane tail code. */
  assert(inner_loop_count % 2 == 0);
  assert(mid_loop_count % 2 == 0);

  /* Case 1: matrix1 is indexed directly by k (unit factor), so four
   * consecutive matrix1 values are loaded per mid-loop step while the
   * matrix0 coefficient is splatted across all lanes.
   * Note: vqdmulhq_s32(a, b) == saturate((a * b * 2) >> 32); with
   * a == matrix0 << 15 this yields (matrix0 * matrix1) >> 16, matching the
   * scalar reference. */
  if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) {
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      /* 4-wide main loop over the largest multiple of 4 <= mid_loop_count. */
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = k;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix0_32x4 =
              vdupq_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);  // Saturating add.
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      /* 2-wide tail: mid_loop_count is even, so the remainder is 0 or 2. */
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = k;
        k += 2;  // Advance past the pair handled by this tail.
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix0_32x2 =
              vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  /* Case 2: matrix0 is indexed directly by k (unit factor), so four
   * consecutive int16 coefficients are loaded and widened (vshll_n_s16
   * gives matrix0 << 15 in 32 bits) while the pre-shifted matrix1 value is
   * splatted. */
  else if (matrix1_index_init_case == 0 && matrix0_index_factor1 == 1) {
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix1_32x4 = vdupq_n_s32(matrix1[matrix1_index] << shift);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      /* 2-wide tail (remainder of 2): the two int16 coefficients are
       * inserted lane by lane, then shifted left 15 to match the Q-domain
       * of the 4-wide path. */
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix1_32x2 = vdup_n_s32(matrix1[matrix1_index] << shift);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                     matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  /* Case 3: both operands stride by 1 inside the inner loop, so the
   * vectorization runs along n instead of k: accumulate four lanes, then
   * reduce them to a single scalar per (j, k) output. */
  else if (matrix1_index_init_case == 0 &&
           matrix1_index_step == 1 &&
           matrix0_index_step == 1) {
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = matrix0_index_factor1 * k;
        for (n = 0; n < (inner_loop_count >> 2) << 2; n += 4) {
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += 4;
          matrix0_index += 4;
        }
        /* Fold 4 lanes to 2 with a saturating add. */
        sum_32x2 = vqadd_s32(vget_low_s32(sum_32x4), vget_high_s32(sum_32x4));
        /* Inner-loop tail of 2 (inner_loop_count is even). */
        if (inner_loop_count % 4 > 1) {
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                     matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
        }
        /* Pairwise add collapses the 2 lanes; lane 0 is the final sum. */
        sum_32x2 = vpadd_s32(sum_32x2, sum_32x2);
        vst1_lane_s32(&matrix_product[matrix_prod_index], sum_32x2, 0);
        matrix_prod_index++;
      }
    }
  }
  /* Generic scalar fallback for parameter combinations not covered above;
   * this is the bit-exact reference behavior. */
  else {
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k=0; k < mid_loop_count; k++) {
        int32_t sum32 = 0;
        matrix1_index = matrix1_index_factor1 * (*matrix1_index_factor2);
        matrix0_index = matrix0_index_factor1 * (*matrix0_index_factor2);
        for (n = 0; n < inner_loop_count; n++) {
          sum32 += (WEBRTC_SPL_MUL_16_32_RSFT16(matrix0[matrix0_index],
              matrix1[matrix1_index] << shift));
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        matrix_product[matrix_prod_index] = sum32;
        matrix_prod_index++;
      }
    }
  }
}
    192 
/* Matrix product producing two output values per subframe.
 *
 * For each of the SUBFRAMES rows j, accumulates over SUBFRAMES terms the
 * saturated products of one matrix0 coefficient with a pair of consecutive
 * matrix1 values, then arithmetic-shifts the pair right by 3 before storing.
 * As above, vqdmulh_s32 with matrix0 << 15 computes
 * saturate((matrix0 * matrix1) >> 16) in both lanes.
 *
 *   matrix0_index_factor: start offset of matrix0 for row j is
 *     matrix0_index_factor * j.
 *   matrix0_index_step: matrix0 stride per accumulation step (matrix1
 *     always advances by 2, one pair per step).
 */
void WebRtcIsacfix_MatrixProduct2Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix0_index_factor,
                                      const int matrix0_index_step) {
  int j = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  int32x2_t sum_32x2 = vdup_n_s32(0);
  for (j = 0; j < SUBFRAMES; j++) {
    sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
    matrix1_index = 0;
    matrix0_index = matrix0_index_factor * j;
    for (n = SUBFRAMES; n > 0; n--) {
      /* Splat the coefficient (<< 15) and load one pair of matrix1 values. */
      int32x2_t matrix0_32x2 =
          vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
      int32x2_t matrix1_32x2 = vld1_s32(&matrix1[matrix1_index]);
      int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
      sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);  // Saturating accumulate.
      matrix1_index += 2;
      matrix0_index += matrix0_index_step;
    }
    /* Scale both accumulated lanes down by 2^3 and store the pair. */
    sum_32x2 = vshr_n_s32(sum_32x2, 3);
    vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
    matrix_prod_index += 2;
  }
}
    219