/*
 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/* This file contains WebRtcIsacfix_MatrixProduct1Neon() and
 * WebRtcIsacfix_MatrixProduct2Neon() for the ARM Neon platform. APIs are in
 * entropy_coding.c. Results are bit exact with the C code for
 * generic platforms.
 */

#include "entropy_coding.h"

#include <arm_neon.h>
#include <assert.h>
#include <stddef.h>

#include "signal_processing_library.h"

// Matrix product used by the entropy coder.
//
// For each of the SUBFRAMES rows it produces mid_loop_count outputs, each a
// saturating inner product of inner_loop_count terms:
//
//   matrix_product[..] = sum_n (matrix0[..] * (matrix1[..] << shift)) >> 16
//
// (see the scalar fallback at the bottom, which spells this out with
// WEBRTC_SPL_MUL_16_32_RSFT16). The Neon branches obtain the same value via
// vqdmulh(q)_s32: with the 16-bit matrix0 value pre-shifted into bit 15..30,
// the "doubling high half" (2*a*b) >> 32 equals (matrix0 * matrix1') >> 16.
//
// Arguments:
//  matrix0                  - 16-bit input matrix.
//  matrix1                  - 32-bit input matrix; each entry is shifted left
//                             by |shift| before the multiply.
//  matrix_product           - output; SUBFRAMES * mid_loop_count int32 values.
//  matrix1_index_factor1 /
//  matrix0_index_factor1    - scale applied to the per-row start index of the
//                             respective matrix.
//  matrix1_index_init_case  - selects whether matrix1's start index is driven
//                             by the outer loop variable (j) or the middle
//                             loop variable (k); matrix0 gets the other one.
//  matrix1_index_step /
//  matrix0_index_step       - per-term stride through each matrix.
//  inner_loop_count         - number of accumulated terms; must be even.
//  mid_loop_count           - outputs per row; must be even.
//  shift                    - left shift applied to matrix1 entries.
//
// Three common traversal patterns have dedicated vectorized branches; every
// other parameter combination falls through to the scalar reference loop.
void WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix1_index_factor1,
                                      const int matrix0_index_factor1,
                                      const int matrix1_index_init_case,
                                      const int matrix1_index_step,
                                      const int matrix0_index_step,
                                      const int inner_loop_count,
                                      const int mid_loop_count,
                                      const int shift) {
  int j = 0, k = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  // Pointers select which loop variable (j = outer row, k = middle column)
  // seeds each matrix's start index; used only by the scalar fallback branch.
  int* matrix1_index_factor2 = &j;
  int* matrix0_index_factor2 = &k;
  if (matrix1_index_init_case != 0) {
    matrix1_index_factor2 = &k;
    matrix0_index_factor2 = &j;
  }
  int32x4_t shift32x4 = vdupq_n_s32(shift);
  int32x2_t shift32x2 = vdup_n_s32(shift);
  int32x4_t sum_32x4 = vdupq_n_s32(0);
  int32x2_t sum_32x2 = vdup_n_s32(0);

  // Even counts guarantee that the "% 4 > 1" remainder paths below have
  // exactly zero or two leftover elements, never one or three.
  assert(inner_loop_count % 2 == 0);
  assert(mid_loop_count % 2 == 0);

  if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) {
    // Branch 1: matrix1 is indexed directly by the output column (k), so four
    // consecutive outputs can be computed at once by loading four consecutive
    // matrix1 entries per inner-loop term; matrix0 contributes one scalar
    // (broadcast) per term.
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      // Main loop: columns in groups of four.
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = k;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          // matrix0 value placed in bits 15..30 so that the doubling high
          // half of the product equals (matrix0 * matrix1') >> 16.
          int32x4_t matrix0_32x4 =
              vdupq_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      // Remainder: two leftover columns (guaranteed a pair by the asserts).
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = k;
        k += 2;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix0_32x2 =
              vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  else if (matrix1_index_init_case == 0 && matrix0_index_factor1 == 1) {
    // Branch 2: mirror of branch 1 — matrix0 is indexed directly by the
    // output column (k), so four consecutive 16-bit matrix0 entries are
    // widened per term while the matrix1 value is broadcast.
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix1_32x4 = vdupq_n_s32(matrix1[matrix1_index] << shift);
          // Widen four int16 values and shift into bits 15..30 in one step.
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
      // Remainder: the final pair of columns.
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix1_32x2 = vdup_n_s32(matrix1[matrix1_index] << shift);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  else if (matrix1_index_init_case == 0 &&
           matrix1_index_step == 1 &&
           matrix0_index_step == 1) {
    // Branch 3: both matrices are traversed contiguously within the inner
    // loop, so the vectorization runs along the inner (accumulation)
    // dimension: four terms per iteration, then a horizontal reduction of the
    // four partial sums into one output.
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = matrix0_index_factor1 * k;
        // Accumulate four inner-loop terms per iteration.
        for (n = 0; n < (inner_loop_count >> 2) << 2; n += 4) {
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += 4;
          matrix0_index += 4;
        }
        // Fold the four partial sums down to two lanes...
        sum_32x2 = vqadd_s32(vget_low_s32(sum_32x4), vget_high_s32(sum_32x4));
        // ...absorbing the leftover pair of terms, if any, on the way.
        if (inner_loop_count % 4 > 1) {
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
        }
        // Final pairwise add reduces two lanes to the single output value.
        sum_32x2 = vpadd_s32(sum_32x2, sum_32x2);
        vst1_lane_s32(&matrix_product[matrix_prod_index], sum_32x2, 0);
        matrix_prod_index++;
      }
    }
  }
  else {
    // Scalar reference fallback for any other parameter combination. The
    // start indices are seeded from whichever loop variable the pointers
    // selected at the top of the function. NOTE: this path accumulates with
    // plain (wrapping) int32 addition, whereas the Neon paths above use
    // saturating adds — bit exactness between them presumably relies on the
    // callers in entropy_coding.c never producing saturating sums.
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k=0; k < mid_loop_count; k++) {
        int32_t sum32 = 0;
        matrix1_index = matrix1_index_factor1 * (*matrix1_index_factor2);
        matrix0_index = matrix0_index_factor1 * (*matrix0_index_factor2);
        for (n = 0; n < inner_loop_count; n++) {
          sum32 += (WEBRTC_SPL_MUL_16_32_RSFT16(matrix0[matrix0_index],
                                                matrix1[matrix1_index] << shift));
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        matrix_product[matrix_prod_index] = sum32;
        matrix_prod_index++;
      }
    }
  }
}

// Second matrix product used by the entropy coder.
//
// For each subframe j it computes two outputs at once: a saturating sum over
// SUBFRAMES terms of (matrix0 value, Q15-positioned) times a pair of
// consecutive matrix1 entries (the pair advances by 2 each term), i.e.
//   sum += (matrix0[..] * matrix1[2n + lane]) >> 16   (saturating)
// and the final two sums are arithmetic-shifted right by 3 before being
// stored.
//
// Arguments:
//  matrix0              - 16-bit input matrix.
//  matrix1              - 32-bit input matrix, read as consecutive pairs.
//  matrix_product       - output; 2 * SUBFRAMES int32 values.
//  matrix0_index_factor - scale of the per-subframe start index into matrix0.
//  matrix0_index_step   - per-term stride through matrix0.
void WebRtcIsacfix_MatrixProduct2Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix0_index_factor,
                                      const int matrix0_index_step) {
  int j = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  int32x2_t sum_32x2 = vdup_n_s32(0);
  for (j = 0; j < SUBFRAMES; j++) {
    sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
    matrix1_index = 0;
    matrix0_index = matrix0_index_factor * j;
    for (n = SUBFRAMES; n > 0; n--) {
      // Bits 15..30 placement makes vqdmulh yield (matrix0 * matrix1) >> 16.
      int32x2_t matrix0_32x2 =
          vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
      int32x2_t matrix1_32x2 = vld1_s32(&matrix1[matrix1_index]);
      int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
      sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
      matrix1_index += 2;
      matrix0_index += matrix0_index_step;
    }
    // Scale both accumulated sums down by 2^3 before storing.
    sum_32x2 = vshr_n_s32(sum_32x2, 3);
    vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
    matrix_prod_index += 2;
  }
}