/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <string.h>

#include "tensorflow/contrib/lite/builtin_op_data.h"
#include "tensorflow/contrib/lite/kernels/activation_functor.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"

#ifdef USE_NEON

#define kFloatWeightsPerNeonLane 4

namespace tflite {
namespace tensor_utils {

void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                             int m_cols, const float* vector,
                                             int n_batch, float* result,
                                             int result_stride) {
  // If m_cols is not divisible by kFloatWeightsPerNeonLane, we cannot use the
  // main vectorized loop, and the trailing elements must be processed
  // sequentially. postamble_start is the index at which that happens.
  const int postamble_start =
      m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));

  // The array used to cache the vector.
  float32x4_t* vector_cache_float32x4 =
      new float32x4_t[m_cols / kFloatWeightsPerNeonLane];
  const int kUnrollSize = 2;
  for (int b = 0; b < n_batch; b++) {
    float* result_in_batch = result + b * m_rows * result_stride;
    const float* vector_in_batch = vector + b * m_cols;

    const float* matrix_ptr0 = matrix;
    // If there is only 1 row, we don't want to assign an illegal pointer.
    const float* matrix_ptr1 = nullptr;
    if (m_rows > 1) {
      matrix_ptr1 = matrix + m_cols;
    }

    // Cache the vector.
    for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
      vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c);
    }

    // Main matrix-by-vector multiplication loop, which handles two rows of
    // the matrix per iteration.
    for (int r = 0; r < (m_rows & ~(kUnrollSize - 1)); r += kUnrollSize) {
      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
      float32x4_t acc1_32x4 = vmovq_n_f32(0.0);
      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
        float32x4_t temp = vector_cache_float32x4[c >> 2];
        // Load 4 float values from each of the two matrix rows.
        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
        float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr1 + c);
        // Multiply-accumulate 4 floats per row.
        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
        acc1_32x4 = vmlaq_f32(acc1_32x4, v1_f32x4, temp);
      }
      // Add the 4 intermediate sum values to get the final dot-product value
      // for each of the two rows.
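      // (This is a horizontal add across the four lanes; on AArch64 it could
      // be a single vaddvq_f32, but extracting lanes also works on ARMv7.)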
      *result_in_batch +=
          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
      *(result_in_batch + result_stride) +=
          (vgetq_lane_f32(acc1_32x4, 0) + vgetq_lane_f32(acc1_32x4, 1) +
           vgetq_lane_f32(acc1_32x4, 2) + vgetq_lane_f32(acc1_32x4, 3));
      // Postamble loop over the remaining columns.
      for (int c = postamble_start; c < m_cols; c++) {
        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
        *(result_in_batch + result_stride) +=
            matrix_ptr1[c] * vector_in_batch[c];
      }
      matrix_ptr0 += kUnrollSize * m_cols;
      matrix_ptr1 += kUnrollSize * m_cols;
      result_in_batch += kUnrollSize * result_stride;
    }
    // Handle the last row if m_rows is odd.
    for (int r = (m_rows & ~(kUnrollSize - 1)); r < m_rows; r++) {
      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
        float32x4_t temp = vector_cache_float32x4[c >> 2];
        // Load 4 float values from the matrix row.
        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
        // Multiply-accumulate 4 floats.
        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
      }
      // Add the 4 intermediate sum values to get the final dot-product value
      // for this row.
      *result_in_batch +=
          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
      for (int c = postamble_start; c < m_cols; c++) {
        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
      }
      matrix_ptr0 += m_cols;
      result_in_batch += result_stride;
    }
  }
  delete[] vector_cache_float32x4;
}

void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                  int v_size, float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
  // main vectorized loop, and the trailing elements must be processed
  // sequentially. postamble_start is the index at which that happens.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from vector1 and vector2.
    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    // Multiply 4 floats.
    float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
    // Save to result array.
    vst1q_f32(&result[v], mul_32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] = vector1[v] * vector2[v];
  }
}

void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
                                            const float* vector2, int v_size,
                                            float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
  // main vectorized loop, and the trailing elements must be processed
  // sequentially. postamble_start is the index at which that happens.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from vector1, vector2, and the accumulator.
    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    float32x4_t acc_32x4 = vld1q_f32(result + v);
    // Multiply-accumulate 4 floats.
    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
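    // (vmlaq_f32 performs the multiply and the accumulate in one intrinsic;
    // on ARMv7 it lowers to VMLA.F32, and on AArch64 compilers may emit a
    // fused FMLA.)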
    // Save to result array.
    vst1q_f32(&result[v], acc_32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] += vector1[v] * vector2[v];
  }
}

void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                 int v_size,
                                                 const float* batch_vector,
                                                 int n_batch, float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
  // main vectorized loop, and the trailing elements must be processed
  // sequentially. postamble_start is the index at which that happens.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));

  // The array used to cache the vector.
  float32x4_t* vector_cache_float32x4 =
      new float32x4_t[v_size / kFloatWeightsPerNeonLane];
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v);
  }

  float* result_ptr = result;
  const float* batch_vector_ptr = batch_vector;
  for (int b = 0; b < n_batch; b++) {
    for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
      // Load from memory to vectors.
      float32x4_t result_f32x4 = vld1q_f32(result_ptr + v);
      float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v);
      // Multiply-accumulate.
      result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4,
                               vector_cache_float32x4[v >> 2]);
      // Store.
      vst1q_f32(result_ptr + v, result_f32x4);
    }
    // Postamble loop.
    for (int v = postamble_start; v < v_size; v++) {
      result_ptr[v] += vector[v] * batch_vector_ptr[v];
    }
    // Update the pointers.
    result_ptr += v_size;
    batch_vector_ptr += v_size;
  }
  delete[] vector_cache_float32x4;
}

void NeonSub1Vector(const float* vector, int v_size, float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
  // main vectorized loop, and the trailing elements must be processed
  // sequentially. postamble_start is the index at which that happens.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));

  float32x4_t one_f32x4 = vmovq_n_f32(1.0);
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from the input and subtract each from 1.
    float32x4_t v_f32x4 = vld1q_f32(vector + v);
    float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
    // Save to output.
    vst1q_f32(result + v, result_f32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] = 1.0f - vector[v];
  }
}

void NeonClipVector(const float* vector, int v_size, float abs_limit,
                    float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
  // main vectorized loop, and the trailing elements must be processed
  // sequentially. postamble_start is the index at which that happens.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));

  // Replicate abs_limit and -abs_limit in two vectors.
  const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit);
  const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit);

  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load from memory to vector.
    float32x4_t v_f32x4 = vld1q_f32(vector + v);
    // Clip between abs_limit and -abs_limit.
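    // (min against +abs_limit followed by max against -abs_limit is the
    // standard branch-free clamp; assuming abs_limit >= 0, the order of the
    // two operations does not matter.)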
    float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4);
    result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4);
    // Save to output.
    vst1q_f32(result + v, result_f32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v];
    result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v];
  }
}

float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot use the
  // main vectorized loop, and the trailing elements must be processed
  // sequentially. postamble_start is the index at which that happens.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  float32x4_t acc_32x4 = vmovq_n_f32(0.0);
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from vector1 and vector2.
    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    // Multiply-accumulate 4 floats.
    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
  }

  float result = (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
                  vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result += vector1[v] * vector2[v];
  }
  return result;
}

void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
                                          const float* vector2, int v_size,
                                          int n_batch, float* result,
                                          int result_stride) {
  float* result_ptr = result;
  const float* vector1_ptr = vector1;
  const float* vector2_ptr = vector2;
  for (int b = 0; b < n_batch; b++) {
    *result_ptr = NeonVectorVectorDotProduct(vector1_ptr, vector2_ptr, v_size);
    vector1_ptr += v_size;
    vector2_ptr += v_size;
    result_ptr += result_stride;
  }
}

void NeonReductionSumVector(const float* input_vector, float* output_vector,
                            int output_size, int reduction_size) {
  // If reduction_size is not divisible by kFloatWeightsPerNeonLane, we cannot
  // use the main vectorized loop, and the trailing elements must be processed
  // sequentially. postamble_start is the index at which that happens. It is
  // loop-invariant, so compute it once, outside the loop.
  const int postamble_start =
      reduction_size - (reduction_size & (kFloatWeightsPerNeonLane - 1));
  const float* input_vector_ptr = input_vector;
  for (int o = 0; o < output_size; o++) {
    float32x4_t sum_f32x4 = vmovq_n_f32(0.0);
    for (int r = 0; r < postamble_start; r += kFloatWeightsPerNeonLane) {
      float32x4_t v1_f32x4 = vld1q_f32(input_vector_ptr + r);
      sum_f32x4 = vaddq_f32(sum_f32x4, v1_f32x4);
    }
    output_vector[o] +=
        (vgetq_lane_f32(sum_f32x4, 0) + vgetq_lane_f32(sum_f32x4, 1) +
         vgetq_lane_f32(sum_f32x4, 2) + vgetq_lane_f32(sum_f32x4, 3));
    input_vector_ptr += postamble_start;

    // Postamble loop.
    for (int r = postamble_start; r < reduction_size; r++) {
      output_vector[o] += *input_vector_ptr++;
    }
  }
}

void NeonVectorShiftLeft(float* vector, int v_size, float shift_value) {
  // last_index_copy is the highest index read by the next vectorized copy;
  // the loop below stops before that index runs past the end of the vector.
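  // (Each iteration loads elements [i + 1, i + 4] and stores them to
  // [i, i + 3]; the load happens before the overlapping store within an
  // iteration, and subsequent loads start past the stored range, so the
  // in-place shift is safe.)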
  int last_index_copy = kFloatWeightsPerNeonLane;
  int current_index_copy = 0;
  while (last_index_copy < v_size) {
    float32x4_t v_f32x4 = vld1q_f32(vector + current_index_copy + 1);
    vst1q_f32(vector + current_index_copy, v_f32x4);
    current_index_copy += kFloatWeightsPerNeonLane;
    last_index_copy += kFloatWeightsPerNeonLane;
  }
  // Postamble loop.
  for (int i = current_index_copy; i < v_size - 1; i++) {
    vector[i] = vector[i + 1];
  }
  vector[v_size - 1] = shift_value;
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // USE_NEON
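// Example usage (an illustrative sketch; the shapes and buffers below are
// assumptions, not taken from any caller in this file). This computes
// results += matrix * vectors for a batch of two input vectors, with a 3x8
// row-major matrix and contiguous results (result_stride == 1):
//
//   float matrix[3 * 8] = {/* row-major weights */};
//   float vectors[2 * 8] = {/* two input vectors */};
//   float results[2 * 3] = {};  // zero-initialize for a plain product
//   tflite::tensor_utils::NeonMatrixBatchVectorMultiplyAccumulate(
//       matrix, /*m_rows=*/3, /*m_cols=*/8, vectors, /*n_batch=*/2,
//       results, /*result_stride=*/1);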