/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <string.h>

#include "tensorflow/contrib/lite/builtin_op_data.h"
#include "tensorflow/contrib/lite/kernels/activation_functor.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"

#ifdef USE_NEON

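// A 128-bit NEON register (float32x4_t) holds four 32-bit floats, which is
// why the vectorized loops below consume four weights per iteration.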
#define kFloatWeightsPerNeonLane 4

namespace tflite {
namespace tensor_utils {

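// Multiply-accumulates an (m_rows x m_cols) row-major matrix with each of the
// n_batch input vectors. For each batch b and row r this computes
//   result[(b * m_rows + r) * result_stride] += dot(row r of matrix, vector b).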
void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                             int m_cols, const float* vector,
                                             int n_batch, float* result,
                                             int result_stride) {
  // If m_cols is not divisible by kFloatWeightsPerNeonLane, the main
  // vectorized loop cannot process the tail elements, which are handled
  // sequentially. postamble_start is the index where that tail begins.
  const int postamble_start =
      m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));

  // Cache for the input vector, one float32x4_t per group of four columns.
  // Note that new[] already scales the allocation by sizeof(float32x4_t), so
  // the element count must not be multiplied by it again.
  float32x4_t* vector_cache_float32x4 =
      new float32x4_t[m_cols / kFloatWeightsPerNeonLane];
  const int kUnrollSize = 2;
  for (int b = 0; b < n_batch; b++) {
    float* result_in_batch = result + b * m_rows * result_stride;
    const float* vector_in_batch = vector + b * m_cols;

    const float* matrix_ptr0 = matrix;
    // If there is only 1 row, we don't want to assign an illegal pointer.
    const float* matrix_ptr1 = nullptr;
    if (m_rows > 1) {
      matrix_ptr1 = matrix + m_cols;
    }

    // Cache the vector.
    for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
      vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c);
    }

    // Main matrix-by-vector multiplication loop, which processes two matrix
    // rows per iteration.
    for (int r = 0; r < (m_rows & ~(kUnrollSize - 1)); r += kUnrollSize) {
      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
      float32x4_t acc1_32x4 = vmovq_n_f32(0.0);
      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
        float32x4_t temp = vector_cache_float32x4[c >> 2];
        // Load 4 float values from each of the two matrix rows.
        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
        float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr1 + c);
        // Vector multiply-accumulate 4 floats.
        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
        acc1_32x4 = vmlaq_f32(acc1_32x4, v1_f32x4, temp);
      }
      // Add the 4 intermediate sums to get the final dot-product value for
      // each of the two rows.
      *result_in_batch +=
          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
      *(result_in_batch + result_stride) +=
          (vgetq_lane_f32(acc1_32x4, 0) + vgetq_lane_f32(acc1_32x4, 1) +
           vgetq_lane_f32(acc1_32x4, 2) + vgetq_lane_f32(acc1_32x4, 3));
      // Postamble loop over the remaining columns.
      for (int c = postamble_start; c < m_cols; c++) {
        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
        *(result_in_batch + result_stride) +=
            matrix_ptr1[c] * vector_in_batch[c];
      }
      matrix_ptr0 += kUnrollSize * m_cols;
      matrix_ptr1 += kUnrollSize * m_cols;
      result_in_batch += kUnrollSize * result_stride;
    }
    // Process the remaining row if m_rows is odd.
    for (int r = (m_rows & ~(kUnrollSize - 1)); r < m_rows; r++) {
      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
        float32x4_t temp = vector_cache_float32x4[c >> 2];
        // Load 4 float values from the matrix row.
        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
        // Vector multiply-accumulate 4 floats.
        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
      }
      // Add the 4 intermediate sums to get the final dot-product value for
      // this row.
      *result_in_batch +=
          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
      for (int c = postamble_start; c < m_cols; c++) {
        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
      }
      matrix_ptr0 += m_cols;
      result_in_batch += result_stride;
    }
  }
  delete[] vector_cache_float32x4;
}

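// Element-wise product: result[v] = vector1[v] * vector2[v].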
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                  int v_size, float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the main
  // vectorized loop cannot process the tail elements, which are handled
  // sequentially. postamble_start is the index where that tail begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from vector1 and vector2.
    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    // Vector multiply 4 floats.
    float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
    // Save to result array.
    vst1q_f32(&result[v], mul_32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] = vector1[v] * vector2[v];
  }
}

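// Element-wise multiply-accumulate: result[v] += vector1[v] * vector2[v].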
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
                                            const float* vector2, int v_size,
                                            float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the main
  // vectorized loop cannot process the tail elements, which are handled
  // sequentially. postamble_start is the index where that tail begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from vector1, vector2 and the accumulator.
    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    float32x4_t acc_32x4 = vld1q_f32(result + v);
    // Vector multiply-accumulate 4 floats.
    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
    // Save to result array.
    vst1q_f32(&result[v], acc_32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] += vector1[v] * vector2[v];
  }
}

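// Multiply-accumulates the v_size-element vector against each of the n_batch
// slices of batch_vector:
//   result[b * v_size + v] += vector[v] * batch_vector[b * v_size + v].
// The shared vector is loaded into a small cache once and reused per batch.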
void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                 int v_size,
                                                 const float* batch_vector,
                                                 int n_batch, float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the main
  // vectorized loop cannot process the tail elements, which are handled
  // sequentially. postamble_start is the index where that tail begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));

  // Cache for the input vector, one float32x4_t per group of four elements.
  // As above, new[] already scales the allocation by sizeof(float32x4_t).
  float32x4_t* vector_cache_float32x4 =
      new float32x4_t[v_size / kFloatWeightsPerNeonLane];
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v);
  }

  float* result_ptr = result;
  const float* batch_vector_ptr = batch_vector;
  for (int b = 0; b < n_batch; b++) {
    for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
      // Load from memory to vectors.
      float32x4_t result_f32x4 = vld1q_f32(result_ptr + v);
      float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v);
      // Multiply-accumulate.
      result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4,
                               vector_cache_float32x4[v >> 2]);
      // Store.
      vst1q_f32(result_ptr + v, result_f32x4);
    }
    // Postamble loop.
    for (int v = postamble_start; v < v_size; v++) {
      result_ptr[v] += vector[v] * batch_vector_ptr[v];
    }
    // Update the pointers.
    result_ptr += v_size;
    batch_vector_ptr += v_size;
  }
  delete[] vector_cache_float32x4;
}

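// Computes result[v] = 1.0f - vector[v] for each element.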
void NeonSub1Vector(const float* vector, int v_size, float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the main
  // vectorized loop cannot process the tail elements, which are handled
  // sequentially. postamble_start is the index where that tail begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));

  float32x4_t one_f32x4 = vmovq_n_f32(1.0);
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from the current position of the input and
    // subtract them from 1.
    float32x4_t v_f32x4 = vld1q_f32(vector + v);
    float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
    // Save to output.
    vst1q_f32(result + v, result_f32x4);
  }
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] = 1.0f - vector[v];
  }
}

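// Clamps each element of the vector to the range [-abs_limit, abs_limit].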
void NeonClipVector(const float* vector, int v_size, float abs_limit,
                    float* result) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the main
  // vectorized loop cannot process the tail elements, which are handled
  // sequentially. postamble_start is the index where that tail begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));

  // Replicate abs_limit and -abs_limit in two vectors.
  const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit);
  const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit);

  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load from memory to vector.
    float32x4_t v_f32x4 = vld1q_f32(vector + v);
    // Clip between abs_limit and -abs_limit.
    float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4);
    result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4);
    // Save to output.
    vst1q_f32(result + v, result_f32x4);
  }
  // Postamble loop: min with abs_limit, then max with -abs_limit.
  for (int v = postamble_start; v < v_size; v++) {
    result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v];
    result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v];
  }
}

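// Returns the dot product of vector1 and vector2, both of size v_size.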
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size) {
  // If v_size is not divisible by kFloatWeightsPerNeonLane, the main
  // vectorized loop cannot process the tail elements, which are handled
  // sequentially. postamble_start is the index where that tail begins.
  const int postamble_start =
      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  float32x4_t acc_32x4 = vmovq_n_f32(0.0);
  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
    // Load 4 float values from vector1 and vector2.
    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    // Vector multiply-accumulate 4 floats.
    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
  }

  // Add the 4 lane sums to get the partial dot product.
  float result = (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
                  vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
  // Postamble loop.
  for (int v = postamble_start; v < v_size; v++) {
    result += vector1[v] * vector2[v];
  }
  return result;
}

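// Computes a dot product for each of the n_batch pairs of vectors and writes
// the b-th result to result[b * result_stride].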
void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
                                          const float* vector2, int v_size,
                                          int n_batch, float* result,
                                          int result_stride) {
  float* result_ptr = result;
  const float* vector1_ptr = vector1;
  const float* vector2_ptr = vector2;
  for (int b = 0; b < n_batch; b++) {
    *result_ptr = NeonVectorVectorDotProduct(vector1_ptr, vector2_ptr, v_size);
    vector1_ptr += v_size;
    vector2_ptr += v_size;
    result_ptr += result_stride;
  }
}

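// Treats input_vector as output_size rows of reduction_size elements each and
// accumulates the sum of each row into the corresponding output element.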
void NeonReductionSumVector(const float* input_vector, float* output_vector,
                            int output_size, int reduction_size) {
  // If reduction_size is not divisible by kFloatWeightsPerNeonLane, the main
  // vectorized loop cannot process the tail elements, which are handled
  // sequentially. postamble_start is the index where that tail begins.
  const int postamble_start =
      reduction_size - (reduction_size & (kFloatWeightsPerNeonLane - 1));
  const float* input_vector_ptr = input_vector;
  for (int o = 0; o < output_size; o++) {
    float32x4_t sum_f32x4 = vmovq_n_f32(0.0);
    for (int r = 0; r < postamble_start; r += kFloatWeightsPerNeonLane) {
      float32x4_t v1_f32x4 = vld1q_f32(input_vector_ptr + r);
      sum_f32x4 = vaddq_f32(sum_f32x4, v1_f32x4);
    }
    // Add the 4 lane sums to the output.
    output_vector[o] +=
        (vgetq_lane_f32(sum_f32x4, 0) + vgetq_lane_f32(sum_f32x4, 1) +
         vgetq_lane_f32(sum_f32x4, 2) + vgetq_lane_f32(sum_f32x4, 3));
    input_vector_ptr += postamble_start;

    // Postamble loop.
    for (int r = postamble_start; r < reduction_size; r++) {
      output_vector[o] += *input_vector_ptr++;
    }
  }
}

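// Shifts the vector left by one position (vector[i] = vector[i + 1]) and
// stores shift_value into the freed last element.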
void NeonVectorShiftLeft(float* vector, int v_size, float shift_value) {
  // last_index_copy tracks the highest index read by the next NEON copy, to
  // make sure the loads stay within the vector's bounds.
  int last_index_copy = kFloatWeightsPerNeonLane;
  int current_index_copy = 0;
  while (last_index_copy < v_size) {
    // Load elements [i + 1, i + 4] and store them at [i, i + 3]. The load
    // completes before the overlapping store, so this is safe.
    float32x4_t v_f32x4 = vld1q_f32(vector + current_index_copy + 1);
    vst1q_f32(vector + current_index_copy, v_f32x4);
    current_index_copy += kFloatWeightsPerNeonLane;
    last_index_copy += kFloatWeightsPerNeonLane;
  }
  // Postamble loop.
  for (int i = current_index_copy; i < v_size - 1; i++) {
    vector[i] = vector[i + 1];
  }
  vector[v_size - 1] = shift_value;
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // USE_NEON