Home | History | Annotate | Download | only in object_tracking
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // NEON implementations of Image methods for compatible devices.  Control
     17 // should never enter this compilation unit on incompatible devices.
     18 
     19 #ifdef __ARM_NEON
     20 
     21 #include <arm_neon.h>
     22 
     23 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
     24 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
     25 #include "tensorflow/examples/android/jni/object_tracking/image.h"
     26 #include "tensorflow/examples/android/jni/object_tracking/utils.h"
     27 
     28 namespace tf_tracking {
     29 
     30 inline static float GetSum(const float32x4_t& values) {
     31   static float32_t summed_values[4];
     32   vst1q_f32(summed_values, values);
     33   return summed_values[0]
     34        + summed_values[1]
     35        + summed_values[2]
     36        + summed_values[3];
     37 }
     38 
     39 
     40 float ComputeMeanNeon(const float* const values, const int num_vals) {
     41   SCHECK(num_vals >= 8, "Not enough values to merit NEON: %d", num_vals);
     42 
     43   const float32_t* const arm_vals = (const float32_t* const) values;
     44   float32x4_t accum = vdupq_n_f32(0.0f);
     45 
     46   int offset = 0;
     47   for (; offset <= num_vals - 4; offset += 4) {
     48     accum = vaddq_f32(accum, vld1q_f32(&arm_vals[offset]));
     49   }
     50 
     51   // Pull the accumulated values into a single variable.
     52   float sum = GetSum(accum);
     53 
     54   // Get the remaining 1 to 3 values.
     55   for (; offset < num_vals; ++offset) {
     56     sum += values[offset];
     57   }
     58 
     59   const float mean_neon = sum / static_cast<float>(num_vals);
     60 
     61 #ifdef SANITY_CHECKS
     62   const float mean_cpu = ComputeMeanCpu(values, num_vals);
     63   SCHECK(NearlyEqual(mean_neon, mean_cpu, EPSILON * num_vals),
     64         "Neon mismatch with CPU mean! %.10f vs %.10f",
     65         mean_neon, mean_cpu);
     66 #endif
     67 
     68   return mean_neon;
     69 }
     70 
     71 
     72 float ComputeStdDevNeon(const float* const values,
     73                         const int num_vals, const float mean) {
     74   SCHECK(num_vals >= 8, "Not enough values to merit NEON: %d", num_vals);
     75 
     76   const float32_t* const arm_vals = (const float32_t* const) values;
     77   const float32x4_t mean_vec = vdupq_n_f32(-mean);
     78 
     79   float32x4_t accum = vdupq_n_f32(0.0f);
     80 
     81   int offset = 0;
     82   for (; offset <= num_vals - 4; offset += 4) {
     83     const float32x4_t deltas =
     84         vaddq_f32(mean_vec, vld1q_f32(&arm_vals[offset]));
     85 
     86     accum = vmlaq_f32(accum, deltas, deltas);
     87   }
     88 
     89   // Pull the accumulated values into a single variable.
     90   float squared_sum = GetSum(accum);
     91 
     92   // Get the remaining 1 to 3 values.
     93   for (; offset < num_vals; ++offset) {
     94     squared_sum += Square(values[offset] - mean);
     95   }
     96 
     97   const float std_dev_neon = sqrt(squared_sum / static_cast<float>(num_vals));
     98 
     99 #ifdef SANITY_CHECKS
    100   const float std_dev_cpu = ComputeStdDevCpu(values, num_vals, mean);
    101   SCHECK(NearlyEqual(std_dev_neon, std_dev_cpu, EPSILON * num_vals),
    102         "Neon mismatch with CPU std dev! %.10f vs %.10f",
    103         std_dev_neon, std_dev_cpu);
    104 #endif
    105 
    106   return std_dev_neon;
    107 }
    108 
    109 
    110 float ComputeCrossCorrelationNeon(const float* const values1,
    111                                   const float* const values2,
    112                                   const int num_vals) {
    113   SCHECK(num_vals >= 8, "Not enough values to merit NEON: %d", num_vals);
    114 
    115   const float32_t* const arm_vals1 = (const float32_t* const) values1;
    116   const float32_t* const arm_vals2 = (const float32_t* const) values2;
    117 
    118   float32x4_t accum = vdupq_n_f32(0.0f);
    119 
    120   int offset = 0;
    121   for (; offset <= num_vals - 4; offset += 4) {
    122     accum = vmlaq_f32(accum,
    123                       vld1q_f32(&arm_vals1[offset]),
    124                       vld1q_f32(&arm_vals2[offset]));
    125   }
    126 
    127   // Pull the accumulated values into a single variable.
    128   float sxy = GetSum(accum);
    129 
    130   // Get the remaining 1 to 3 values.
    131   for (; offset < num_vals; ++offset) {
    132     sxy += values1[offset] * values2[offset];
    133   }
    134 
    135   const float cross_correlation_neon = sxy / num_vals;
    136 
    137 #ifdef SANITY_CHECKS
    138   const float cross_correlation_cpu =
    139       ComputeCrossCorrelationCpu(values1, values2, num_vals);
    140   SCHECK(NearlyEqual(cross_correlation_neon, cross_correlation_cpu,
    141                     EPSILON * num_vals),
    142         "Neon mismatch with CPU cross correlation! %.10f vs %.10f",
    143         cross_correlation_neon, cross_correlation_cpu);
    144 #endif
    145 
    146   return cross_correlation_neon;
    147 }
    148 
    149 }  // namespace tf_tracking
    150 
    151 #endif  // __ARM_NEON
    152