Home | History | Annotate | Download | only in object_tracking
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // NEON implementations of Image methods for compatible devices.  Control
     17 // should never enter this compilation unit on incompatible devices.
     18 
     19 #ifdef __ARM_NEON
     20 
     21 #include <arm_neon.h>
     22 
     23 #include <stdint.h>
     24 
     25 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
     26 #include "tensorflow/examples/android/jni/object_tracking/image.h"
     27 #include "tensorflow/examples/android/jni/object_tracking/image_utils.h"
     28 #include "tensorflow/examples/android/jni/object_tracking/utils.h"
     29 
     30 namespace tf_tracking {
     31 
     32 // This function does the bulk of the work.
     33 template <>
     34 void Image<uint8_t>::Downsample2x32ColumnsNeon(const uint8_t* const original,
     35                                                const int stride,
     36                                                const int orig_x) {
     37   // Divide input x offset by 2 to find output offset.
     38   const int new_x = orig_x >> 1;
     39 
     40   // Initial offset into top row.
     41   const uint8_t* offset = original + orig_x;
     42 
     43   // This points to the leftmost pixel of our 8 horizontally arranged
     44   // pixels in the destination data.
     45   uint8_t* ptr_dst = (*this)[0] + new_x;
     46 
     47   // Sum along vertical columns.
     48   // Process 32x2 input pixels and 16x1 output pixels per iteration.
     49   for (int new_y = 0; new_y < height_; ++new_y) {
     50     uint16x8_t accum1 = vdupq_n_u16(0);
     51     uint16x8_t accum2 = vdupq_n_u16(0);
     52 
     53     // Go top to bottom across the four rows of input pixels that make up
     54     // this output row.
     55     for (int row_num = 0; row_num < 2; ++row_num) {
     56       // First 16 bytes.
     57       {
     58         // Load 16 bytes of data from current offset.
     59         const uint8x16_t curr_data1 = vld1q_u8(offset);
     60 
     61         // Pairwise add and accumulate into accum vectors (16 bit to account
     62         // for values above 255).
     63         accum1 = vpadalq_u8(accum1, curr_data1);
     64       }
     65 
     66       // Second 16 bytes.
     67       {
     68         // Load 16 bytes of data from current offset.
     69         const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
     70 
     71         // Pairwise add and accumulate into accum vectors (16 bit to account
     72         // for values above 255).
     73         accum2 = vpadalq_u8(accum2, curr_data2);
     74       }
     75 
     76       // Move offset down one row.
     77       offset += stride;
     78     }
     79 
     80     // Divide by 4 (number of input pixels per output
     81     // pixel) and narrow data from 16 bits per pixel to 8 bpp.
     82     const uint8x8_t tmp_pix1 = vqshrn_n_u16(accum1, 2);
     83     const uint8x8_t tmp_pix2 = vqshrn_n_u16(accum2, 2);
     84 
     85     // Concatenate 8x1 pixel strips into 16x1 pixel strip.
     86     const uint8x16_t allpixels = vcombine_u8(tmp_pix1, tmp_pix2);
     87 
     88     // Copy all pixels from composite 16x1 vector into output strip.
     89     vst1q_u8(ptr_dst, allpixels);
     90 
     91     ptr_dst += stride_;
     92   }
     93 }
     94 
     95 // This function does the bulk of the work.
     96 template <>
     97 void Image<uint8_t>::Downsample4x32ColumnsNeon(const uint8_t* const original,
     98                                                const int stride,
     99                                                const int orig_x) {
    100   // Divide input x offset by 4 to find output offset.
    101   const int new_x = orig_x >> 2;
    102 
    103   // Initial offset into top row.
    104   const uint8_t* offset = original + orig_x;
    105 
    106   // This points to the leftmost pixel of our 8 horizontally arranged
    107   // pixels in the destination data.
    108   uint8_t* ptr_dst = (*this)[0] + new_x;
    109 
    110   // Sum along vertical columns.
    111   // Process 32x4 input pixels and 8x1 output pixels per iteration.
    112   for (int new_y = 0; new_y < height_; ++new_y) {
    113     uint16x8_t accum1 = vdupq_n_u16(0);
    114     uint16x8_t accum2 = vdupq_n_u16(0);
    115 
    116     // Go top to bottom across the four rows of input pixels that make up
    117     // this output row.
    118     for (int row_num = 0; row_num < 4; ++row_num) {
    119       // First 16 bytes.
    120       {
    121         // Load 16 bytes of data from current offset.
    122         const uint8x16_t curr_data1 = vld1q_u8(offset);
    123 
    124         // Pairwise add and accumulate into accum vectors (16 bit to account
    125         // for values above 255).
    126         accum1 = vpadalq_u8(accum1, curr_data1);
    127       }
    128 
    129       // Second 16 bytes.
    130       {
    131         // Load 16 bytes of data from current offset.
    132         const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
    133 
    134         // Pairwise add and accumulate into accum vectors (16 bit to account
    135         // for values above 255).
    136         accum2 = vpadalq_u8(accum2, curr_data2);
    137       }
    138 
    139       // Move offset down one row.
    140       offset += stride;
    141     }
    142 
    143     // Add and widen, then divide by 16 (number of input pixels per output
    144     // pixel) and narrow data from 32 bits per pixel to 16 bpp.
    145     const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
    146     const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);
    147 
    148     // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
    149     // 16 bits to 8 bits per pixel.
    150     const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));
    151 
    152     // Copy all pixels from composite 8x1 vector into output strip.
    153     vst1_u8(ptr_dst, allpixels);
    154 
    155     ptr_dst += stride_;
    156   }
    157 }
    158 
    159 
    160 // Hardware accelerated downsampling method for supported devices.
    161 // Requires that image size be a multiple of 16 pixels in each dimension,
    162 // and that downsampling be by a factor of 2 or 4.
    163 template <>
    164 void Image<uint8_t>::DownsampleAveragedNeon(const uint8_t* const original,
    165                                             const int stride,
    166                                             const int factor) {
    167   // TODO(andrewharp): stride is a bad approximation for the src image's width.
    168   // Better to pass that in directly.
    169   SCHECK(width_ * factor <= stride, "Uh oh!");
    170   const int last_starting_index = width_ * factor - 32;
    171 
    172   // We process 32 input pixels lengthwise at a time.
    173   // The output per pass of this loop is an 8 wide by downsampled height tall
    174   // pixel strip.
    175   int orig_x = 0;
    176   for (; orig_x <= last_starting_index; orig_x += 32) {
    177     if (factor == 2) {
    178       Downsample2x32ColumnsNeon(original, stride, orig_x);
    179     } else {
    180       Downsample4x32ColumnsNeon(original, stride, orig_x);
    181     }
    182   }
    183 
    184   // If a last pass is required, push it to the left enough so that it never
    185   // goes out of bounds. This will result in some extra computation on devices
    186   // whose frame widths are multiples of 16 and not 32.
    187   if (orig_x < last_starting_index + 32) {
    188     if (factor == 2) {
    189       Downsample2x32ColumnsNeon(original, stride, last_starting_index);
    190     } else {
    191       Downsample4x32ColumnsNeon(original, stride, last_starting_index);
    192     }
    193   }
    194 }
    195 
    196 
    197 // Puts the image gradient matrix about a pixel into the 2x2 float array G.
    198 // vals_x should be an array of the window x gradient values, whose indices
    199 // can be in any order but are parallel to the vals_y entries.
    200 // See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details.
    201 void CalculateGNeon(const float* const vals_x, const float* const vals_y,
    202                     const int num_vals, float* const G) {
    203   const float32_t* const arm_vals_x = (const float32_t*) vals_x;
    204   const float32_t* const arm_vals_y = (const float32_t*) vals_y;
    205 
    206   // Running sums.
    207   float32x4_t xx = vdupq_n_f32(0.0f);
    208   float32x4_t xy = vdupq_n_f32(0.0f);
    209   float32x4_t yy = vdupq_n_f32(0.0f);
    210 
    211   // Maximum index we can load 4 consecutive values from.
    212   // e.g. if there are 81 values, our last full pass can be from index 77:
    213   // 81-4=>77 (77, 78, 79, 80)
    214   const int max_i = num_vals - 4;
    215 
    216   // Defined here because we want to keep track of how many values were
    217   // processed by NEON, so that we can finish off the remainder the normal
    218   // way.
    219   int i = 0;
    220 
    221   // Process values 4 at a time, accumulating the sums of
    222   // the pixel-wise x*x, x*y, and y*y values.
    223   for (; i <= max_i; i += 4) {
    224     // Load xs
    225     float32x4_t x = vld1q_f32(arm_vals_x + i);
    226 
    227     // Multiply x*x and accumulate.
    228     xx = vmlaq_f32(xx, x, x);
    229 
    230     // Load ys
    231     float32x4_t y = vld1q_f32(arm_vals_y + i);
    232 
    233     // Multiply x*y and accumulate.
    234     xy = vmlaq_f32(xy, x, y);
    235 
    236     // Multiply y*y and accumulate.
    237     yy = vmlaq_f32(yy, y, y);
    238   }
    239 
    240   static float32_t xx_vals[4];
    241   static float32_t xy_vals[4];
    242   static float32_t yy_vals[4];
    243 
    244   vst1q_f32(xx_vals, xx);
    245   vst1q_f32(xy_vals, xy);
    246   vst1q_f32(yy_vals, yy);
    247 
    248   // Accumulated values are store in sets of 4, we have to manually add
    249   // the last bits together.
    250   for (int j = 0; j < 4; ++j) {
    251     G[0] += xx_vals[j];
    252     G[1] += xy_vals[j];
    253     G[3] += yy_vals[j];
    254   }
    255 
    256   // Finishes off last few values (< 4) from above.
    257   for (; i < num_vals; ++i) {
    258     G[0] += Square(vals_x[i]);
    259     G[1] += vals_x[i] * vals_y[i];
    260     G[3] += Square(vals_y[i]);
    261   }
    262 
    263   // The matrix is symmetric, so this is a given.
    264   G[2] = G[1];
    265 }
    266 
    267 }  // namespace tf_tracking
    268 
    269 #endif
    270