// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: optimized NEON specializations of the templates
// in fixedpoint.h.

#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include <arm_neon.h>

namespace gemmlowp {
// Raw-type traits telling fixedpoint.h how to treat a NEON int32x4_t:
// each vector packs four lanes of std::int32_t.
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  typedef std::int32_t ScalarRawType;
  static const int kLanes = 4;
};

template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

template <>
inline int32x4_t BitNot(int32x4_t a) {
  // XOR with all-ones is a bitwise NOT (equivalent to vmvnq_s32(a)).
  return veorq_s32(a, vdupq_n_s32(-1));
}

template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

template <>
inline int32x4_t Neg(int32x4_t a) {
  return vnegq_s32(a);
}

template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(offset));
}

template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  // NEON has no variable right-shift intrinsic; vshlq_s32 with a negative
  // shift count performs an arithmetic right shift (flooring toward negative
  // infinity, unlike the rounding RoundingDivideByPOT below).
  return vshlq_s32(a, vdupq_n_s32(-offset));
}

template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  // vbslq selects per bit: wherever if_mask has a 1 bit, the result takes
  // the bit from then_val, otherwise from else_val. The masks produced by
  // the MaskIf* functions below are all-ones or all-zeros per lane, so this
  // amounts to a per-lane select.
  return vbslq_s32(vreinterpretq_u32_s32(if_mask), then_val, else_val);
}

// The MaskIf* functions return a per-lane mask: all-ones (-1) in lanes where
// the predicate holds, all-zeros elsewhere. NEON comparisons produce
// uint32x4_t, so the results are reinterpreted back to int32x4_t.
template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vceqq_s32(a, b));
}

template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  return BitNot(MaskIfEqual(a, b));
}

template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  return MaskIfEqual(a, vdupq_n_s32(0));
}

template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  // vtstq_s32(a, a) sets a lane to all-ones iff (a & a) != 0, i.e. a != 0.
  return vreinterpretq_s32_u32(vtstq_s32(a, a));
}

template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgtq_s32(a, b));
}

template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgeq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcltq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcleq_s32(a, b));
}
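
// Usage sketch (illustrative comment only, not part of this header): these
// lane masks pair with SelectUsingMask for branch-free per-lane selection,
// e.g. a per-lane maximum:
//
//   int32x4_t max_ab = SelectUsingMask(MaskIfGreaterThan(a, b), a, b);
//
// On NEON, vmaxq_s32(a, b) computes the same result directly.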

template <>
inline bool All(int32x4_t a) {
  // Horizontal AND across the four lanes: fold with rotations by 1 and 2
  // lanes (vextq_s32), leaving the AND of all four lanes in lane 0.
  a = vandq_s32(a, vextq_s32(a, a, 1));
  a = vandq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

template <>
inline bool Any(int32x4_t a) {
  // Horizontal OR across the four lanes, by the same rotate-and-fold trick.
  a = vorrq_s32(a, vextq_s32(a, a, 1));
  a = vorrq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}
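
// Usage sketch (illustrative comment only): All/Any reduce a lane mask to a
// single bool, e.g. checking that every lane of x is nonnegative:
//
//   bool ok = All(MaskIfGreaterThanOrEqual(x, vdupq_n_s32(0)));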

template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  // Rounding halving add: (a + b + 1) >> 1 per lane, computed without
  // overflow in the intermediate sum.
  return vrhaddq_s32(a, b);
}

template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  // Saturating, rounding, doubling high multiply: the high 32 bits of
  // 2 * a * b, with rounding, saturated to the int32 range.
  return vqrdmulhq_s32(a, b);
}
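
// Worked example (comment only): interpreting lanes as Q0.31 fixed-point,
//
//   SaturatingRoundingDoublingHighMul(vdupq_n_s32(1 << 30),   // 0.5
//                                     vdupq_n_s32(1 << 30))   // 0.5
//
// yields 1 << 29 (0.25) in every lane. The only saturating case is
// a == b == INT32_MIN, which returns INT32_MAX instead of overflowing.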

template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  const int32x4_t shift_vec = vdupq_n_s32(-exponent);
  // For exponent > 0, -exponent has its sign bit set, so x & shift_vec
  // isolates the sign bit of x; arithmetic-shifting it right by 31 yields
  // all-ones for negative x and zero otherwise. Saturating-adding that -1 to
  // negative lanes turns vrshlq's round-half-upward into
  // round-half-away-from-zero.
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed_up_x, shift_vec);
}
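
// Worked example (comment only): RoundingDivideByPOT divides by 2^exponent,
// rounding to nearest with ties away from zero. With exponent = 1,
//
//   lanes {-3, -2, 2, 3}  ->  {-2, -1, 1, 2}
//
// Without the sign fixup, vrshlq_s32 alone rounds ties upward and would map
// -3 to -1.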

// Positive-exponent case: a saturating left shift.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
};

// Negative-exponent case: a rounding right shift, with the same sign fixup
// as RoundingDivideByPOT so that ties round away from zero.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    const int32x4_t fixup = vshrq_n_s32(x, 31);
    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
    return vrshrq_n_s32(fixed_up_x, -Exponent);
  }
};
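
// Worked example (comment only): the generic SaturatingRoundingMultiplyByPOT
// in fixedpoint.h dispatches to the specializations above. With Exponent = 1,
// a lane of 1 << 30 doubles to 1 << 31 and saturates to INT32_MAX; with
// Exponent = -1, a lane of -3 rounds to -2, matching RoundingDivideByPOT(x, 1).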

template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  return vdupq_n_s32(x);
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_