// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: NEON (int32x4_t) specializations of the generic
// fixed-point templates declared in fixedpoint.h.

#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include <arm_neon.h>

namespace gemmlowp {

// Describes the NEON raw vector type: four 32-bit signed integer lanes.
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  typedef std::int32_t ScalarRawType;
  static const int kLanes = 4;
};

// Lane-wise bitwise AND.
template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

// Lane-wise bitwise OR.
template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

// Lane-wise bitwise XOR.
template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

// Lane-wise bitwise NOT (one's complement).
template <>
inline int32x4_t BitNot(int32x4_t a) {
  return vmvnq_s32(a);
}

// Lane-wise integer addition.
template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

// Lane-wise integer subtraction.
template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

// Lane-wise integer negation.
template <>
inline int32x4_t Neg(int32x4_t a) {
  return vnegq_s32(a);
}

// Lane-wise left shift by a runtime amount.
template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(offset));
}

// Lane-wise right shift by a runtime amount. NEON has no variable
// right-shift instruction, so this shifts left by the negated amount.
template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  const int32x4_t negated_offset = vdupq_n_s32(-offset);
  return vshlq_s32(a, negated_offset);
}

// Per-lane select: picks then_val bits where if_mask bits are set,
// else_val bits elsewhere.
template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  return vbslq_s32(vreinterpretq_u32_s32(if_mask), then_val, else_val);
}

// All-ones mask in lanes where a == b, zero elsewhere.
template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vceqq_s32(a, b));
}

// All-ones mask in lanes where a != b: complement of the equality compare.
template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(a, b)));
}

// All-ones mask in lanes that are zero.
template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  return vreinterpretq_s32_u32(vceqq_s32(a, vdupq_n_s32(0)));
}

// All-ones mask in lanes that are non-zero (VTST of a value with itself).
template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  return vreinterpretq_s32_u32(vtstq_s32(a, a));
}

// All-ones mask in lanes where (signed) a > b.
template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgtq_s32(a, b));
}

// All-ones mask in lanes where (signed) a >= b.
template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgeq_s32(a, b));
}

// All-ones mask in lanes where (signed) a < b.
template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcltq_s32(a, b));
}

// All-ones mask in lanes where (signed) a <= b.
template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcleq_s32(a, b));
}

// Horizontal reduction: true iff the AND of all four lanes is non-zero.
// Two rotate-and-AND steps fold all lanes into lane 0.
template <>
inline bool All(int32x4_t a) {
  a = vandq_s32(a, vextq_s32(a, a, 2));
  a = vandq_s32(a, vextq_s32(a, a, 1));
  return vgetq_lane_s32(a, 0);
}

// Horizontal reduction: true iff the OR of all four lanes is non-zero.
// Two rotate-and-OR steps fold all lanes into lane 0.
template <>
inline bool Any(int32x4_t a) {
  a = vorrq_s32(a, vextq_s32(a, a, 2));
  a = vorrq_s32(a, vextq_s32(a, a, 1));
  return vgetq_lane_s32(a, 0);
}

// Lane-wise rounding halving add: maps to VRHADD.
template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

// Lane-wise saturating rounding doubling high multiply: maps directly
// to VQRDMULH.
template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}

// Lane-wise rounding division by a power of two given as a runtime
// exponent. The fixup term adjusts the rounding of negative values so
// the result matches the reference scalar implementation.
template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  const int32x4_t shift_amount = vdupq_n_s32(-exponent);
  const int32x4_t rounding_fixup =
      vshrq_n_s32(vandq_s32(x, shift_amount), 31);
  const int32x4_t adjusted_x = vqaddq_s32(x, rounding_fixup);
  return vrshlq_s32(adjusted_x, shift_amount);
}

// Multiply by a positive power of two: a saturating left shift by an
// immediate.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
};

// Multiply by a negative power of two: a rounding right shift by an
// immediate, with a sign-dependent fixup applied first so that negative
// values round the same way as the reference implementation.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    const int32x4_t rounding_fixup = vshrq_n_s32(x, 31);
    const int32x4_t adjusted_x = vqaddq_s32(x, rounding_fixup);
    return vrshrq_n_s32(adjusted_x, -Exponent);
  }
};

// Broadcasts a scalar into all four lanes.
template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  return vdupq_n_s32(x);
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_