// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: optimized NEON specializations of the templates
// in fixedpoint.h.

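// The specializations below let the generic fixed-point arithmetic in
// fixedpoint.h operate on four int32 lanes at a time. As a rough usage
// sketch (assuming the FixedPoint<tRawType, tIntegerBits> wrapper from
// fixedpoint.h; not part of this header's interface):
//
//   using F = gemmlowp::FixedPoint<int32x4_t, 0>;
//   F half = F::FromRaw(vdupq_n_s32(1 << 30));  // ~0.5 in each lane
//   F quarter = half * half;  // operator* dispatches to the
//                             // SaturatingRoundingDoublingHighMul
//                             // specialization below (vqrdmulhq_s32).
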
#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include "fixedpoint.h"

#include <arm_neon.h>

namespace gemmlowp {

template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

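// Bitwise NOT, expressed as XOR with an all-ones vector.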
template <>
inline int32x4_t BitNot(int32x4_t a) {
  return veorq_s32(a, vdupq_n_s32(-1));
}

template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

template <>
inline int32x4_t Neg(int32x4_t a) {
  return vnegq_s32(a);
}

template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(offset));
}

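// NEON has no intrinsic for a variable arithmetic right shift, so shift
// left by a negated per-lane amount instead: vshlq_s32 treats negative
// shift counts as right shifts.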
template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(-offset));
}

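// Bitwise select: for each bit, takes the bit from then_val where the
// corresponding mask bit is 1, and from else_val where it is 0.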
template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  return vbslq_s32(vreinterpretq_u32_s32(if_mask), then_val, else_val);
}

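// NEON comparisons produce unsigned lane masks (all-ones for true,
// all-zeros for false); reinterpret them as int32x4_t to match the
// signed mask convention used by fixedpoint.h.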
template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vceqq_s32(a, b));
}

template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  return BitNot(MaskIfEqual(a, b));
}

template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  return MaskIfEqual(a, vdupq_n_s32(0));
}

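// vtstq_s32(a, a) sets a lane to all-ones iff (a & a) != 0, i.e. iff the
// lane is nonzero.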
template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  return vreinterpretq_s32_u32(vtstq_s32(a, a));
}

template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgtq_s32(a, b));
}

template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgeq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcltq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcleq_s32(a, b));
}

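// Reduce across lanes by AND-ing the vector with rotations of itself
// (vextq_s32); after the two steps, lane 0 holds the AND of all four lanes.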
template <>
inline bool All(int32x4_t a) {
  a = vandq_s32(a, vextq_s32(a, a, 1));
  a = vandq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

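// Same reduction as All, but OR-ing: lane 0 ends up with the OR of all
// four lanes.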
template <>
inline bool Any(int32x4_t a) {
  a = vorrq_s32(a, vextq_s32(a, a, 1));
  a = vorrq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

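// vrhaddq_s32 computes (a + b + 1) >> 1 per lane without overflowing the
// intermediate sum.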
template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

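// vqrdmulhq_s32 returns the high 32 bits of 2*a*b per lane, with rounding,
// saturating the single overflow case INT32_MIN * INT32_MIN to INT32_MAX.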
template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}

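// Multiplication by a power of two: a positive exponent maps to a
// saturating left shift, a negative exponent to a rounding right shift.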
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
};

template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) { return vrshrq_n_s32(x, -Exponent); }
};

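// Tells fixedpoint.h that int32x4_t is a SIMD raw type holding four
// int32 lanes.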
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  typedef int32_t ScalarRawType;
  static const int kLanes = 4;
};

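// Broadcast a scalar to all four lanes.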
template <>
inline int32x4_t Dup<int32x4_t>(int32_t x) {
  return vdupq_n_s32(x);
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_