Home | History | Annotate | Download | only in platform
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/platform/denormal.h"
     17 #include "third_party/eigen3/Eigen/Core"
     18 #include "tensorflow/core/platform/cpu_info.h"
     19 #include "tensorflow/core/platform/logging.h"
     20 #include "tensorflow/core/platform/platform.h"
// If we're on gcc 4.8 or older, there's a known bug that prevents the use of
// intrinsics when the architecture is not defined in the flags. See
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57202
//
// GCC_WITHOUT_INTRINSICS is defined when all of the following hold:
//   * __SSE3__ is not defined,
//   * the compiler is not clang (__clang__ is not defined), and
//   * the GCC version is older than 4.9.
// Note on the version test: `&&` binds tighter than `||`, so it groups as
//   (defined(__GNUC__) && __GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 9).
// An undefined identifier evaluates to 0 inside #if, so the right-hand operand
// is false when __GNUC__ is not defined at all.
#if !defined(__SSE3__) && !defined(__clang__) && \
    (defined(__GNUC__) && (__GNUC__ < 4) ||      \
     ((__GNUC__ == 4) && (__GNUC_MINOR__ < 9)))
#define GCC_WITHOUT_INTRINSICS
#endif
// Only try to use SSE3 instructions if we're on an x86 platform, and it's not
// mobile, and we're not on a known bad gcc version.
//
// DENORM_USE_INTRINSICS gates both the <pmmintrin.h> include and the
// intrinsic-based code paths further down in this file; when it is not
// defined those code paths are compiled out.
#if defined(PLATFORM_IS_X86) && !defined(IS_MOBILE_PLATFORM) && \
    !defined(GCC_WITHOUT_INTRINSICS)
#define DENORM_USE_INTRINSICS
#endif
     35 
     36 #ifdef DENORM_USE_INTRINSICS
     37 #include <pmmintrin.h>
     38 #endif
     39 
     40 namespace tensorflow {
     41 namespace port {
     42 
     43 static void SetDenormalState(bool flush_zero_mode, bool denormals_zero_mode) {
     44   // For now, we flush denormals only on SSE 3.  Other architectures such as ARM
     45   // can be added as needed.
     46 
     47 #ifdef DENORM_USE_INTRINSICS
     48   if (TestCPUFeature(SSE3)) {
     49     // Restore flags
     50     _MM_SET_FLUSH_ZERO_MODE(flush_zero_mode ? _MM_FLUSH_ZERO_ON
     51                                             : _MM_FLUSH_ZERO_OFF);
     52     _MM_SET_DENORMALS_ZERO_MODE(denormals_zero_mode ? _MM_DENORMALS_ZERO_ON
     53                                                     : _MM_DENORMALS_ZERO_OFF);
     54   }
     55 #endif
     56 }
     57 
     58 static std::pair<bool, bool> GetDernormalState() {
     59   // For now, we flush denormals only on SSE 3.  Other architectures such as ARM
     60   // can be added as needed.
     61 
     62 #ifdef DENORM_USE_INTRINSICS
     63   if (TestCPUFeature(SSE3)) {
     64     // Save existing flags
     65     bool flush_zero_mode = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
     66     bool denormals_zero_mode =
     67         _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
     68     return {flush_zero_mode, denormals_zero_mode};
     69   }
     70 #endif
     71   return {false, false};
     72 }
     73 
     74 ScopedRestoreFlushDenormalState::ScopedRestoreFlushDenormalState() {
     75   std::tie(flush_zero_mode_, denormals_zero_mode_) = GetDernormalState();
     76 }
     77 
     78 ScopedRestoreFlushDenormalState::~ScopedRestoreFlushDenormalState() {
     79   SetDenormalState(flush_zero_mode_, denormals_zero_mode_);
     80 }
     81 
     82 ScopedFlushDenormal::ScopedFlushDenormal() {
     83   SetDenormalState(/*flush_zero_mode=*/true, /*denormals_zero_mode=*/true);
     84 }
     85 
     86 ScopedDontFlushDenormal::ScopedDontFlushDenormal() {
     87   SetDenormalState(/*flush_zero_mode=*/false, /*denormals_zero_mode=*/false);
     88 }
     89 
     90 }  // namespace port
     91 }  // namespace tensorflow
     92