Home | History | Annotate | Download | only in profile_utils
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 // This class is designed to get accurate profile for programs.
     16 
     17 #ifndef TENSORFLOW_PLATFORM_PROFILEUTILS_CPU_UTILS_H__
     18 #define TENSORFLOW_PLATFORM_PROFILEUTILS_CPU_UTILS_H__
     19 
     20 #include <chrono>
     21 #include <memory>
     22 
     23 #include "tensorflow/core/platform/macros.h"
     24 #include "tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h"
     25 #include "tensorflow/core/platform/types.h"
     26 
     27 #if defined(ARMV6) || defined(__ARM_ARCH_7A__)
     28 #include <sys/time.h>
     29 #endif
     30 
     31 namespace tensorflow {
     32 
     33 namespace profile_utils {
     34 
// CpuUtils is a profiling tool with static functions
// designed to be called from multiple classes.
// A dedicated class which inherits ICpuUtilsHelper is
// stored as a function-local static variable, obtained through
// GetCpuUtilsHelperSingletonInstance, that caches CPU information,
// because loading CPU information may take a long time.
// Users must call EnableClockCycleProfiling before using CpuUtils.
     42 class CpuUtils {
     43  public:
     44   // Constant for invalid frequency.
     45   // This value is returned when the frequency is not obtained somehow.
     46   static constexpr int64 INVALID_FREQUENCY = -1;
     47   static constexpr uint64 DUMMY_CYCLE_CLOCK = 1;
     48 
     49   // Return current clock cycle. This function is designed to
     50   // minimize the overhead to get clock and maximize the accuracy of
     51   // time for profile.
     52   // This returns unsigned int because there is no guarantee that rdtsc
     53   // is less than 2 ^ 61.
     54   static inline uint64 GetCurrentClockCycle() {
     55 #if defined(__ANDROID__)
     56     return GetCpuUtilsHelperSingletonInstance().GetCurrentClockCycle();
     57 // ----------------------------------------------------------------
     58 #elif defined(__x86_64__) || defined(__amd64__)
     59     uint64_t high, low;
     60     __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
     61     return (high << 32) | low;
     62 // ----------------------------------------------------------------
     63 #elif defined(__aarch64__)
     64     // System timer of ARMv8 runs at a different frequency than the CPU's.
     65     // The frequency is fixed, typically in the range 1-50MHz.  It can because
     66     // read at CNTFRQ special register.  We assume the OS has set up
     67     // the virtual timer properly.
     68     uint64_t virtual_timer_value;
     69     asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
     70     return virtual_timer_value;
     71 // ----------------------------------------------------------------
     72 // V6 is the earliest arm that has a standard cyclecount
     73 #elif defined(ARMV6) || defined(__ARM_ARCH_7A__)
     74     uint32_t pmccntr;
     75     uint32_t pmuseren;
     76     uint32_t pmcntenset;
     77     // Read the user mode perf monitor counter access permissions.
     78     asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
     79     if (pmuseren & 1) {  // Allows reading perfmon counters for user mode code.
     80       asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
     81       if (pmcntenset & 0x80000000ul) {  // Is it counting?
     82         asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
     83         // The counter is set up to count every 64th cyclecount
     84         return static_cast<uint64>(pmccntr) * 64;  // Should optimize to << 64
     85       }
     86     }
     87     // Returning dummy clock when can't access to the counter
     88     return DUMMY_CYCLE_CLOCK;
     89 #else
     90     // TODO(satok): Support generic way to emulate clock count.
     91     // TODO(satok): Support other architectures if wanted.
     92     // Returning dummy clock when can't access to the counter
     93     return DUMMY_CYCLE_CLOCK;
     94 #endif
     95   }
     96 
     97 // Return cycle counter frequency.
     98 // As this method caches the cpu frequency internally,
     99 // the first call will incur overhead, but not subsequent calls.
    100 #if (defined(__powerpc__) ||                                             \
    101      defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
    102     (defined(__s390x__))
    103   static uint64 GetCycleCounterFrequency();
    104 #else
    105   static int64 GetCycleCounterFrequency();
    106 #endif
    107 
    108   // Return micro second per each clock
    109   // As this method caches the cpu frequency internally,
    110   // the first call will incur overhead, but not subsequent calls.
    111   static double GetMicroSecPerClock();
    112 
    113   // Reset clock cycle
    114   // Resetting clock cycle is recommended to prevent
    115   // clock cycle counters from overflowing on some platforms.
    116   static void ResetClockCycle();
    117 
    118   // Enable clock cycle profile
    119   // You can enable / disable profile if it's supported by the platform
    120   static void EnableClockCycleProfiling(bool enable);
    121 
    122   // Return chrono::duration per each clock
    123   static std::chrono::duration<double> ConvertClockCycleToTime(
    124       const int64 clock_cycle);
    125 
    126  private:
    127   class DefaultCpuUtilsHelper : public ICpuUtilsHelper {
    128    public:
    129     DefaultCpuUtilsHelper() = default;
    130     void ResetClockCycle() final {}
    131     uint64 GetCurrentClockCycle() final { return DUMMY_CYCLE_CLOCK; }
    132     void EnableClockCycleProfiling(bool /* enable */) final {}
    133     int64 CalculateCpuFrequency() final { return INVALID_FREQUENCY; }
    134 
    135    private:
    136     TF_DISALLOW_COPY_AND_ASSIGN(DefaultCpuUtilsHelper);
    137   };
    138 
    139   // Return cpu frequency.
    140   // CAVEAT: as this method calls system call and parse the mssage,
    141   // this call may be slow. This is why this class caches the value by
    142   // StaticVariableInitializer.
    143   static int64 GetCycleCounterFrequencyImpl();
    144 
    145   // Return a singleton of ICpuUtilsHelper
    146   // ICpuUtilsHelper is declared as a function-local static variable
    147   // for the following two reasons:
    148   // 1. Avoid passing instances to all classes which want
    149   // to use profiling tools in CpuUtils
    150   // 2. Minimize the overhead of acquiring ICpuUtilsHelper
    151   static ICpuUtilsHelper& GetCpuUtilsHelperSingletonInstance();
    152 
    153   TF_DISALLOW_COPY_AND_ASSIGN(CpuUtils);
    154 };
    155 
    156 }  // namespace profile_utils
    157 
    158 }  // namespace tensorflow
    159 
    160 #endif  // TENSORFLOW_PLATFORM_PROFILEUTILS_CPU_UTILS_H__
    161