Home | History | Annotate | Download | only in gpu
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #if defined(OS_WIN)
      6 #include <windows.h>
      7 #endif
      8 
      9 #include "content/gpu/gpu_watchdog_thread.h"
     10 
     11 #include "base/bind.h"
     12 #include "base/bind_helpers.h"
     13 #include "base/command_line.h"
     14 #include "base/compiler_specific.h"
     15 #include "base/files/file_util.h"
     16 #include "base/power_monitor/power_monitor.h"
     17 #include "base/process/process.h"
     18 #include "build/build_config.h"
     19 #include "content/public/common/content_switches.h"
     20 #include "content/public/common/result_codes.h"
     21 
     22 namespace content {
     23 namespace {
     24 const int64 kCheckPeriodMs = 2000;
     25 #if defined(OS_CHROMEOS)
     26 const base::FilePath::CharType
     27     kTtyFilePath[] = FILE_PATH_LITERAL("/sys/class/tty/tty0/active");
     28 #endif
     29 }  // namespace
     30 
     31 GpuWatchdogThread::GpuWatchdogThread(int timeout)
     32     : base::Thread("Watchdog"),
     33       watched_message_loop_(base::MessageLoop::current()),
     34       timeout_(base::TimeDelta::FromMilliseconds(timeout)),
     35       armed_(false),
     36 #if defined(OS_WIN)
     37       watched_thread_handle_(0),
     38       arm_cpu_time_(),
     39 #endif
     40       task_observer_(this),
     41       suspended_(false),
     42       weak_factory_(this) {
     43   DCHECK(timeout >= 0);
     44 
     45 #if defined(OS_WIN)
     46   // GetCurrentThread returns a pseudo-handle that cannot be used by one thread
     47   // to identify another. DuplicateHandle creates a "real" handle that can be
     48   // used for this purpose.
     49   BOOL result = DuplicateHandle(GetCurrentProcess(),
     50                                 GetCurrentThread(),
     51                                 GetCurrentProcess(),
     52                                 &watched_thread_handle_,
     53                                 THREAD_QUERY_INFORMATION,
     54                                 FALSE,
     55                                 0);
     56   DCHECK(result);
     57 #endif
     58 
     59 #if defined(OS_CHROMEOS)
     60   tty_file_ = base::OpenFile(base::FilePath(kTtyFilePath), "r");
     61 #endif
     62   watched_message_loop_->AddTaskObserver(&task_observer_);
     63 }
     64 
     65 void GpuWatchdogThread::PostAcknowledge() {
     66   // Called on the monitored thread. Responds with OnAcknowledge. Cannot use
     67   // the method factory. Rely on reference counting instead.
     68   message_loop()->PostTask(
     69       FROM_HERE,
     70       base::Bind(&GpuWatchdogThread::OnAcknowledge, this));
     71 }
     72 
     73 void GpuWatchdogThread::CheckArmed() {
     74   // Acknowledge the watchdog if it has armed itself. The watchdog will not
     75   // change its armed state until it is acknowledged.
     76   if (armed()) {
     77     PostAcknowledge();
     78   }
     79 }
     80 
     81 void GpuWatchdogThread::Init() {
     82   // Schedule the first check.
     83   OnCheck(false);
     84 }
     85 
     86 void GpuWatchdogThread::CleanUp() {
     87   weak_factory_.InvalidateWeakPtrs();
     88 }
     89 
     90 GpuWatchdogThread::GpuWatchdogTaskObserver::GpuWatchdogTaskObserver(
     91     GpuWatchdogThread* watchdog)
     92     : watchdog_(watchdog) {
     93 }
     94 
     95 GpuWatchdogThread::GpuWatchdogTaskObserver::~GpuWatchdogTaskObserver() {
     96 }
     97 
     98 void GpuWatchdogThread::GpuWatchdogTaskObserver::WillProcessTask(
     99     const base::PendingTask& pending_task) {
    100   watchdog_->CheckArmed();
    101 }
    102 
    103 void GpuWatchdogThread::GpuWatchdogTaskObserver::DidProcessTask(
    104     const base::PendingTask& pending_task) {
    105   watchdog_->CheckArmed();
    106 }
    107 
    108 GpuWatchdogThread::~GpuWatchdogThread() {
    109   // Verify that the thread was explicitly stopped. If the thread is stopped
    110   // implicitly by the destructor, CleanUp() will not be called.
    111   DCHECK(!weak_factory_.HasWeakPtrs());
    112 
    113 #if defined(OS_WIN)
    114   CloseHandle(watched_thread_handle_);
    115 #endif
    116 
    117   base::PowerMonitor* power_monitor = base::PowerMonitor::Get();
    118   if (power_monitor)
    119     power_monitor->RemoveObserver(this);
    120 
    121 #if defined(OS_CHROMEOS)
    122   if (tty_file_)
    123     fclose(tty_file_);
    124 #endif
    125 
    126   watched_message_loop_->RemoveTaskObserver(&task_observer_);
    127 }
    128 
    129 void GpuWatchdogThread::OnAcknowledge() {
    130   CHECK(base::PlatformThread::CurrentId() == thread_id());
    131 
    132   // The check has already been acknowledged and another has already been
    133   // scheduled by a previous call to OnAcknowledge. It is normal for a
    134   // watched thread to see armed_ being true multiple times before
    135   // the OnAcknowledge task is run on the watchdog thread.
    136   if (!armed_)
    137     return;
    138 
    139   // Revoke any pending hang termination.
    140   weak_factory_.InvalidateWeakPtrs();
    141   armed_ = false;
    142 
    143   if (suspended_)
    144     return;
    145 
    146   // If it took a long time for the acknowledgement, assume the computer was
    147   // recently suspended.
    148   bool was_suspended = (base::Time::Now() > suspension_timeout_);
    149 
    150   // The monitored thread has responded. Post a task to check it again.
    151   message_loop()->PostDelayedTask(
    152       FROM_HERE,
    153       base::Bind(&GpuWatchdogThread::OnCheck, weak_factory_.GetWeakPtr(),
    154           was_suspended),
    155       base::TimeDelta::FromMilliseconds(kCheckPeriodMs));
    156 }
    157 
    158 void GpuWatchdogThread::OnCheck(bool after_suspend) {
    159   CHECK(base::PlatformThread::CurrentId() == thread_id());
    160 
    161   // Do not create any new termination tasks if one has already been created
    162   // or the system is suspended.
    163   if (armed_ || suspended_)
    164     return;
    165 
    166   // Must set armed before posting the task. This task might be the only task
    167   // that will activate the TaskObserver on the watched thread and it must not
    168   // miss the false -> true transition.
    169   armed_ = true;
    170 
    171 #if defined(OS_WIN)
    172   arm_cpu_time_ = GetWatchedThreadTime();
    173 #endif
    174 
    175   // Immediately after the computer is woken up from being suspended it might
    176   // be pretty sluggish, so allow some extra time before the next timeout.
    177   base::TimeDelta timeout = timeout_ * (after_suspend ? 3 : 1);
    178   suspension_timeout_ = base::Time::Now() + timeout * 2;
    179 
    180   // Post a task to the monitored thread that does nothing but wake up the
    181   // TaskObserver. Any other tasks that are pending on the watched thread will
    182   // also wake up the observer. This simply ensures there is at least one.
    183   watched_message_loop_->PostTask(
    184       FROM_HERE,
    185       base::Bind(&base::DoNothing));
    186 
    187   // Post a task to the watchdog thread to exit if the monitored thread does
    188   // not respond in time.
    189   message_loop()->PostDelayedTask(
    190       FROM_HERE,
    191       base::Bind(
    192           &GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang,
    193           weak_factory_.GetWeakPtr()),
    194       timeout);
    195 }
    196 
    197 // Use the --disable-gpu-watchdog command line switch to disable this.
    198 void GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang() {
    199   // Should not get here while the system is suspended.
    200   DCHECK(!suspended_);
    201 
    202 #if defined(OS_WIN)
    203   // Defer termination until a certain amount of CPU time has elapsed on the
    204   // watched thread.
    205   base::TimeDelta time_since_arm = GetWatchedThreadTime() - arm_cpu_time_;
    206   if (time_since_arm < timeout_) {
    207     message_loop()->PostDelayedTask(
    208         FROM_HERE,
    209         base::Bind(
    210             &GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang,
    211             weak_factory_.GetWeakPtr()),
    212         timeout_ - time_since_arm);
    213     return;
    214   }
    215 #endif
    216 
    217   // If the watchdog woke up significantly behind schedule, disarm and reset
    218   // the watchdog check. This is to prevent the watchdog thread from terminating
    219   // when a machine wakes up from sleep or hibernation, which would otherwise
    220   // appear to be a hang.
    221   if (base::Time::Now() > suspension_timeout_) {
    222     armed_ = false;
    223     OnCheck(true);
    224     return;
    225   }
    226 
    227   // For minimal developer annoyance, don't keep terminating. You need to skip
    228   // the call to base::Process::Terminate below in a debugger for this to be
    229   // useful.
    230   static bool terminated = false;
    231   if (terminated)
    232     return;
    233 
    234 #if defined(OS_WIN)
    235   if (IsDebuggerPresent())
    236     return;
    237 #endif
    238 
    239 #if defined(OS_CHROMEOS)
    240   // Don't crash if we're not on tty1. This avoids noise in the GPU process
    241   // crashes caused by people who use VT2 but still enable crash reporting.
    242   char tty_string[8] = {0};
    243   if (tty_file_ &&
    244       !fseek(tty_file_, 0, SEEK_SET) &&
    245       fread(tty_string, 1, 7, tty_file_)) {
    246     int tty_number = -1;
    247     int num_res = sscanf(tty_string, "tty%d", &tty_number);
    248     if (num_res == 1 && tty_number != 1)
    249       return;
    250   }
    251 #endif
    252 
    253   LOG(ERROR) << "The GPU process hung. Terminating after "
    254              << timeout_.InMilliseconds() << " ms.";
    255 
    256   // Deliberately crash the process to create a crash dump.
    257   *((volatile int*)0) = 0x1337;
    258 
    259   terminated = true;
    260 }
    261 
    262 void GpuWatchdogThread::AddPowerObserver() {
    263   message_loop()->PostTask(
    264       FROM_HERE,
    265       base::Bind(&GpuWatchdogThread::OnAddPowerObserver, this));
    266 }
    267 
    268 void GpuWatchdogThread::OnAddPowerObserver() {
    269   base::PowerMonitor* power_monitor = base::PowerMonitor::Get();
    270   DCHECK(power_monitor);
    271   power_monitor->AddObserver(this);
    272 }
    273 
    274 void GpuWatchdogThread::OnSuspend() {
    275   suspended_ = true;
    276 
    277   // When suspending force an acknowledgement to cancel any pending termination
    278   // tasks.
    279   OnAcknowledge();
    280 }
    281 
    282 void GpuWatchdogThread::OnResume() {
    283   suspended_ = false;
    284 
    285   // After resuming jump-start the watchdog again.
    286   armed_ = false;
    287   OnCheck(true);
    288 }
    289 
    290 #if defined(OS_WIN)
    291 base::TimeDelta GpuWatchdogThread::GetWatchedThreadTime() {
    292   FILETIME creation_time;
    293   FILETIME exit_time;
    294   FILETIME user_time;
    295   FILETIME kernel_time;
    296   BOOL result = GetThreadTimes(watched_thread_handle_,
    297                                &creation_time,
    298                                &exit_time,
    299                                &kernel_time,
    300                                &user_time);
    301   DCHECK(result);
    302 
    303   ULARGE_INTEGER user_time64;
    304   user_time64.HighPart = user_time.dwHighDateTime;
    305   user_time64.LowPart = user_time.dwLowDateTime;
    306 
    307   ULARGE_INTEGER kernel_time64;
    308   kernel_time64.HighPart = kernel_time.dwHighDateTime;
    309   kernel_time64.LowPart = kernel_time.dwLowDateTime;
    310 
    311   // Time is reported in units of 100 nanoseconds. Kernel and user time are
    312   // summed to deal with to kinds of hangs. One is where the GPU process is
    313   // stuck in user level, never calling into the kernel and kernel time is
    314   // not increasing. The other is where either the kernel hangs and never
    315   // returns to user level or where user level code
    316   // calls into kernel level repeatedly, giving up its quanta before it is
    317   // tracked, for example a loop that repeatedly Sleeps.
    318   return base::TimeDelta::FromMilliseconds(static_cast<int64>(
    319       (user_time64.QuadPart + kernel_time64.QuadPart) / 10000));
    320 }
    321 #endif
    322 
    323 }  // namespace content
    324