Home | History | Annotate | Download | only in metrics
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/metrics/thread_watcher.h"
      6 
      7 #include <math.h>  // ceil
      8 
      9 #include "base/bind.h"
     10 #include "base/compiler_specific.h"
     11 #include "base/debug/alias.h"
     12 #include "base/lazy_instance.h"
     13 #include "base/strings/string_number_conversions.h"
     14 #include "base/strings/string_split.h"
     15 #include "base/strings/string_tokenizer.h"
     16 #include "base/strings/stringprintf.h"
     17 #include "base/threading/thread_restrictions.h"
     18 #include "build/build_config.h"
     19 #include "chrome/browser/metrics/metrics_service.h"
     20 #include "chrome/common/chrome_switches.h"
     21 #include "chrome/common/chrome_version_info.h"
     22 #include "chrome/common/dump_without_crashing.h"
     23 #include "chrome/common/logging_chrome.h"
     24 
     25 #if defined(OS_WIN)
     26 #include "base/win/windows_version.h"
     27 #endif
     28 
     29 using content::BrowserThread;
     30 
     31 namespace {
     32 
     33 // The following are unique function names for forcing the crash when a thread
     34 // is unresponsive. This makes it possible to tell from the callstack alone what
     35 // thread was unresponsive.
     36 //
     37 // We disable optimizations for this block of functions so the compiler doesn't
     38 // merge them all together.
     39 MSVC_DISABLE_OPTIMIZE()
     40 MSVC_PUSH_DISABLE_WARNING(4748)
     41 
     42 #ifndef NDEBUG
     43 int* NullPointer() {
     44   return reinterpret_cast<int*>(NULL);
     45 }
     46 #endif
     47 
// Common tail of every crash-site function below.
//   line_number: stored through a null pointer when NDEBUG is not defined,
//                which crashes the process. When NDEBUG is defined the value
//                is unused and logging::DumpWithoutCrashing() is called
//                instead.
void NullPointerCrash(int line_number) {
#ifndef NDEBUG
  *NullPointer() = line_number;  // Crash.
#else
  logging::DumpWithoutCrashing();
#endif
}
     55 
// Uniquely named crash site. No caller appears in this part of the file;
// presumably it is used for hangs during shutdown -- TODO confirm.
NOINLINE void ShutdownCrash() {
  NullPointerCrash(__LINE__);
}
     59 
// Crash site for an unresponsive UI thread. The distinct function name lets
// the callstack alone identify which thread was hung.
NOINLINE void ThreadUnresponsive_UI() {
  NullPointerCrash(__LINE__);
}
     63 
// Crash site for an unresponsive DB thread (called from
// CrashBecauseThreadWasUnresponsive).
NOINLINE void ThreadUnresponsive_DB() {
  NullPointerCrash(__LINE__);
}
     67 
// Crash site for an unresponsive FILE thread (called from
// CrashBecauseThreadWasUnresponsive).
NOINLINE void ThreadUnresponsive_FILE() {
  NullPointerCrash(__LINE__);
}
     71 
// Crash site for an unresponsive FILE_USER_BLOCKING thread (called from
// CrashBecauseThreadWasUnresponsive).
NOINLINE void ThreadUnresponsive_FILE_USER_BLOCKING() {
  NullPointerCrash(__LINE__);
}
     75 
// Crash site for an unresponsive PROCESS_LAUNCHER thread (called from
// CrashBecauseThreadWasUnresponsive).
NOINLINE void ThreadUnresponsive_PROCESS_LAUNCHER() {
  NullPointerCrash(__LINE__);
}
     79 
// Crash site for an unresponsive CACHE thread (called from
// CrashBecauseThreadWasUnresponsive).
NOINLINE void ThreadUnresponsive_CACHE() {
  NullPointerCrash(__LINE__);
}
     83 
// Crash site for an unresponsive IO thread (called from
// CrashBecauseThreadWasUnresponsive).
NOINLINE void ThreadUnresponsive_IO() {
  NullPointerCrash(__LINE__);
}
     87 
     88 MSVC_POP_WARNING()
     89 MSVC_ENABLE_OPTIMIZE();
     90 
     91 void CrashBecauseThreadWasUnresponsive(BrowserThread::ID thread_id) {
     92   base::debug::Alias(&thread_id);
     93 
     94   switch (thread_id) {
     95     case BrowserThread::UI:
     96       return ThreadUnresponsive_UI();
     97     case BrowserThread::DB:
     98       return ThreadUnresponsive_DB();
     99     case BrowserThread::FILE:
    100       return ThreadUnresponsive_FILE();
    101     case BrowserThread::FILE_USER_BLOCKING:
    102       return ThreadUnresponsive_FILE_USER_BLOCKING();
    103     case BrowserThread::PROCESS_LAUNCHER:
    104       return ThreadUnresponsive_PROCESS_LAUNCHER();
    105     case BrowserThread::CACHE:
    106       return ThreadUnresponsive_CACHE();
    107     case BrowserThread::IO:
    108       return ThreadUnresponsive_IO();
    109     case BrowserThread::ID_COUNT:
    110       CHECK(false);  // This shouldn't actually be reached!
    111       break;
    112 
    113     // Omission of the default hander is intentional -- that way the compiler
    114     // should warn if our switch becomes outdated.
    115   }
    116 
    117   CHECK(false) << "Unknown thread was unresponsive.";  // Shouldn't be reached.
    118 }
    119 
    120 }  // namespace
    121 
    122 // ThreadWatcher methods and members.
// Builds an inactive watcher for the thread described by |params| and, through
// Initialize(), offers it to ThreadWatcherList for registration. Must be
// constructed on the WatchDogThread.
//
// |ping_time_| and |pong_time_| both start at the construction time, and
// |ping_count_| starts at the unresponsive threshold.
//
// NOTE(review): |responsive_| is not set in this initializer list; it is first
// assigned in OnCheckResponsiveness(). Confirm nothing reads it before then.
// The two *_count_histogram_ members are likewise assigned only in
// Initialize().
ThreadWatcher::ThreadWatcher(const WatchingParams& params)
    : thread_id_(params.thread_id),
      thread_name_(params.thread_name),
      watched_loop_(
          BrowserThread::GetMessageLoopProxyForThread(params.thread_id)),
      sleep_time_(params.sleep_time),
      unresponsive_time_(params.unresponsive_time),
      ping_time_(base::TimeTicks::Now()),
      pong_time_(ping_time_),
      ping_sequence_number_(0),
      active_(false),
      ping_count_(params.unresponsive_threshold),
      response_time_histogram_(NULL),
      unresponsive_time_histogram_(NULL),
      unresponsive_count_(0),
      hung_processing_complete_(false),
      unresponsive_threshold_(params.unresponsive_threshold),
      crash_on_hang_(params.crash_on_hang),
      live_threads_threshold_(params.live_threads_threshold),
      weak_ptr_factory_(this) {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  Initialize();
}
    146 
// No explicit teardown is performed here.
ThreadWatcher::~ThreadWatcher() {}
    148 
    149 // static
    150 void ThreadWatcher::StartWatching(const WatchingParams& params) {
    151   DCHECK_GE(params.sleep_time.InMilliseconds(), 0);
    152   DCHECK_GE(params.unresponsive_time.InMilliseconds(),
    153             params.sleep_time.InMilliseconds());
    154 
    155   // If we are not on WatchDogThread, then post a task to call StartWatching on
    156   // WatchDogThread.
    157   if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
    158     WatchDogThread::PostTask(
    159         FROM_HERE,
    160         base::Bind(&ThreadWatcher::StartWatching, params));
    161     return;
    162   }
    163 
    164   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    165 
    166   // Create a new thread watcher object for the given thread and activate it.
    167   ThreadWatcher* watcher = new ThreadWatcher(params);
    168 
    169   DCHECK(watcher);
    170   // If we couldn't register the thread watcher object, we are shutting down,
    171   // then don't activate thread watching.
    172   if (!ThreadWatcherList::IsRegistered(params.thread_id))
    173     return;
    174   watcher->ActivateThreadWatching();
    175 }
    176 
    177 void ThreadWatcher::ActivateThreadWatching() {
    178   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    179   if (active_) return;
    180   active_ = true;
    181   ping_count_ = unresponsive_threshold_;
    182   ResetHangCounters();
    183   base::MessageLoop::current()->PostTask(
    184       FROM_HERE,
    185       base::Bind(&ThreadWatcher::PostPingMessage,
    186                  weak_ptr_factory_.GetWeakPtr()));
    187 }
    188 
    189 void ThreadWatcher::DeActivateThreadWatching() {
    190   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    191   active_ = false;
    192   ping_count_ = 0;
    193   weak_ptr_factory_.InvalidateWeakPtrs();
    194 }
    195 
    196 void ThreadWatcher::WakeUp() {
    197   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    198   // There is some user activity, PostPingMessage task of thread watcher if
    199   // needed.
    200   if (!active_) return;
    201 
    202   // Throw away the previous |unresponsive_count_| and start over again. Just
    203   // before going to sleep, |unresponsive_count_| could be very close to
    204   // |unresponsive_threshold_| and when user becomes active,
    205   // |unresponsive_count_| can go over |unresponsive_threshold_| if there was no
    206   // response for ping messages. Reset |unresponsive_count_| to start measuring
    207   // the unresponsiveness of the threads when system becomes active.
    208   unresponsive_count_ = 0;
    209 
    210   if (ping_count_ <= 0) {
    211     ping_count_ = unresponsive_threshold_;
    212     ResetHangCounters();
    213     PostPingMessage();
    214   } else {
    215     ping_count_ = unresponsive_threshold_;
    216   }
    217 }
    218 
    219 void ThreadWatcher::PostPingMessage() {
    220   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    221   // If we have stopped watching or if the user is idle, then stop sending
    222   // ping messages.
    223   if (!active_ || ping_count_ <= 0)
    224     return;
    225 
    226   // Save the current time when we have sent ping message.
    227   ping_time_ = base::TimeTicks::Now();
    228 
    229   // Send a ping message to the watched thread. Callback will be called on
    230   // the WatchDogThread.
    231   base::Closure callback(
    232       base::Bind(&ThreadWatcher::OnPongMessage, weak_ptr_factory_.GetWeakPtr(),
    233                  ping_sequence_number_));
    234   if (watched_loop_->PostTask(
    235           FROM_HERE,
    236           base::Bind(&ThreadWatcher::OnPingMessage, thread_id_,
    237                      callback))) {
    238       // Post a task to check the responsiveness of watched thread.
    239       base::MessageLoop::current()->PostDelayedTask(
    240           FROM_HERE,
    241           base::Bind(&ThreadWatcher::OnCheckResponsiveness,
    242                      weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
    243           unresponsive_time_);
    244   } else {
    245     // Watched thread might have gone away, stop watching it.
    246     DeActivateThreadWatching();
    247   }
    248 }
    249 
    250 void ThreadWatcher::OnPongMessage(uint64 ping_sequence_number) {
    251   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    252 
    253   // Record watched thread's response time.
    254   base::TimeTicks now = base::TimeTicks::Now();
    255   base::TimeDelta response_time = now - ping_time_;
    256   response_time_histogram_->AddTime(response_time);
    257 
    258   // Save the current time when we have got pong message.
    259   pong_time_ = now;
    260 
    261   // Check if there are any extra pings in flight.
    262   DCHECK_EQ(ping_sequence_number_, ping_sequence_number);
    263   if (ping_sequence_number_ != ping_sequence_number)
    264     return;
    265 
    266   // Increment sequence number for the next ping message to indicate watched
    267   // thread is responsive.
    268   ++ping_sequence_number_;
    269 
    270   // If we have stopped watching or if the user is idle, then stop sending
    271   // ping messages.
    272   if (!active_ || --ping_count_ <= 0)
    273     return;
    274 
    275   base::MessageLoop::current()->PostDelayedTask(
    276       FROM_HERE,
    277       base::Bind(&ThreadWatcher::PostPingMessage,
    278                  weak_ptr_factory_.GetWeakPtr()),
    279       sleep_time_);
    280 }
    281 
    282 void ThreadWatcher::OnCheckResponsiveness(uint64 ping_sequence_number) {
    283   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    284   // If we have stopped watching then consider thread as responding.
    285   if (!active_) {
    286     responsive_ = true;
    287     return;
    288   }
    289   // If the latest ping_sequence_number_ is not same as the ping_sequence_number
    290   // that is passed in, then we can assume OnPongMessage was called.
    291   // OnPongMessage increments ping_sequence_number_.
    292   if (ping_sequence_number_ != ping_sequence_number) {
    293     // Reset unresponsive_count_ to zero because we got a response from the
    294     // watched thread.
    295     ResetHangCounters();
    296 
    297     responsive_ = true;
    298     return;
    299   }
    300   // Record that we got no response from watched thread.
    301   GotNoResponse();
    302 
    303   // Post a task to check the responsiveness of watched thread.
    304   base::MessageLoop::current()->PostDelayedTask(
    305       FROM_HERE,
    306       base::Bind(&ThreadWatcher::OnCheckResponsiveness,
    307                  weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
    308       unresponsive_time_);
    309   responsive_ = false;
    310 }
    311 
    312 void ThreadWatcher::Initialize() {
    313   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    314   ThreadWatcherList::Register(this);
    315 
    316   const std::string response_time_histogram_name =
    317       "ThreadWatcher.ResponseTime." + thread_name_;
    318   response_time_histogram_ = base::Histogram::FactoryTimeGet(
    319       response_time_histogram_name,
    320       base::TimeDelta::FromMilliseconds(1),
    321       base::TimeDelta::FromSeconds(100), 50,
    322       base::Histogram::kUmaTargetedHistogramFlag);
    323 
    324   const std::string unresponsive_time_histogram_name =
    325       "ThreadWatcher.Unresponsive." + thread_name_;
    326   unresponsive_time_histogram_ = base::Histogram::FactoryTimeGet(
    327       unresponsive_time_histogram_name,
    328       base::TimeDelta::FromMilliseconds(1),
    329       base::TimeDelta::FromSeconds(100), 50,
    330       base::Histogram::kUmaTargetedHistogramFlag);
    331 
    332   const std::string responsive_count_histogram_name =
    333       "ThreadWatcher.ResponsiveThreads." + thread_name_;
    334   responsive_count_histogram_ = base::LinearHistogram::FactoryGet(
    335       responsive_count_histogram_name, 1, 10, 11,
    336       base::Histogram::kUmaTargetedHistogramFlag);
    337 
    338   const std::string unresponsive_count_histogram_name =
    339       "ThreadWatcher.UnresponsiveThreads." + thread_name_;
    340   unresponsive_count_histogram_ = base::LinearHistogram::FactoryGet(
    341       unresponsive_count_histogram_name, 1, 10, 11,
    342       base::Histogram::kUmaTargetedHistogramFlag);
    343 }
    344 
    345 // static
// Ping handler. Runs on the watched thread |thread_id| (DCHECKed) and answers
// by posting |callback_task| to the WatchDogThread.
void ThreadWatcher::OnPingMessage(const BrowserThread::ID& thread_id,
                                  const base::Closure& callback_task) {
  // This method is called on watched thread.
  DCHECK(BrowserThread::CurrentlyOn(thread_id));
  WatchDogThread::PostTask(FROM_HERE, callback_task);
}
    352 
// Clears the hang bookkeeping: zeroes |unresponsive_count_| and allows hang
// processing in GotNoResponse() to run again. WatchDogThread only.
void ThreadWatcher::ResetHangCounters() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  unresponsive_count_ = 0;
  hung_processing_complete_ = false;
}
    358 
    359 void ThreadWatcher::GotNoResponse() {
    360   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    361 
    362   ++unresponsive_count_;
    363   if (!IsVeryUnresponsive())
    364     return;
    365 
    366   // Record total unresponsive_time since last pong message.
    367   base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_;
    368   unresponsive_time_histogram_->AddTime(unresponse_time);
    369 
    370   // We have already collected stats for the non-responding watched thread.
    371   if (hung_processing_complete_)
    372     return;
    373 
    374   // Record how other threads are responding.
    375   uint32 responding_thread_count = 0;
    376   uint32 unresponding_thread_count = 0;
    377   ThreadWatcherList::GetStatusOfThreads(&responding_thread_count,
    378                                         &unresponding_thread_count);
    379 
    380   // Record how many watched threads are responding.
    381   responsive_count_histogram_->Add(responding_thread_count);
    382 
    383   // Record how many watched threads are not responding.
    384   unresponsive_count_histogram_->Add(unresponding_thread_count);
    385 
    386   // Crash the browser if the watched thread is to be crashed on hang and if the
    387   // number of other threads responding is less than or equal to
    388   // live_threads_threshold_ and at least one other thread is responding.
    389   if (crash_on_hang_ &&
    390       responding_thread_count > 0 &&
    391       responding_thread_count <= live_threads_threshold_) {
    392     static bool crashed_once = false;
    393     if (!crashed_once) {
    394       crashed_once = true;
    395       CrashBecauseThreadWasUnresponsive(thread_id_);
    396     }
    397   }
    398 
    399   hung_processing_complete_ = true;
    400 }
    401 
// Returns true once the number of consecutive missed pongs has reached
// |unresponsive_threshold_|. WatchDogThread only.
bool ThreadWatcher::IsVeryUnresponsive() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  return unresponsive_count_ >= unresponsive_threshold_;
}
    406 
    407 // ThreadWatcherList methods and members.
    408 //
    409 // static
// The single ThreadWatcherList instance; set by the constructor and reset to
// NULL by the destructor.
ThreadWatcherList* ThreadWatcherList::g_thread_watcher_list_ = NULL;
// static
// Sleep time, in seconds, given to each watcher in
// InitializeAndStartWatching().
const int ThreadWatcherList::kSleepSeconds = 1;
// static
// Unresponsive time, in seconds, given to each watcher in
// InitializeAndStartWatching().
const int ThreadWatcherList::kUnresponsiveSeconds = 2;
// static
// Default unresponsive threshold before per-channel/OS scaling in
// ParseCommandLine().
const int ThreadWatcherList::kUnresponsiveCount = 9;
// static
// Default live-threads threshold for crash-on-hang entries.
const int ThreadWatcherList::kLiveThreadsThreshold = 2;
    419 
// Stores the two thresholds exactly as given.
ThreadWatcherList::CrashDataThresholds::CrashDataThresholds(
    uint32 live_threads_threshold,
    uint32 unresponsive_threshold)
    : live_threads_threshold(live_threads_threshold),
      unresponsive_threshold(unresponsive_threshold) {
}
    426 
// Default thresholds: kLiveThreadsThreshold and kUnresponsiveCount.
ThreadWatcherList::CrashDataThresholds::CrashDataThresholds()
    : live_threads_threshold(kLiveThreadsThreshold),
      unresponsive_threshold(kUnresponsiveCount) {
}
    431 
    432 // static
    433 void ThreadWatcherList::StartWatchingAll(const CommandLine& command_line) {
    434   // TODO(rtenneti): Enable ThreadWatcher.
    435   uint32 unresponsive_threshold;
    436   CrashOnHangThreadMap crash_on_hang_threads;
    437   ParseCommandLine(command_line,
    438                    &unresponsive_threshold,
    439                    &crash_on_hang_threads);
    440 
    441   ThreadWatcherObserver::SetupNotifications(
    442       base::TimeDelta::FromSeconds(kSleepSeconds * unresponsive_threshold));
    443 
    444   WatchDogThread::PostDelayedTask(
    445       FROM_HERE,
    446       base::Bind(&ThreadWatcherList::InitializeAndStartWatching,
    447                  unresponsive_threshold,
    448                  crash_on_hang_threads),
    449       base::TimeDelta::FromSeconds(120));
    450 }
    451 
    452 // static
// Tears down watching: removes the user-activity observer, then deletes every
// watcher and the list itself (DeleteAll() hops to the WatchDogThread if
// needed).
void ThreadWatcherList::StopWatchingAll() {
  // TODO(rtenneti): Enable ThreadWatcher.
  ThreadWatcherObserver::RemoveNotifications();
  DeleteAll();
}
    458 
    459 // static
    460 void ThreadWatcherList::Register(ThreadWatcher* watcher) {
    461   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    462   if (!g_thread_watcher_list_)
    463     return;
    464   DCHECK(!g_thread_watcher_list_->Find(watcher->thread_id()));
    465   g_thread_watcher_list_->registered_[watcher->thread_id()] = watcher;
    466 }
    467 
    468 // static
// Returns true if a watcher for |thread_id| is present in the global list
// (false when the list does not exist). WatchDogThread only.
bool ThreadWatcherList::IsRegistered(const BrowserThread::ID thread_id) {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  return NULL != ThreadWatcherList::Find(thread_id);
}
    473 
    474 // static
    475 void ThreadWatcherList::GetStatusOfThreads(uint32* responding_thread_count,
    476                                            uint32* unresponding_thread_count) {
    477   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    478   *responding_thread_count = 0;
    479   *unresponding_thread_count = 0;
    480   if (!g_thread_watcher_list_)
    481     return;
    482 
    483   for (RegistrationList::iterator it =
    484            g_thread_watcher_list_->registered_.begin();
    485        g_thread_watcher_list_->registered_.end() != it;
    486        ++it) {
    487     if (it->second->IsVeryUnresponsive())
    488       ++(*unresponding_thread_count);
    489     else
    490       ++(*responding_thread_count);
    491   }
    492 }
    493 
    494 // static
    495 void ThreadWatcherList::WakeUpAll() {
    496   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    497   if (!g_thread_watcher_list_)
    498     return;
    499 
    500   for (RegistrationList::iterator it =
    501            g_thread_watcher_list_->registered_.begin();
    502        g_thread_watcher_list_->registered_.end() != it;
    503        ++it)
    504     it->second->WakeUp();
    505 }
    506 
// Installs this object as the global list. CHECK-fails if a list already
// exists. WatchDogThread only.
ThreadWatcherList::ThreadWatcherList() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  CHECK(!g_thread_watcher_list_);
  g_thread_watcher_list_ = this;
}
    512 
// Clears the global list pointer. The watchers themselves are not deleted
// here; DeleteAll() does that before destroying the list. WatchDogThread only.
ThreadWatcherList::~ThreadWatcherList() {
  DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
  DCHECK(this == g_thread_watcher_list_);
  g_thread_watcher_list_ = NULL;
}
    518 
    519 // static
// Derives the watching configuration.
//   command_line: consulted for switches::kCrashOnHangThreads.
//   unresponsive_threshold: out. kUnresponsiveCount, multiplied by 4 on the
//       stable channel or 2 on beta, and by a further 2 on Windows XP or
//       older.
//   crash_on_hang_threads: out. Filled from the switch value when present;
//       otherwise, on every channel except stable, from a default
//       specification covering UI, IO and FILE (FILE gets five times the
//       crash time). On canary without a command-line override, a field trial
//       may then overwrite the thresholds of every entry except FILE.
void ThreadWatcherList::ParseCommandLine(
    const CommandLine& command_line,
    uint32* unresponsive_threshold,
    CrashOnHangThreadMap* crash_on_hang_threads) {
  // Initialize |unresponsive_threshold| to a default value.
  *unresponsive_threshold = kUnresponsiveCount;

  // Increase the unresponsive_threshold on the Stable and Beta channels to
  // reduce the number of crashes due to ThreadWatcher.
  chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
  if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
    *unresponsive_threshold *= 4;
  } else if (channel == chrome::VersionInfo::CHANNEL_BETA) {
    *unresponsive_threshold *= 2;
  }

#if defined(OS_WIN)
  // For Windows XP (old systems), double the unresponsive_threshold to give
  // the OS a chance to schedule UI/IO threads a time slice to respond with a
  // pong message (to get around limitations with the OS).
  if (base::win::GetVersion() <= base::win::VERSION_XP)
    *unresponsive_threshold *= 2;
#endif

  // Time, in seconds, after which a hung thread is crashed: one unresponsive
  // interval per missed ping.
  uint32 crash_seconds = *unresponsive_threshold * kUnresponsiveSeconds;
  std::string crash_on_hang_thread_names;
  bool has_command_line_overwrite = false;
  if (command_line.HasSwitch(switches::kCrashOnHangThreads)) {
    crash_on_hang_thread_names =
        command_line.GetSwitchValueASCII(switches::kCrashOnHangThreads);
    has_command_line_overwrite = true;
  } else if (channel != chrome::VersionInfo::CHANNEL_STABLE) {
    // Default to crashing the browser if UI or IO or FILE threads are not
    // responsive except in stable channel.
    // NOTE(review): |crash_seconds| is uint32 but is formatted with %d; the
    // values are small here, though %u would match the type.
    crash_on_hang_thread_names = base::StringPrintf(
        "UI:%d:%d,IO:%d:%d,FILE:%d:%d",
        kLiveThreadsThreshold, crash_seconds,
        kLiveThreadsThreshold, crash_seconds,
        kLiveThreadsThreshold, crash_seconds * 5);
  }

  ParseCommandLineCrashOnHangThreads(crash_on_hang_thread_names,
                                     kLiveThreadsThreshold,
                                     crash_seconds,
                                     crash_on_hang_threads);

  // The field trial below applies only on canary, and never overrides an
  // explicit command-line specification.
  if (channel != chrome::VersionInfo::CHANNEL_CANARY ||
      has_command_line_overwrite) {
    return;
  }

  // Set up a field trial for 100% of the users to crash if either UI or IO
  // thread is not responsive for 30 seconds (or 15 pings).
  scoped_refptr<base::FieldTrial> field_trial(
      base::FieldTrialList::FactoryGetFieldTrial(
          "ThreadWatcher", 100, "default_hung_threads",
          2014, 10, 30, base::FieldTrial::SESSION_RANDOMIZED, NULL));
  int hung_thread_group = field_trial->AppendGroup("hung_thread", 100);
  if (field_trial->group() == hung_thread_group) {
    for (CrashOnHangThreadMap::iterator it = crash_on_hang_threads->begin();
         crash_on_hang_threads->end() != it;
         ++it) {
      // FILE keeps the thresholds parsed above.
      if (it->first == "FILE")
        continue;
      // With INT_MAX as the limit, the "few enough live threads" test in
      // GotNoResponse() passes for any realistic number of responders.
      it->second.live_threads_threshold = INT_MAX;
      if (it->first == "UI") {
        // TODO(rtenneti): set unresponsive threshold to 120 seconds to catch
        // the worst UI hangs and for fewer crashes due to ThreadWatcher. Reduce
        // it to a more reasonable time ala IO thread.
        it->second.unresponsive_threshold = 60;
      } else {
        it->second.unresponsive_threshold = 15;
      }
    }
  }
}
    596 
    597 // static
    598 void ThreadWatcherList::ParseCommandLineCrashOnHangThreads(
    599     const std::string& crash_on_hang_thread_names,
    600     uint32 default_live_threads_threshold,
    601     uint32 default_crash_seconds,
    602     CrashOnHangThreadMap* crash_on_hang_threads) {
    603   base::StringTokenizer tokens(crash_on_hang_thread_names, ",");
    604   std::vector<std::string> values;
    605   while (tokens.GetNext()) {
    606     const std::string& token = tokens.token();
    607     base::SplitString(token, ':', &values);
    608     std::string thread_name = values[0];
    609 
    610     uint32 live_threads_threshold = default_live_threads_threshold;
    611     uint32 crash_seconds = default_crash_seconds;
    612     if (values.size() >= 2 &&
    613         (!base::StringToUint(values[1], &live_threads_threshold))) {
    614       continue;
    615     }
    616     if (values.size() >= 3 &&
    617         (!base::StringToUint(values[2], &crash_seconds))) {
    618       continue;
    619     }
    620     uint32 unresponsive_threshold = static_cast<uint32>(
    621         ceil(static_cast<float>(crash_seconds) / kUnresponsiveSeconds));
    622 
    623     CrashDataThresholds crash_data(live_threads_threshold,
    624                                    unresponsive_threshold);
    625     // Use the last specifier.
    626     (*crash_on_hang_threads)[thread_name] = crash_data;
    627   }
    628 }
    629 
    630 // static
    631 void ThreadWatcherList::InitializeAndStartWatching(
    632     uint32 unresponsive_threshold,
    633     const CrashOnHangThreadMap& crash_on_hang_threads) {
    634   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    635 
    636   ThreadWatcherList* thread_watcher_list = new ThreadWatcherList();
    637   CHECK(thread_watcher_list);
    638 
    639   BrowserThread::PostTask(
    640       BrowserThread::UI,
    641       FROM_HERE,
    642       base::Bind(&StartupTimeBomb::DisarmStartupTimeBomb));
    643 
    644   const base::TimeDelta kSleepTime =
    645       base::TimeDelta::FromSeconds(kSleepSeconds);
    646   const base::TimeDelta kUnresponsiveTime =
    647       base::TimeDelta::FromSeconds(kUnresponsiveSeconds);
    648 
    649   StartWatching(BrowserThread::UI, "UI", kSleepTime, kUnresponsiveTime,
    650                 unresponsive_threshold, crash_on_hang_threads);
    651   StartWatching(BrowserThread::IO, "IO", kSleepTime, kUnresponsiveTime,
    652                 unresponsive_threshold, crash_on_hang_threads);
    653   StartWatching(BrowserThread::DB, "DB", kSleepTime, kUnresponsiveTime,
    654                 unresponsive_threshold, crash_on_hang_threads);
    655   StartWatching(BrowserThread::FILE, "FILE", kSleepTime, kUnresponsiveTime,
    656                 unresponsive_threshold, crash_on_hang_threads);
    657   StartWatching(BrowserThread::CACHE, "CACHE", kSleepTime, kUnresponsiveTime,
    658                 unresponsive_threshold, crash_on_hang_threads);
    659 }
    660 
    661 // static
    662 void ThreadWatcherList::StartWatching(
    663     const BrowserThread::ID& thread_id,
    664     const std::string& thread_name,
    665     const base::TimeDelta& sleep_time,
    666     const base::TimeDelta& unresponsive_time,
    667     uint32 unresponsive_threshold,
    668     const CrashOnHangThreadMap& crash_on_hang_threads) {
    669   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    670 
    671   CrashOnHangThreadMap::const_iterator it =
    672       crash_on_hang_threads.find(thread_name);
    673   bool crash_on_hang = false;
    674   uint32 live_threads_threshold = 0;
    675   if (it != crash_on_hang_threads.end()) {
    676     crash_on_hang = true;
    677     live_threads_threshold = it->second.live_threads_threshold;
    678     unresponsive_threshold = it->second.unresponsive_threshold;
    679   }
    680 
    681   ThreadWatcher::StartWatching(
    682       ThreadWatcher::WatchingParams(thread_id,
    683                                     thread_name,
    684                                     sleep_time,
    685                                     unresponsive_time,
    686                                     unresponsive_threshold,
    687                                     crash_on_hang,
    688                                     live_threads_threshold));
    689 }
    690 
    691 // static
    692 void ThreadWatcherList::DeleteAll() {
    693   if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
    694     WatchDogThread::PostTask(
    695         FROM_HERE,
    696         base::Bind(&ThreadWatcherList::DeleteAll));
    697     return;
    698   }
    699 
    700   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    701   if (!g_thread_watcher_list_)
    702     return;
    703 
    704   // Delete all thread watcher objects.
    705   while (!g_thread_watcher_list_->registered_.empty()) {
    706     RegistrationList::iterator it = g_thread_watcher_list_->registered_.begin();
    707     delete it->second;
    708     g_thread_watcher_list_->registered_.erase(it);
    709   }
    710 
    711   delete g_thread_watcher_list_;
    712 }
    713 
    714 // static
    715 ThreadWatcher* ThreadWatcherList::Find(const BrowserThread::ID& thread_id) {
    716   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    717   if (!g_thread_watcher_list_)
    718     return NULL;
    719   RegistrationList::iterator it =
    720       g_thread_watcher_list_->registered_.find(thread_id);
    721   if (g_thread_watcher_list_->registered_.end() == it)
    722     return NULL;
    723   return it->second;
    724 }
    725 
    726 // ThreadWatcherObserver methods and members.
    727 //
    728 // static
// The single ThreadWatcherObserver instance; set by the constructor and reset
// to NULL by the destructor.
ThreadWatcherObserver* ThreadWatcherObserver::g_thread_watcher_observer_ = NULL;
    730 
// Installs this object as the global observer and treats the construction
// time as the last wakeup. CHECK-fails if an observer already exists.
//   wakeup_interval: minimum time between two wakeups forwarded by Observe().
ThreadWatcherObserver::ThreadWatcherObserver(
    const base::TimeDelta& wakeup_interval)
    : last_wakeup_time_(base::TimeTicks::Now()),
      wakeup_interval_(wakeup_interval) {
  CHECK(!g_thread_watcher_observer_);
  g_thread_watcher_observer_ = this;
}
    738 
// Clears the global observer pointer.
ThreadWatcherObserver::~ThreadWatcherObserver() {
  DCHECK(this == g_thread_watcher_observer_);
  g_thread_watcher_observer_ = NULL;
}
    743 
    744 // static
// Creates the observer and hands its registrar to
// MetricsService::SetUpNotifications(). The new object is reachable afterwards
// through |g_thread_watcher_observer_| (set by the constructor) and is deleted
// by RemoveNotifications(). UI thread only.
void ThreadWatcherObserver::SetupNotifications(
    const base::TimeDelta& wakeup_interval) {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  ThreadWatcherObserver* observer = new ThreadWatcherObserver(wakeup_interval);
  MetricsService::SetUpNotifications(&observer->registrar_, observer);
}
    751 
    752 // static
// Unregisters all notifications and destroys the global observer, if there is
// one. UI thread only.
void ThreadWatcherObserver::RemoveNotifications() {
  DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
  if (!g_thread_watcher_observer_)
    return;
  g_thread_watcher_observer_->registrar_.RemoveAll();
  // The destructor resets |g_thread_watcher_observer_| to NULL.
  delete g_thread_watcher_observer_;
}
    760 
    761 void ThreadWatcherObserver::Observe(
    762     int type,
    763     const content::NotificationSource& source,
    764     const content::NotificationDetails& details) {
    765   // There is some user activity, see if thread watchers are to be awakened.
    766   base::TimeTicks now = base::TimeTicks::Now();
    767   if ((now - last_wakeup_time_) < wakeup_interval_)
    768     return;
    769   last_wakeup_time_ = now;
    770   WatchDogThread::PostTask(
    771       FROM_HERE,
    772       base::Bind(&ThreadWatcherList::WakeUpAll));
    773 }
    774 
    775 // WatchDogThread methods and members.
    776 
    777 // This lock protects g_watchdog_thread.
    778 static base::LazyInstance<base::Lock>::Leaky
    779     g_watchdog_lock = LAZY_INSTANCE_INITIALIZER;
    780 
    781 // The singleton of this class.
    782 static WatchDogThread* g_watchdog_thread = NULL;
    783 
    784 WatchDogThread::WatchDogThread() : Thread("BrowserWatchdog") {
    785 }
    786 
    787 WatchDogThread::~WatchDogThread() {
    788   Stop();
    789 }
    790 
    791 // static
    792 bool WatchDogThread::CurrentlyOnWatchDogThread() {
    793   base::AutoLock lock(g_watchdog_lock.Get());
    794   return g_watchdog_thread &&
    795       g_watchdog_thread->message_loop() == base::MessageLoop::current();
    796 }
    797 
    798 // static
    799 bool WatchDogThread::PostTask(const tracked_objects::Location& from_here,
    800                               const base::Closure& task) {
    801   return PostTaskHelper(from_here, task, base::TimeDelta());
    802 }
    803 
    804 // static
    805 bool WatchDogThread::PostDelayedTask(const tracked_objects::Location& from_here,
    806                                      const base::Closure& task,
    807                                      base::TimeDelta delay) {
    808   return PostTaskHelper(from_here, task, delay);
    809 }
    810 
    811 // static
    812 bool WatchDogThread::PostTaskHelper(
    813     const tracked_objects::Location& from_here,
    814     const base::Closure& task,
    815     base::TimeDelta delay) {
    816   {
    817     base::AutoLock lock(g_watchdog_lock.Get());
    818 
    819     base::MessageLoop* message_loop = g_watchdog_thread ?
    820         g_watchdog_thread->message_loop() : NULL;
    821     if (message_loop) {
    822       message_loop->PostDelayedTask(from_here, task, delay);
    823       return true;
    824     }
    825   }
    826 
    827   return false;
    828 }
    829 
    830 void WatchDogThread::Init() {
    831   // This thread shouldn't be allowed to perform any blocking disk I/O.
    832   base::ThreadRestrictions::SetIOAllowed(false);
    833 
    834   base::AutoLock lock(g_watchdog_lock.Get());
    835   CHECK(!g_watchdog_thread);
    836   g_watchdog_thread = this;
    837 }
    838 
    839 void WatchDogThread::CleanUp() {
    840   base::AutoLock lock(g_watchdog_lock.Get());
    841   g_watchdog_thread = NULL;
    842 }
    843 
    844 namespace {
    845 
    846 // StartupWatchDogThread methods and members.
    847 //
    848 // Class for detecting hangs during startup.
    849 class StartupWatchDogThread : public base::Watchdog {
    850  public:
    851   // Constructor specifies how long the StartupWatchDogThread will wait before
    852   // alarming.
    853   explicit StartupWatchDogThread(const base::TimeDelta& duration)
    854       : base::Watchdog(duration, "Startup watchdog thread", true) {
    855   }
    856 
    857   // Alarm is called if the time expires after an Arm() without someone calling
    858   // Disarm(). When Alarm goes off, in release mode we get the crash dump
    859   // without crashing and in debug mode we break into the debugger.
    860   virtual void Alarm() OVERRIDE {
    861 #ifndef NDEBUG
    862     DCHECK(false);
    863 #else
    864     logging::DumpWithoutCrashing();
    865 #endif
    866   }
    867 
    868   DISALLOW_COPY_AND_ASSIGN(StartupWatchDogThread);
    869 };
    870 
    871 // ShutdownWatchDogThread methods and members.
    872 //
    873 // Class for detecting hangs during shutdown.
    874 class ShutdownWatchDogThread : public base::Watchdog {
    875  public:
    876   // Constructor specifies how long the ShutdownWatchDogThread will wait before
    877   // alarming.
    878   explicit ShutdownWatchDogThread(const base::TimeDelta& duration)
    879       : base::Watchdog(duration, "Shutdown watchdog thread", true) {
    880   }
    881 
    882   // Alarm is called if the time expires after an Arm() without someone calling
    883   // Disarm(). We crash the browser if this method is called.
    884   virtual void Alarm() OVERRIDE {
    885     ShutdownCrash();
    886   }
    887 
    888   DISALLOW_COPY_AND_ASSIGN(ShutdownWatchDogThread);
    889 };
    890 }  // namespace
    891 
    892 // StartupTimeBomb methods and members.
    893 //
    894 // static
    895 StartupTimeBomb* StartupTimeBomb::g_startup_timebomb_ = NULL;
    896 
    897 StartupTimeBomb::StartupTimeBomb()
    898     : startup_watchdog_(NULL),
    899       thread_id_(base::PlatformThread::CurrentId()) {
    900   CHECK(!g_startup_timebomb_);
    901   g_startup_timebomb_ = this;
    902 }
    903 
    904 StartupTimeBomb::~StartupTimeBomb() {
    905   DCHECK(this == g_startup_timebomb_);
    906   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
    907   if (startup_watchdog_)
    908     Disarm();
    909   g_startup_timebomb_ = NULL;
    910 }
    911 
    912 void StartupTimeBomb::Arm(const base::TimeDelta& duration) {
    913   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
    914   DCHECK(!startup_watchdog_);
    915   startup_watchdog_ = new StartupWatchDogThread(duration);
    916   startup_watchdog_->Arm();
    917   return;
    918 }
    919 
    920 void StartupTimeBomb::Disarm() {
    921   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
    922   if (startup_watchdog_) {
    923     startup_watchdog_->Disarm();
    924     startup_watchdog_->Cleanup();
    925     DeleteStartupWatchdog();
    926   }
    927 }
    928 
    929 void StartupTimeBomb::DeleteStartupWatchdog() {
    930   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
    931   if (startup_watchdog_->IsJoinable()) {
    932     // Allow the watchdog thread to shutdown on UI. Watchdog thread shutdowns
    933     // very fast.
    934     base::ThreadRestrictions::SetIOAllowed(true);
    935     delete startup_watchdog_;
    936     startup_watchdog_ = NULL;
    937     return;
    938   }
    939   base::MessageLoop::current()->PostDelayedTask(
    940       FROM_HERE,
    941       base::Bind(&StartupTimeBomb::DeleteStartupWatchdog,
    942                  base::Unretained(this)),
    943       base::TimeDelta::FromSeconds(10));
    944 }
    945 
    946 // static
    947 void StartupTimeBomb::DisarmStartupTimeBomb() {
    948   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    949   if (g_startup_timebomb_)
    950     g_startup_timebomb_->Disarm();
    951 }
    952 
    953 // ShutdownWatcherHelper methods and members.
    954 //
    955 // ShutdownWatcherHelper is a wrapper class for detecting hangs during
    956 // shutdown.
    957 ShutdownWatcherHelper::ShutdownWatcherHelper()
    958     : shutdown_watchdog_(NULL),
    959       thread_id_(base::PlatformThread::CurrentId()) {
    960 }
    961 
    962 ShutdownWatcherHelper::~ShutdownWatcherHelper() {
    963   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
    964   if (shutdown_watchdog_) {
    965     shutdown_watchdog_->Disarm();
    966     delete shutdown_watchdog_;
    967     shutdown_watchdog_ = NULL;
    968   }
    969 }
    970 
    971 void ShutdownWatcherHelper::Arm(const base::TimeDelta& duration) {
    972   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
    973   DCHECK(!shutdown_watchdog_);
    974   base::TimeDelta actual_duration = duration;
    975 
    976   chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
    977   if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
    978     actual_duration *= 20;
    979   } else if (channel == chrome::VersionInfo::CHANNEL_BETA ||
    980              channel == chrome::VersionInfo::CHANNEL_DEV) {
    981     actual_duration *= 10;
    982   }
    983 
    984 #if defined(OS_WIN)
    985   // On Windows XP, give twice the time for shutdown.
    986   if (base::win::GetVersion() <= base::win::VERSION_XP)
    987     actual_duration *= 2;
    988 #endif
    989 
    990   shutdown_watchdog_ = new ShutdownWatchDogThread(actual_duration);
    991   shutdown_watchdog_->Arm();
    992 }
    993