Home | History | Annotate | Download | only in metrics
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/browser/metrics/thread_watcher.h"
      6 
      7 #include <math.h>  // ceil
      8 
      9 #include "base/bind.h"
     10 #include "base/compiler_specific.h"
     11 #include "base/debug/alias.h"
     12 #include "base/debug/debugger.h"
     13 #include "base/debug/dump_without_crashing.h"
     14 #include "base/lazy_instance.h"
     15 #include "base/metrics/field_trial.h"
     16 #include "base/strings/string_number_conversions.h"
     17 #include "base/strings/string_split.h"
     18 #include "base/strings/string_tokenizer.h"
     19 #include "base/strings/stringprintf.h"
     20 #include "base/threading/thread_restrictions.h"
     21 #include "build/build_config.h"
     22 #include "chrome/browser/chrome_notification_types.h"
     23 #include "chrome/common/chrome_switches.h"
     24 #include "chrome/common/chrome_version_info.h"
     25 #include "chrome/common/logging_chrome.h"
     26 #include "content/public/browser/notification_service.h"
     27 
     28 #if defined(OS_WIN)
     29 #include "base/win/windows_version.h"
     30 #endif
     31 
     32 using content::BrowserThread;
     33 
     34 namespace {
     35 
     36 // The following are unique function names for forcing the crash when a thread
     37 // is unresponsive. This makes it possible to tell from the callstack alone what
     38 // thread was unresponsive.
     39 //
     40 // We disable optimizations for this block of functions so the compiler doesn't
     41 // merge them all together.
     42 MSVC_DISABLE_OPTIMIZE()
     43 MSVC_PUSH_DISABLE_WARNING(4748)
     44 
     45 void ReportThreadHang() {
     46 #if defined(NDEBUG)
     47   base::debug::DumpWithoutCrashing();
     48 #else
     49   base::debug::BreakDebugger();
     50 #endif
     51 }
     52 
     53 #if !defined(OS_ANDROID) || !defined(NDEBUG)
     54 // TODO(rtenneti): Enabled crashing, after getting data.
     55 NOINLINE void StartupHang() {
     56   ReportThreadHang();
     57 }
     58 #endif  // OS_ANDROID
     59 
     60 NOINLINE void ShutdownHang() {
     61   ReportThreadHang();
     62 }
     63 
     64 NOINLINE void ThreadUnresponsive_UI() {
     65   ReportThreadHang();
     66 }
     67 
     68 NOINLINE void ThreadUnresponsive_DB() {
     69   ReportThreadHang();
     70 }
     71 
     72 NOINLINE void ThreadUnresponsive_FILE() {
     73   ReportThreadHang();
     74 }
     75 
     76 NOINLINE void ThreadUnresponsive_FILE_USER_BLOCKING() {
     77   ReportThreadHang();
     78 }
     79 
     80 NOINLINE void ThreadUnresponsive_PROCESS_LAUNCHER() {
     81   ReportThreadHang();
     82 }
     83 
     84 NOINLINE void ThreadUnresponsive_CACHE() {
     85   ReportThreadHang();
     86 }
     87 
     88 NOINLINE void ThreadUnresponsive_IO() {
     89   ReportThreadHang();
     90 }
     91 
     92 MSVC_POP_WARNING()
     93 MSVC_ENABLE_OPTIMIZE();
     94 
     95 void CrashBecauseThreadWasUnresponsive(BrowserThread::ID thread_id) {
     96   base::debug::Alias(&thread_id);
     97 
     98   switch (thread_id) {
     99     case BrowserThread::UI:
    100       return ThreadUnresponsive_UI();
    101     case BrowserThread::DB:
    102       return ThreadUnresponsive_DB();
    103     case BrowserThread::FILE:
    104       return ThreadUnresponsive_FILE();
    105     case BrowserThread::FILE_USER_BLOCKING:
    106       return ThreadUnresponsive_FILE_USER_BLOCKING();
    107     case BrowserThread::PROCESS_LAUNCHER:
    108       return ThreadUnresponsive_PROCESS_LAUNCHER();
    109     case BrowserThread::CACHE:
    110       return ThreadUnresponsive_CACHE();
    111     case BrowserThread::IO:
    112       return ThreadUnresponsive_IO();
    113     case BrowserThread::ID_COUNT:
    114       CHECK(false);  // This shouldn't actually be reached!
    115       break;
    116 
    117     // Omission of the default hander is intentional -- that way the compiler
    118     // should warn if our switch becomes outdated.
    119   }
    120 
    121   CHECK(false) << "Unknown thread was unresponsive.";  // Shouldn't be reached.
    122 }
    123 
    124 }  // namespace
    125 
    126 // ThreadWatcher methods and members.
    127 ThreadWatcher::ThreadWatcher(const WatchingParams& params)
    128     : thread_id_(params.thread_id),
    129       thread_name_(params.thread_name),
    130       watched_loop_(
    131           BrowserThread::GetMessageLoopProxyForThread(params.thread_id)),
    132       sleep_time_(params.sleep_time),
    133       unresponsive_time_(params.unresponsive_time),
    134       ping_time_(base::TimeTicks::Now()),
    135       pong_time_(ping_time_),
    136       ping_sequence_number_(0),
    137       active_(false),
    138       ping_count_(params.unresponsive_threshold),
    139       response_time_histogram_(NULL),
    140       unresponsive_time_histogram_(NULL),
    141       unresponsive_count_(0),
    142       hung_processing_complete_(false),
    143       unresponsive_threshold_(params.unresponsive_threshold),
    144       crash_on_hang_(params.crash_on_hang),
    145       live_threads_threshold_(params.live_threads_threshold),
    146       weak_ptr_factory_(this) {
    147   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    148   Initialize();
    149 }
    150 
    151 ThreadWatcher::~ThreadWatcher() {}
    152 
    153 // static
    154 void ThreadWatcher::StartWatching(const WatchingParams& params) {
    155   DCHECK_GE(params.sleep_time.InMilliseconds(), 0);
    156   DCHECK_GE(params.unresponsive_time.InMilliseconds(),
    157             params.sleep_time.InMilliseconds());
    158 
    159   // If we are not on WatchDogThread, then post a task to call StartWatching on
    160   // WatchDogThread.
    161   if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
    162     WatchDogThread::PostTask(
    163         FROM_HERE,
    164         base::Bind(&ThreadWatcher::StartWatching, params));
    165     return;
    166   }
    167 
    168   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    169 
    170   // Create a new thread watcher object for the given thread and activate it.
    171   ThreadWatcher* watcher = new ThreadWatcher(params);
    172 
    173   DCHECK(watcher);
    174   // If we couldn't register the thread watcher object, we are shutting down,
    175   // then don't activate thread watching.
    176   if (!ThreadWatcherList::IsRegistered(params.thread_id))
    177     return;
    178   watcher->ActivateThreadWatching();
    179 }
    180 
    181 void ThreadWatcher::ActivateThreadWatching() {
    182   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    183   if (active_) return;
    184   active_ = true;
    185   ping_count_ = unresponsive_threshold_;
    186   ResetHangCounters();
    187   base::MessageLoop::current()->PostTask(
    188       FROM_HERE,
    189       base::Bind(&ThreadWatcher::PostPingMessage,
    190                  weak_ptr_factory_.GetWeakPtr()));
    191 }
    192 
    193 void ThreadWatcher::DeActivateThreadWatching() {
    194   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    195   active_ = false;
    196   ping_count_ = 0;
    197   weak_ptr_factory_.InvalidateWeakPtrs();
    198 }
    199 
    200 void ThreadWatcher::WakeUp() {
    201   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    202   // There is some user activity, PostPingMessage task of thread watcher if
    203   // needed.
    204   if (!active_) return;
    205 
    206   // Throw away the previous |unresponsive_count_| and start over again. Just
    207   // before going to sleep, |unresponsive_count_| could be very close to
    208   // |unresponsive_threshold_| and when user becomes active,
    209   // |unresponsive_count_| can go over |unresponsive_threshold_| if there was no
    210   // response for ping messages. Reset |unresponsive_count_| to start measuring
    211   // the unresponsiveness of the threads when system becomes active.
    212   unresponsive_count_ = 0;
    213 
    214   if (ping_count_ <= 0) {
    215     ping_count_ = unresponsive_threshold_;
    216     ResetHangCounters();
    217     PostPingMessage();
    218   } else {
    219     ping_count_ = unresponsive_threshold_;
    220   }
    221 }
    222 
    223 void ThreadWatcher::PostPingMessage() {
    224   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    225   // If we have stopped watching or if the user is idle, then stop sending
    226   // ping messages.
    227   if (!active_ || ping_count_ <= 0)
    228     return;
    229 
    230   // Save the current time when we have sent ping message.
    231   ping_time_ = base::TimeTicks::Now();
    232 
    233   // Send a ping message to the watched thread. Callback will be called on
    234   // the WatchDogThread.
    235   base::Closure callback(
    236       base::Bind(&ThreadWatcher::OnPongMessage, weak_ptr_factory_.GetWeakPtr(),
    237                  ping_sequence_number_));
    238   if (watched_loop_->PostTask(
    239           FROM_HERE,
    240           base::Bind(&ThreadWatcher::OnPingMessage, thread_id_,
    241                      callback))) {
    242       // Post a task to check the responsiveness of watched thread.
    243       base::MessageLoop::current()->PostDelayedTask(
    244           FROM_HERE,
    245           base::Bind(&ThreadWatcher::OnCheckResponsiveness,
    246                      weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
    247           unresponsive_time_);
    248   } else {
    249     // Watched thread might have gone away, stop watching it.
    250     DeActivateThreadWatching();
    251   }
    252 }
    253 
    254 void ThreadWatcher::OnPongMessage(uint64 ping_sequence_number) {
    255   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    256 
    257   // Record watched thread's response time.
    258   base::TimeTicks now = base::TimeTicks::Now();
    259   base::TimeDelta response_time = now - ping_time_;
    260   response_time_histogram_->AddTime(response_time);
    261 
    262   // Save the current time when we have got pong message.
    263   pong_time_ = now;
    264 
    265   // Check if there are any extra pings in flight.
    266   DCHECK_EQ(ping_sequence_number_, ping_sequence_number);
    267   if (ping_sequence_number_ != ping_sequence_number)
    268     return;
    269 
    270   // Increment sequence number for the next ping message to indicate watched
    271   // thread is responsive.
    272   ++ping_sequence_number_;
    273 
    274   // If we have stopped watching or if the user is idle, then stop sending
    275   // ping messages.
    276   if (!active_ || --ping_count_ <= 0)
    277     return;
    278 
    279   base::MessageLoop::current()->PostDelayedTask(
    280       FROM_HERE,
    281       base::Bind(&ThreadWatcher::PostPingMessage,
    282                  weak_ptr_factory_.GetWeakPtr()),
    283       sleep_time_);
    284 }
    285 
    286 void ThreadWatcher::OnCheckResponsiveness(uint64 ping_sequence_number) {
    287   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    288   // If we have stopped watching then consider thread as responding.
    289   if (!active_) {
    290     responsive_ = true;
    291     return;
    292   }
    293   // If the latest ping_sequence_number_ is not same as the ping_sequence_number
    294   // that is passed in, then we can assume OnPongMessage was called.
    295   // OnPongMessage increments ping_sequence_number_.
    296   if (ping_sequence_number_ != ping_sequence_number) {
    297     // Reset unresponsive_count_ to zero because we got a response from the
    298     // watched thread.
    299     ResetHangCounters();
    300 
    301     responsive_ = true;
    302     return;
    303   }
    304   // Record that we got no response from watched thread.
    305   GotNoResponse();
    306 
    307   // Post a task to check the responsiveness of watched thread.
    308   base::MessageLoop::current()->PostDelayedTask(
    309       FROM_HERE,
    310       base::Bind(&ThreadWatcher::OnCheckResponsiveness,
    311                  weak_ptr_factory_.GetWeakPtr(), ping_sequence_number_),
    312       unresponsive_time_);
    313   responsive_ = false;
    314 }
    315 
    316 void ThreadWatcher::Initialize() {
    317   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    318   ThreadWatcherList::Register(this);
    319 
    320   const std::string response_time_histogram_name =
    321       "ThreadWatcher.ResponseTime." + thread_name_;
    322   response_time_histogram_ = base::Histogram::FactoryTimeGet(
    323       response_time_histogram_name,
    324       base::TimeDelta::FromMilliseconds(1),
    325       base::TimeDelta::FromSeconds(100), 50,
    326       base::Histogram::kUmaTargetedHistogramFlag);
    327 
    328   const std::string unresponsive_time_histogram_name =
    329       "ThreadWatcher.Unresponsive." + thread_name_;
    330   unresponsive_time_histogram_ = base::Histogram::FactoryTimeGet(
    331       unresponsive_time_histogram_name,
    332       base::TimeDelta::FromMilliseconds(1),
    333       base::TimeDelta::FromSeconds(100), 50,
    334       base::Histogram::kUmaTargetedHistogramFlag);
    335 
    336   const std::string responsive_count_histogram_name =
    337       "ThreadWatcher.ResponsiveThreads." + thread_name_;
    338   responsive_count_histogram_ = base::LinearHistogram::FactoryGet(
    339       responsive_count_histogram_name, 1, 10, 11,
    340       base::Histogram::kUmaTargetedHistogramFlag);
    341 
    342   const std::string unresponsive_count_histogram_name =
    343       "ThreadWatcher.UnresponsiveThreads." + thread_name_;
    344   unresponsive_count_histogram_ = base::LinearHistogram::FactoryGet(
    345       unresponsive_count_histogram_name, 1, 10, 11,
    346       base::Histogram::kUmaTargetedHistogramFlag);
    347 }
    348 
    349 // static
    350 void ThreadWatcher::OnPingMessage(const BrowserThread::ID& thread_id,
    351                                   const base::Closure& callback_task) {
    352   // This method is called on watched thread.
    353   DCHECK(BrowserThread::CurrentlyOn(thread_id));
    354   WatchDogThread::PostTask(FROM_HERE, callback_task);
    355 }
    356 
    357 void ThreadWatcher::ResetHangCounters() {
    358   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    359   unresponsive_count_ = 0;
    360   hung_processing_complete_ = false;
    361 }
    362 
    363 void ThreadWatcher::GotNoResponse() {
    364   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    365 
    366   ++unresponsive_count_;
    367   if (!IsVeryUnresponsive())
    368     return;
    369 
    370   // Record total unresponsive_time since last pong message.
    371   base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_;
    372   unresponsive_time_histogram_->AddTime(unresponse_time);
    373 
    374   // We have already collected stats for the non-responding watched thread.
    375   if (hung_processing_complete_)
    376     return;
    377 
    378   // Record how other threads are responding.
    379   uint32 responding_thread_count = 0;
    380   uint32 unresponding_thread_count = 0;
    381   ThreadWatcherList::GetStatusOfThreads(&responding_thread_count,
    382                                         &unresponding_thread_count);
    383 
    384   // Record how many watched threads are responding.
    385   responsive_count_histogram_->Add(responding_thread_count);
    386 
    387   // Record how many watched threads are not responding.
    388   unresponsive_count_histogram_->Add(unresponding_thread_count);
    389 
    390   // Crash the browser if the watched thread is to be crashed on hang and if the
    391   // number of other threads responding is less than or equal to
    392   // live_threads_threshold_ and at least one other thread is responding.
    393   if (crash_on_hang_ &&
    394       responding_thread_count > 0 &&
    395       responding_thread_count <= live_threads_threshold_) {
    396     static bool crashed_once = false;
    397     if (!crashed_once) {
    398       crashed_once = true;
    399       CrashBecauseThreadWasUnresponsive(thread_id_);
    400     }
    401   }
    402 
    403   hung_processing_complete_ = true;
    404 }
    405 
    406 bool ThreadWatcher::IsVeryUnresponsive() {
    407   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    408   return unresponsive_count_ >= unresponsive_threshold_;
    409 }
    410 
    411 // ThreadWatcherList methods and members.
    412 //
    413 // static
    414 ThreadWatcherList* ThreadWatcherList::g_thread_watcher_list_ = NULL;
    415 // static
    416 bool ThreadWatcherList::g_stopped_ = false;
    417 // static
    418 const int ThreadWatcherList::kSleepSeconds = 1;
    419 // static
    420 const int ThreadWatcherList::kUnresponsiveSeconds = 2;
    421 // static
    422 const int ThreadWatcherList::kUnresponsiveCount = 9;
    423 // static
    424 const int ThreadWatcherList::kLiveThreadsThreshold = 2;
    425 // static, non-const for tests.
    426 int ThreadWatcherList::g_initialize_delay_seconds = 120;
    427 
    428 ThreadWatcherList::CrashDataThresholds::CrashDataThresholds(
    429     uint32 live_threads_threshold,
    430     uint32 unresponsive_threshold)
    431     : live_threads_threshold(live_threads_threshold),
    432       unresponsive_threshold(unresponsive_threshold) {
    433 }
    434 
    435 ThreadWatcherList::CrashDataThresholds::CrashDataThresholds()
    436     : live_threads_threshold(kLiveThreadsThreshold),
    437       unresponsive_threshold(kUnresponsiveCount) {
    438 }
    439 
    440 // static
    441 void ThreadWatcherList::StartWatchingAll(const CommandLine& command_line) {
    442   // TODO(rtenneti): Enable ThreadWatcher.
    443   uint32 unresponsive_threshold;
    444   CrashOnHangThreadMap crash_on_hang_threads;
    445   ParseCommandLine(command_line,
    446                    &unresponsive_threshold,
    447                    &crash_on_hang_threads);
    448 
    449   ThreadWatcherObserver::SetupNotifications(
    450       base::TimeDelta::FromSeconds(kSleepSeconds * unresponsive_threshold));
    451 
    452   WatchDogThread::PostTask(
    453       FROM_HERE,
    454       base::Bind(&ThreadWatcherList::SetStopped, false));
    455 
    456   WatchDogThread::PostDelayedTask(
    457       FROM_HERE,
    458       base::Bind(&ThreadWatcherList::InitializeAndStartWatching,
    459                  unresponsive_threshold,
    460                  crash_on_hang_threads),
    461       base::TimeDelta::FromSeconds(g_initialize_delay_seconds));
    462 }
    463 
    464 // static
    465 void ThreadWatcherList::StopWatchingAll() {
    466   // TODO(rtenneti): Enable ThreadWatcher.
    467   ThreadWatcherObserver::RemoveNotifications();
    468   DeleteAll();
    469 }
    470 
    471 // static
    472 void ThreadWatcherList::Register(ThreadWatcher* watcher) {
    473   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    474   if (!g_thread_watcher_list_)
    475     return;
    476   DCHECK(!g_thread_watcher_list_->Find(watcher->thread_id()));
    477   g_thread_watcher_list_->registered_[watcher->thread_id()] = watcher;
    478 }
    479 
    480 // static
    481 bool ThreadWatcherList::IsRegistered(const BrowserThread::ID thread_id) {
    482   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    483   return NULL != ThreadWatcherList::Find(thread_id);
    484 }
    485 
    486 // static
    487 void ThreadWatcherList::GetStatusOfThreads(uint32* responding_thread_count,
    488                                            uint32* unresponding_thread_count) {
    489   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    490   *responding_thread_count = 0;
    491   *unresponding_thread_count = 0;
    492   if (!g_thread_watcher_list_)
    493     return;
    494 
    495   for (RegistrationList::iterator it =
    496            g_thread_watcher_list_->registered_.begin();
    497        g_thread_watcher_list_->registered_.end() != it;
    498        ++it) {
    499     if (it->second->IsVeryUnresponsive())
    500       ++(*unresponding_thread_count);
    501     else
    502       ++(*responding_thread_count);
    503   }
    504 }
    505 
    506 // static
    507 void ThreadWatcherList::WakeUpAll() {
    508   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    509   if (!g_thread_watcher_list_)
    510     return;
    511 
    512   for (RegistrationList::iterator it =
    513            g_thread_watcher_list_->registered_.begin();
    514        g_thread_watcher_list_->registered_.end() != it;
    515        ++it)
    516     it->second->WakeUp();
    517 }
    518 
    519 ThreadWatcherList::ThreadWatcherList() {
    520   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    521   CHECK(!g_thread_watcher_list_);
    522   g_thread_watcher_list_ = this;
    523 }
    524 
    525 ThreadWatcherList::~ThreadWatcherList() {
    526   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    527   DCHECK(this == g_thread_watcher_list_);
    528   g_thread_watcher_list_ = NULL;
    529 }
    530 
    531 // static
    532 void ThreadWatcherList::ParseCommandLine(
    533     const CommandLine& command_line,
    534     uint32* unresponsive_threshold,
    535     CrashOnHangThreadMap* crash_on_hang_threads) {
    536   // Initialize |unresponsive_threshold| to a default value.
    537   // TODO(rtenneti): Changed the default value to 4 times, until we can triage
    538   // hangs automatically (and to reduce the crash dumps).
    539   *unresponsive_threshold = kUnresponsiveCount * 4;
    540 
    541   // Increase the unresponsive_threshold on the Stable and Beta channels to
    542   // reduce the number of crashes due to ThreadWatcher.
    543   chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
    544   if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
    545     *unresponsive_threshold *= 4;
    546   } else if (channel == chrome::VersionInfo::CHANNEL_BETA) {
    547     *unresponsive_threshold *= 2;
    548   }
    549 
    550 #if defined(OS_WIN)
    551   // For Windows XP (old systems), double the unresponsive_threshold to give
    552   // the OS a chance to schedule UI/IO threads a time slice to respond with a
    553   // pong message (to get around limitations with the OS).
    554   if (base::win::GetVersion() <= base::win::VERSION_XP)
    555     *unresponsive_threshold *= 2;
    556 #endif
    557 
    558   uint32 crash_seconds = *unresponsive_threshold * kUnresponsiveSeconds;
    559   std::string crash_on_hang_thread_names;
    560   bool has_command_line_overwrite = false;
    561   if (command_line.HasSwitch(switches::kCrashOnHangThreads)) {
    562     crash_on_hang_thread_names =
    563         command_line.GetSwitchValueASCII(switches::kCrashOnHangThreads);
    564     has_command_line_overwrite = true;
    565   } else if (channel != chrome::VersionInfo::CHANNEL_STABLE) {
    566     // Default to crashing the browser if UI or IO or FILE threads are not
    567     // responsive except in stable channel.
    568     crash_on_hang_thread_names = base::StringPrintf(
    569         "UI:%d:%d,IO:%d:%d,FILE:%d:%d",
    570         kLiveThreadsThreshold, crash_seconds,
    571         kLiveThreadsThreshold, crash_seconds,
    572         kLiveThreadsThreshold, crash_seconds * 5);
    573   }
    574 
    575   ParseCommandLineCrashOnHangThreads(crash_on_hang_thread_names,
    576                                      kLiveThreadsThreshold,
    577                                      crash_seconds,
    578                                      crash_on_hang_threads);
    579 
    580   if (channel != chrome::VersionInfo::CHANNEL_CANARY ||
    581       has_command_line_overwrite) {
    582     return;
    583   }
    584 
    585   const char* kFieldTrialName = "ThreadWatcher";
    586 
    587   // Nothing else to be done if the trial has already been set (i.e., when
    588   // StartWatchingAll() has been already called once).
    589   if (base::FieldTrialList::TrialExists(kFieldTrialName))
    590     return;
    591 
    592   // Set up a field trial for 100% of the users to crash if either UI or IO
    593   // thread is not responsive for 30 seconds (or 15 pings).
    594   scoped_refptr<base::FieldTrial> field_trial(
    595       base::FieldTrialList::FactoryGetFieldTrial(
    596           kFieldTrialName, 100, "default_hung_threads",
    597           2014, 10, 30, base::FieldTrial::SESSION_RANDOMIZED, NULL));
    598   int hung_thread_group = field_trial->AppendGroup("hung_thread", 100);
    599   if (field_trial->group() == hung_thread_group) {
    600     for (CrashOnHangThreadMap::iterator it = crash_on_hang_threads->begin();
    601          crash_on_hang_threads->end() != it;
    602          ++it) {
    603       if (it->first == "FILE")
    604         continue;
    605       it->second.live_threads_threshold = INT_MAX;
    606       if (it->first == "UI") {
    607         // TODO(rtenneti): set unresponsive threshold to 120 seconds to catch
    608         // the worst UI hangs and for fewer crashes due to ThreadWatcher. Reduce
    609         // it to a more reasonable time ala IO thread.
    610         it->second.unresponsive_threshold = 60;
    611       } else {
    612         it->second.unresponsive_threshold = 15;
    613       }
    614     }
    615   }
    616 }
    617 
    618 // static
    619 void ThreadWatcherList::ParseCommandLineCrashOnHangThreads(
    620     const std::string& crash_on_hang_thread_names,
    621     uint32 default_live_threads_threshold,
    622     uint32 default_crash_seconds,
    623     CrashOnHangThreadMap* crash_on_hang_threads) {
    624   base::StringTokenizer tokens(crash_on_hang_thread_names, ",");
    625   std::vector<std::string> values;
    626   while (tokens.GetNext()) {
    627     const std::string& token = tokens.token();
    628     base::SplitString(token, ':', &values);
    629     std::string thread_name = values[0];
    630 
    631     uint32 live_threads_threshold = default_live_threads_threshold;
    632     uint32 crash_seconds = default_crash_seconds;
    633     if (values.size() >= 2 &&
    634         (!base::StringToUint(values[1], &live_threads_threshold))) {
    635       continue;
    636     }
    637     if (values.size() >= 3 &&
    638         (!base::StringToUint(values[2], &crash_seconds))) {
    639       continue;
    640     }
    641     uint32 unresponsive_threshold = static_cast<uint32>(
    642         ceil(static_cast<float>(crash_seconds) / kUnresponsiveSeconds));
    643 
    644     CrashDataThresholds crash_data(live_threads_threshold,
    645                                    unresponsive_threshold);
    646     // Use the last specifier.
    647     (*crash_on_hang_threads)[thread_name] = crash_data;
    648   }
    649 }
    650 
    651 // static
    652 void ThreadWatcherList::InitializeAndStartWatching(
    653     uint32 unresponsive_threshold,
    654     const CrashOnHangThreadMap& crash_on_hang_threads) {
    655   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    656 
    657   // Disarm the startup timebomb, even if stop has been called.
    658   BrowserThread::PostTask(
    659       BrowserThread::UI,
    660       FROM_HERE,
    661       base::Bind(&StartupTimeBomb::DisarmStartupTimeBomb));
    662 
    663   // This method is deferred in relationship to its StopWatchingAll()
    664   // counterpart. If a previous initialization has already happened, or if
    665   // stop has been called, there's nothing left to do here.
    666   if (g_thread_watcher_list_ || g_stopped_)
    667     return;
    668 
    669   ThreadWatcherList* thread_watcher_list = new ThreadWatcherList();
    670   CHECK(thread_watcher_list);
    671 
    672   const base::TimeDelta kSleepTime =
    673       base::TimeDelta::FromSeconds(kSleepSeconds);
    674   const base::TimeDelta kUnresponsiveTime =
    675       base::TimeDelta::FromSeconds(kUnresponsiveSeconds);
    676 
    677   StartWatching(BrowserThread::UI, "UI", kSleepTime, kUnresponsiveTime,
    678                 unresponsive_threshold, crash_on_hang_threads);
    679   StartWatching(BrowserThread::IO, "IO", kSleepTime, kUnresponsiveTime,
    680                 unresponsive_threshold, crash_on_hang_threads);
    681   StartWatching(BrowserThread::DB, "DB", kSleepTime, kUnresponsiveTime,
    682                 unresponsive_threshold, crash_on_hang_threads);
    683   StartWatching(BrowserThread::FILE, "FILE", kSleepTime, kUnresponsiveTime,
    684                 unresponsive_threshold, crash_on_hang_threads);
    685   StartWatching(BrowserThread::CACHE, "CACHE", kSleepTime, kUnresponsiveTime,
    686                 unresponsive_threshold, crash_on_hang_threads);
    687 }
    688 
    689 // static
    690 void ThreadWatcherList::StartWatching(
    691     const BrowserThread::ID& thread_id,
    692     const std::string& thread_name,
    693     const base::TimeDelta& sleep_time,
    694     const base::TimeDelta& unresponsive_time,
    695     uint32 unresponsive_threshold,
    696     const CrashOnHangThreadMap& crash_on_hang_threads) {
    697   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    698 
    699   CrashOnHangThreadMap::const_iterator it =
    700       crash_on_hang_threads.find(thread_name);
    701   bool crash_on_hang = false;
    702   uint32 live_threads_threshold = 0;
    703   if (it != crash_on_hang_threads.end()) {
    704     crash_on_hang = true;
    705     live_threads_threshold = it->second.live_threads_threshold;
    706     unresponsive_threshold = it->second.unresponsive_threshold;
    707   }
    708 
    709   ThreadWatcher::StartWatching(
    710       ThreadWatcher::WatchingParams(thread_id,
    711                                     thread_name,
    712                                     sleep_time,
    713                                     unresponsive_time,
    714                                     unresponsive_threshold,
    715                                     crash_on_hang,
    716                                     live_threads_threshold));
    717 }
    718 
    719 // static
    720 void ThreadWatcherList::DeleteAll() {
    721   if (!WatchDogThread::CurrentlyOnWatchDogThread()) {
    722     WatchDogThread::PostTask(
    723         FROM_HERE,
    724         base::Bind(&ThreadWatcherList::DeleteAll));
    725     return;
    726   }
    727 
    728   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    729 
    730   SetStopped(true);
    731 
    732   if (!g_thread_watcher_list_)
    733     return;
    734 
    735   // Delete all thread watcher objects.
    736   while (!g_thread_watcher_list_->registered_.empty()) {
    737     RegistrationList::iterator it = g_thread_watcher_list_->registered_.begin();
    738     delete it->second;
    739     g_thread_watcher_list_->registered_.erase(it);
    740   }
    741 
    742   delete g_thread_watcher_list_;
    743 }
    744 
    745 // static
    746 ThreadWatcher* ThreadWatcherList::Find(const BrowserThread::ID& thread_id) {
    747   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    748   if (!g_thread_watcher_list_)
    749     return NULL;
    750   RegistrationList::iterator it =
    751       g_thread_watcher_list_->registered_.find(thread_id);
    752   if (g_thread_watcher_list_->registered_.end() == it)
    753     return NULL;
    754   return it->second;
    755 }
    756 
    757 // static
    758 void ThreadWatcherList::SetStopped(bool stopped) {
    759   DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
    760   g_stopped_ = stopped;
    761 }
    762 
    763 // ThreadWatcherObserver methods and members.
    764 //
    765 // static
    766 ThreadWatcherObserver* ThreadWatcherObserver::g_thread_watcher_observer_ = NULL;
    767 
    768 ThreadWatcherObserver::ThreadWatcherObserver(
    769     const base::TimeDelta& wakeup_interval)
    770     : last_wakeup_time_(base::TimeTicks::Now()),
    771       wakeup_interval_(wakeup_interval) {
    772   CHECK(!g_thread_watcher_observer_);
    773   g_thread_watcher_observer_ = this;
    774 }
    775 
    776 ThreadWatcherObserver::~ThreadWatcherObserver() {
    777   DCHECK(this == g_thread_watcher_observer_);
    778   g_thread_watcher_observer_ = NULL;
    779 }
    780 
    781 // static
    782 void ThreadWatcherObserver::SetupNotifications(
    783     const base::TimeDelta& wakeup_interval) {
    784   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    785   ThreadWatcherObserver* observer = new ThreadWatcherObserver(wakeup_interval);
    786   observer->registrar_.Add(
    787       observer,
    788       chrome::NOTIFICATION_BROWSER_OPENED,
    789       content::NotificationService::AllBrowserContextsAndSources());
    790   observer->registrar_.Add(observer,
    791                            chrome::NOTIFICATION_BROWSER_CLOSED,
    792                            content::NotificationService::AllSources());
    793   observer->registrar_.Add(observer,
    794                            chrome::NOTIFICATION_TAB_PARENTED,
    795                            content::NotificationService::AllSources());
    796   observer->registrar_.Add(observer,
    797                            chrome::NOTIFICATION_TAB_CLOSING,
    798                            content::NotificationService::AllSources());
    799   observer->registrar_.Add(observer,
    800                            content::NOTIFICATION_LOAD_START,
    801                            content::NotificationService::AllSources());
    802   observer->registrar_.Add(observer,
    803                            content::NOTIFICATION_LOAD_STOP,
    804                            content::NotificationService::AllSources());
    805   observer->registrar_.Add(observer,
    806                            content::NOTIFICATION_RENDERER_PROCESS_CLOSED,
    807                            content::NotificationService::AllSources());
    808   observer->registrar_.Add(observer,
    809                            content::NOTIFICATION_RENDER_WIDGET_HOST_HANG,
    810                            content::NotificationService::AllSources());
    811   observer->registrar_.Add(observer,
    812                            chrome::NOTIFICATION_OMNIBOX_OPENED_URL,
    813                            content::NotificationService::AllSources());
    814 }
    815 
    816 // static
    817 void ThreadWatcherObserver::RemoveNotifications() {
    818   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
    819   if (!g_thread_watcher_observer_)
    820     return;
    821   g_thread_watcher_observer_->registrar_.RemoveAll();
    822   delete g_thread_watcher_observer_;
    823 }
    824 
    825 void ThreadWatcherObserver::Observe(
    826     int type,
    827     const content::NotificationSource& source,
    828     const content::NotificationDetails& details) {
    829   // There is some user activity, see if thread watchers are to be awakened.
    830   base::TimeTicks now = base::TimeTicks::Now();
    831   if ((now - last_wakeup_time_) < wakeup_interval_)
    832     return;
    833   last_wakeup_time_ = now;
    834   WatchDogThread::PostTask(
    835       FROM_HERE,
    836       base::Bind(&ThreadWatcherList::WakeUpAll));
    837 }
    838 
    839 // WatchDogThread methods and members.
    840 
    841 // This lock protects g_watchdog_thread.
    842 static base::LazyInstance<base::Lock>::Leaky
    843     g_watchdog_lock = LAZY_INSTANCE_INITIALIZER;
    844 
    845 // The singleton of this class.
    846 static WatchDogThread* g_watchdog_thread = NULL;
    847 
    848 WatchDogThread::WatchDogThread() : Thread("BrowserWatchdog") {
    849 }
    850 
    851 WatchDogThread::~WatchDogThread() {
    852   Stop();
    853 }
    854 
    855 // static
    856 bool WatchDogThread::CurrentlyOnWatchDogThread() {
    857   base::AutoLock lock(g_watchdog_lock.Get());
    858   return g_watchdog_thread &&
    859       g_watchdog_thread->message_loop() == base::MessageLoop::current();
    860 }
    861 
    862 // static
    863 bool WatchDogThread::PostTask(const tracked_objects::Location& from_here,
    864                               const base::Closure& task) {
    865   return PostTaskHelper(from_here, task, base::TimeDelta());
    866 }
    867 
    868 // static
    869 bool WatchDogThread::PostDelayedTask(const tracked_objects::Location& from_here,
    870                                      const base::Closure& task,
    871                                      base::TimeDelta delay) {
    872   return PostTaskHelper(from_here, task, delay);
    873 }
    874 
    875 // static
    876 bool WatchDogThread::PostTaskHelper(
    877     const tracked_objects::Location& from_here,
    878     const base::Closure& task,
    879     base::TimeDelta delay) {
    880   {
    881     base::AutoLock lock(g_watchdog_lock.Get());
    882 
    883     base::MessageLoop* message_loop = g_watchdog_thread ?
    884         g_watchdog_thread->message_loop() : NULL;
    885     if (message_loop) {
    886       message_loop->PostDelayedTask(from_here, task, delay);
    887       return true;
    888     }
    889   }
    890 
    891   return false;
    892 }
    893 
    894 void WatchDogThread::Init() {
    895   // This thread shouldn't be allowed to perform any blocking disk I/O.
    896   base::ThreadRestrictions::SetIOAllowed(false);
    897 
    898   base::AutoLock lock(g_watchdog_lock.Get());
    899   CHECK(!g_watchdog_thread);
    900   g_watchdog_thread = this;
    901 }
    902 
    903 void WatchDogThread::CleanUp() {
    904   base::AutoLock lock(g_watchdog_lock.Get());
    905   g_watchdog_thread = NULL;
    906 }
    907 
    908 namespace {
    909 
    910 // StartupWatchDogThread methods and members.
    911 //
    912 // Class for detecting hangs during startup.
    913 class StartupWatchDogThread : public base::Watchdog {
    914  public:
    915   // Constructor specifies how long the StartupWatchDogThread will wait before
    916   // alarming.
    917   explicit StartupWatchDogThread(const base::TimeDelta& duration)
    918       : base::Watchdog(duration, "Startup watchdog thread", true) {
    919 #if defined(OS_ANDROID)
    920     // TODO(rtenneti): Delete this code, after getting data.
    921     start_time_clock_= base::Time::Now();
    922     start_time_monotonic_ = base::TimeTicks::Now();
    923     start_time_thread_now_ = base::TimeTicks::IsThreadNowSupported()
    924         ? base::TimeTicks::ThreadNow() : base::TimeTicks::Now();
    925 #endif  // OS_ANDROID
    926   }
    927 
    928   // Alarm is called if the time expires after an Arm() without someone calling
    929   // Disarm(). When Alarm goes off, in release mode we get the crash dump
    930   // without crashing and in debug mode we break into the debugger.
    931   virtual void Alarm() OVERRIDE {
    932 #if !defined(NDEBUG)
    933     StartupHang();
    934     return;
    935 #elif !defined(OS_ANDROID)
    936     WatchDogThread::PostTask(FROM_HERE, base::Bind(&StartupHang));
    937     return;
    938 #else  // Android release: gather stats to figure out when to crash.
    939     // TODO(rtenneti): Delete this code, after getting data.
    940     UMA_HISTOGRAM_TIMES("StartupTimeBomb.Alarm.TimeDuration",
    941                         base::Time::Now() - start_time_clock_);
    942     UMA_HISTOGRAM_TIMES("StartupTimeBomb.Alarm.TimeTicksDuration",
    943                         base::TimeTicks::Now() - start_time_monotonic_);
    944     if (base::TimeTicks::IsThreadNowSupported()) {
    945       UMA_HISTOGRAM_TIMES(
    946           "StartupTimeBomb.Alarm.ThreadNowDuration",
    947           base::TimeTicks::ThreadNow() - start_time_thread_now_);
    948     }
    949     return;
    950 #endif  // OS_ANDROID
    951   }
    952 
    953  private:
    954 #if defined(OS_ANDROID)
    955   // TODO(rtenneti): Delete this code, after getting data.
    956   base::Time start_time_clock_;
    957   base::TimeTicks start_time_monotonic_;
    958   base::TimeTicks start_time_thread_now_;
    959 #endif  // OS_ANDROID
    960 
    961   DISALLOW_COPY_AND_ASSIGN(StartupWatchDogThread);
    962 };
    963 
    964 // ShutdownWatchDogThread methods and members.
    965 //
    966 // Class for detecting hangs during shutdown.
    967 class ShutdownWatchDogThread : public base::Watchdog {
    968  public:
    969   // Constructor specifies how long the ShutdownWatchDogThread will wait before
    970   // alarming.
    971   explicit ShutdownWatchDogThread(const base::TimeDelta& duration)
    972       : base::Watchdog(duration, "Shutdown watchdog thread", true) {
    973   }
    974 
    975   // Alarm is called if the time expires after an Arm() without someone calling
    976   // Disarm(). We crash the browser if this method is called.
    977   virtual void Alarm() OVERRIDE {
    978     ShutdownHang();
    979   }
    980 
    981  private:
    982   DISALLOW_COPY_AND_ASSIGN(ShutdownWatchDogThread);
    983 };
    984 }  // namespace
    985 
    986 // StartupTimeBomb methods and members.
    987 //
    988 // static
    989 StartupTimeBomb* StartupTimeBomb::g_startup_timebomb_ = NULL;
    990 
    991 StartupTimeBomb::StartupTimeBomb()
    992     : startup_watchdog_(NULL),
    993       thread_id_(base::PlatformThread::CurrentId()) {
    994   CHECK(!g_startup_timebomb_);
    995   g_startup_timebomb_ = this;
    996 }
    997 
    998 StartupTimeBomb::~StartupTimeBomb() {
    999   DCHECK(this == g_startup_timebomb_);
   1000   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
   1001   if (startup_watchdog_)
   1002     Disarm();
   1003   g_startup_timebomb_ = NULL;
   1004 }
   1005 
   1006 void StartupTimeBomb::Arm(const base::TimeDelta& duration) {
   1007   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
   1008   DCHECK(!startup_watchdog_);
   1009   startup_watchdog_ = new StartupWatchDogThread(duration);
   1010   startup_watchdog_->Arm();
   1011   return;
   1012 }
   1013 
   1014 void StartupTimeBomb::Disarm() {
   1015   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
   1016   if (startup_watchdog_) {
   1017     startup_watchdog_->Disarm();
   1018     startup_watchdog_->Cleanup();
   1019     DeleteStartupWatchdog();
   1020   }
   1021 }
   1022 
   1023 void StartupTimeBomb::DeleteStartupWatchdog() {
   1024   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
   1025   if (startup_watchdog_->IsJoinable()) {
   1026     // Allow the watchdog thread to shutdown on UI. Watchdog thread shutdowns
   1027     // very fast.
   1028     base::ThreadRestrictions::SetIOAllowed(true);
   1029     delete startup_watchdog_;
   1030     startup_watchdog_ = NULL;
   1031     return;
   1032   }
   1033   base::MessageLoop::current()->PostDelayedTask(
   1034       FROM_HERE,
   1035       base::Bind(&StartupTimeBomb::DeleteStartupWatchdog,
   1036                  base::Unretained(this)),
   1037       base::TimeDelta::FromSeconds(10));
   1038 }
   1039 
   1040 // static
   1041 void StartupTimeBomb::DisarmStartupTimeBomb() {
   1042   DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
   1043   if (g_startup_timebomb_)
   1044     g_startup_timebomb_->Disarm();
   1045 }
   1046 
   1047 // ShutdownWatcherHelper methods and members.
   1048 //
   1049 // ShutdownWatcherHelper is a wrapper class for detecting hangs during
   1050 // shutdown.
   1051 ShutdownWatcherHelper::ShutdownWatcherHelper()
   1052     : shutdown_watchdog_(NULL),
   1053       thread_id_(base::PlatformThread::CurrentId()) {
   1054 }
   1055 
   1056 ShutdownWatcherHelper::~ShutdownWatcherHelper() {
   1057   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
   1058   if (shutdown_watchdog_) {
   1059     shutdown_watchdog_->Disarm();
   1060     delete shutdown_watchdog_;
   1061     shutdown_watchdog_ = NULL;
   1062   }
   1063 }
   1064 
   1065 void ShutdownWatcherHelper::Arm(const base::TimeDelta& duration) {
   1066   DCHECK_EQ(thread_id_, base::PlatformThread::CurrentId());
   1067   DCHECK(!shutdown_watchdog_);
   1068   base::TimeDelta actual_duration = duration;
   1069 
   1070   chrome::VersionInfo::Channel channel = chrome::VersionInfo::GetChannel();
   1071   if (channel == chrome::VersionInfo::CHANNEL_STABLE) {
   1072     actual_duration *= 20;
   1073   } else if (channel == chrome::VersionInfo::CHANNEL_BETA ||
   1074              channel == chrome::VersionInfo::CHANNEL_DEV) {
   1075     actual_duration *= 10;
   1076   }
   1077 
   1078 #if defined(OS_WIN)
   1079   // On Windows XP, give twice the time for shutdown.
   1080   if (base::win::GetVersion() <= base::win::VERSION_XP)
   1081     actual_duration *= 2;
   1082 #endif
   1083 
   1084   shutdown_watchdog_ = new ShutdownWatchDogThread(actual_duration);
   1085   shutdown_watchdog_->Arm();
   1086 }
   1087