// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file defines a WatchDog thread that monitors the responsiveness of other
// browser threads like UI, IO, DB, FILE and CACHE threads. It also defines
// ThreadWatcher class which performs health check on threads that would like to
// be watched. This file also defines ThreadWatcherList class that has list of
// all active ThreadWatcher objects.
//
// ThreadWatcher class sends ping message to the watched thread and the watched
// thread responds back with a pong message. It uploads response time
// (difference between ping and pong times) as a histogram.
//
// TODO(raman): ThreadWatcher can detect hung threads. If a hung thread is
// detected, we should probably just crash, and allow the crash system to gather
// the stack trace.
//
// Example Usage:
//
//   The following is an example for watching responsiveness of watched (IO)
//   thread. |sleep_time| specifies how often ping messages have to be sent to
//   watched (IO) thread. |unresponsive_time| is the wait time after ping
//   message is sent, to check if we have received pong message or not.
//   |unresponsive_threshold| specifies the number of unanswered ping messages
//   after which watched (IO) thread is considered as not responsive.
//   |crash_on_hang| specifies if we want to crash the browser when the watched
//   (IO) thread has become sufficiently unresponsive, while other threads are
//   sufficiently responsive. |live_threads_threshold| specifies the number of
//   browser threads that are to be responsive when we want to crash the browser
//   because of hung watched (IO) thread.
// // base::TimeDelta sleep_time = base::TimeDelta::FromSeconds(5); // base::TimeDelta unresponsive_time = base::TimeDelta::FromSeconds(10); // uint32 unresponsive_threshold = ThreadWatcherList::kUnresponsiveCount; // bool crash_on_hang = false; // uint32 live_threads_threshold = ThreadWatcherList::kLiveThreadsThreshold; // ThreadWatcher::StartWatching( // BrowserThread::IO, "IO", sleep_time, unresponsive_time, // unresponsive_threshold, crash_on_hang, live_threads_threshold); #ifndef CHROME_BROWSER_METRICS_THREAD_WATCHER_H_ #define CHROME_BROWSER_METRICS_THREAD_WATCHER_H_ #include <map> #include <string> #include <vector> #include "base/basictypes.h" #include "base/command_line.h" #include "base/gtest_prod_util.h" #include "base/memory/ref_counted.h" #include "base/memory/weak_ptr.h" #include "base/message_loop/message_loop.h" #include "base/metrics/histogram.h" #include "base/synchronization/lock.h" #include "base/threading/platform_thread.h" #include "base/threading/thread.h" #include "base/threading/watchdog.h" #include "base/time/time.h" #include "content/public/browser/browser_thread.h" #include "content/public/browser/notification_observer.h" #include "content/public/browser/notification_registrar.h" class CustomThreadWatcher; class StartupTimeBomb; class ThreadWatcherList; class ThreadWatcherObserver; // This class performs health check on threads that would like to be watched. class ThreadWatcher { public: // base::Bind supports methods with up to 6 parameters. WatchingParams is used // as a workaround that limitation for invoking ThreadWatcher::StartWatching. 
struct WatchingParams { const content::BrowserThread::ID& thread_id; const std::string& thread_name; const base::TimeDelta& sleep_time; const base::TimeDelta& unresponsive_time; uint32 unresponsive_threshold; bool crash_on_hang; uint32 live_threads_threshold; WatchingParams(const content::BrowserThread::ID& thread_id_in, const std::string& thread_name_in, const base::TimeDelta& sleep_time_in, const base::TimeDelta& unresponsive_time_in, uint32 unresponsive_threshold_in, bool crash_on_hang_in, uint32 live_threads_threshold_in) : thread_id(thread_id_in), thread_name(thread_name_in), sleep_time(sleep_time_in), unresponsive_time(unresponsive_time_in), unresponsive_threshold(unresponsive_threshold_in), crash_on_hang(crash_on_hang_in), live_threads_threshold(live_threads_threshold_in) { } }; // This method starts performing health check on the given |thread_id|. It // will create ThreadWatcher object for the given |thread_id|, |thread_name|. // |sleep_time| is the wait time between ping messages. |unresponsive_time| is // the wait time after ping message is sent, to check if we have received pong // message or not. |unresponsive_threshold| is used to determine if the thread // is responsive or not. The watched thread is considered unresponsive if it // hasn't responded with a pong message for |unresponsive_threshold| number of // ping messages. |crash_on_hang| specifies if browser should be crashed when // the watched thread is unresponsive. |live_threads_threshold| specifies the // number of browser threads that are to be responsive when we want to crash // the browser and watched thread has become sufficiently unresponsive. It // will register that ThreadWatcher object and activate the thread watching of // the given thread_id. static void StartWatching(const WatchingParams& params); // Return the |thread_id_| of the thread being watched. content::BrowserThread::ID thread_id() const { return thread_id_; } // Return the name of the thread being watched. 
std::string thread_name() const { return thread_name_; } // Return the sleep time between ping messages to be sent to the thread. base::TimeDelta sleep_time() const { return sleep_time_; } // Return the the wait time to check the responsiveness of the thread. base::TimeDelta unresponsive_time() const { return unresponsive_time_; } // Returns true if we are montioring the thread. bool active() const { return active_; } // Returns |ping_time_| (used by unit tests). base::TimeTicks ping_time() const { return ping_time_; } // Returns |ping_sequence_number_| (used by unit tests). uint64 ping_sequence_number() const { return ping_sequence_number_; } protected: // Construct a ThreadWatcher for the given |thread_id|. |sleep_time| is the // wait time between ping messages. |unresponsive_time| is the wait time after // ping message is sent, to check if we have received pong message or not. explicit ThreadWatcher(const WatchingParams& params); virtual ~ThreadWatcher(); // This method activates the thread watching which starts ping/pong messaging. virtual void ActivateThreadWatching(); // This method de-activates the thread watching and revokes all tasks. virtual void DeActivateThreadWatching(); // This will ensure that the watching is actively taking place, and awaken // (i.e., post a PostPingMessage()) if the watcher has stopped pinging due to // lack of user activity. It will also reset |ping_count_| to // |unresponsive_threshold_|. virtual void WakeUp(); // This method records when ping message was sent and it will Post a task // (OnPingMessage()) to the watched thread that does nothing but respond with // OnPongMessage(). It also posts a task (OnCheckResponsiveness()) to check // responsiveness of monitored thread that would be called after waiting // |unresponsive_time_|. // This method is accessible on WatchDogThread. virtual void PostPingMessage(); // This method handles a Pong Message from watched thread. 
It will track the // response time (pong time minus ping time) via histograms. It posts a // PostPingMessage() task that would be called after waiting |sleep_time_|. It // increments |ping_sequence_number_| by 1. // This method is accessible on WatchDogThread. virtual void OnPongMessage(uint64 ping_sequence_number); // This method will determine if the watched thread is responsive or not. If // the latest |ping_sequence_number_| is not same as the // |ping_sequence_number| that is passed in, then we can assume that watched // thread has responded with a pong message. // This method is accessible on WatchDogThread. virtual void OnCheckResponsiveness(uint64 ping_sequence_number); // Set by OnCheckResponsiveness when it determines if the watched thread is // responsive or not. bool responsive_; private: friend class ThreadWatcherList; friend class CustomThreadWatcher; // Allow tests to access our innards for testing purposes. FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, Registration); FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, ThreadResponding); FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, ThreadNotResponding); FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, MultipleThreadsResponding); FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, MultipleThreadsNotResponding); // Post constructor initialization. void Initialize(); // Watched thread does nothing except post callback_task to the WATCHDOG // Thread. This method is called on watched thread. static void OnPingMessage(const content::BrowserThread::ID& thread_id, const base::Closure& callback_task); // This method resets |unresponsive_count_| to zero because watched thread is // responding to the ping message with a pong message. void ResetHangCounters(); // This method records watched thread is not responding to the ping message. // It increments |unresponsive_count_| by 1. 
void GotNoResponse(); // This method returns true if the watched thread has not responded with a // pong message for |unresponsive_threshold_| number of ping messages. bool IsVeryUnresponsive(); // The |thread_id_| of the thread being watched. Only one instance can exist // for the given |thread_id_| of the thread being watched. const content::BrowserThread::ID thread_id_; // The name of the thread being watched. const std::string thread_name_; // Used to post messages to watched thread. scoped_refptr<base::MessageLoopProxy> watched_loop_; // It is the sleep time between the receipt of a pong message back, and the // sending of another ping message. const base::TimeDelta sleep_time_; // It is the duration from sending a ping message, until we check status to be // sure a pong message has been returned. const base::TimeDelta unresponsive_time_; // This is the last time when ping message was sent. base::TimeTicks ping_time_; // This is the last time when we got pong message. base::TimeTicks pong_time_; // This is the sequence number of the next ping for which there is no pong. If // the instance is sleeping, then it will be the sequence number for the next // ping. uint64 ping_sequence_number_; // This is set to true if thread watcher is watching. bool active_; // The counter tracks least number of ping messages that will be sent to // watched thread before the ping-pong mechanism will go into an extended // sleep. If this value is zero, then the mechanism is in an extended sleep, // and awaiting some observed user action before continuing. int ping_count_; // Histogram that keeps track of response times for the watched thread. base::HistogramBase* response_time_histogram_; // Histogram that keeps track of unresponsive time since the last pong message // when we got no response (GotNoResponse()) from the watched thread. 
base::HistogramBase* unresponsive_time_histogram_; // Histogram that keeps track of how many threads are responding when we got // no response (GotNoResponse()) from the watched thread. base::HistogramBase* responsive_count_histogram_; // Histogram that keeps track of how many threads are not responding when we // got no response (GotNoResponse()) from the watched thread. Count includes // the thread that got no response. base::HistogramBase* unresponsive_count_histogram_; // This counter tracks the unresponsiveness of watched thread. If this value // is zero then watched thread has responded with a pong message. This is // incremented by 1 when we got no response (GotNoResponse()) from the watched // thread. uint32 unresponsive_count_; // This is set to true when we would have crashed the browser because the // watched thread hasn't responded at least |unresponsive_threshold_| times. // It is reset to false when watched thread responds with a pong message. bool hung_processing_complete_; // This is used to determine if the watched thread is responsive or not. If // watched thread's |unresponsive_count_| is greater than or equal to // |unresponsive_threshold_| then we would consider it as unresponsive. uint32 unresponsive_threshold_; // This is set to true if we want to crash the browser when the watched thread // has become sufficiently unresponsive, while other threads are sufficiently // responsive. bool crash_on_hang_; // This specifies the number of browser threads that are to be responsive when // we want to crash the browser because watched thread has become sufficiently // unresponsive. uint32 live_threads_threshold_; // We use this factory to create callback tasks for ThreadWatcher object. We // use this during ping-pong messaging between WatchDog thread and watched // thread. base::WeakPtrFactory<ThreadWatcher> weak_ptr_factory_; DISALLOW_COPY_AND_ASSIGN(ThreadWatcher); }; // Class with a list of all active thread watchers. 
// A thread watcher is active if it has been registered, which includes
// determining the histogram name. This class provides utility functions to
// start and stop watching all browser threads. Only one instance of this class
// exists.
class ThreadWatcherList {
 public:
  // A map from BrowserThread to the actual instances.
  typedef std::map<content::BrowserThread::ID, ThreadWatcher*> RegistrationList;

  // A map from thread names (UI, IO, etc) to |CrashDataThresholds|.
  // |live_threads_threshold| specifies the maximum number of browser threads
  // that have to be responsive when we want to crash the browser because of
  // hung watched thread. This threshold allows us to either look for a system
  // deadlock, or look for a solo hung thread. A small live_threads_threshold
  // looks for a broad deadlock (few browser threads left running), and a large
  // threshold looks for a single hung thread (this is only appropriate for a
  // thread that *should* never have much jank, such as the IO).
  //
  // |unresponsive_threshold| specifies the number of unanswered ping messages
  // after which watched (UI, IO, etc) thread is considered as not responsive.
  // We translate "time" (given in seconds) into a number of pings. As a result,
  // we only declare a thread unresponsive when a lot of "time" has passed (many
  // pings), and yet our pinging thread has continued to process messages (so we
  // know the entire PC is not hung). Set this number higher to crash less
  // often, and lower to crash more often.
  //
  // The map lists all threads (by name) that can induce a crash by hanging. It
  // is populated from the command line, or given a default list. See
  // InitializeAndStartWatching() for the separate list of all threads that are
  // watched, as they provide the system context of how hung *other* threads
  // are.
  //
  // ThreadWatcher monitors five browser threads (i.e., UI, IO, DB, FILE,
  // and CACHE). Out of the 5 threads, any subset may be watched, to potentially
  // cause a crash. The following example's command line causes exactly 3
  // threads to be watched.
  // NOTE(review): Examples 2 and 3 below speak of 6 watched threads while this
  // paragraph says five; confirm the actual count against
  // InitializeAndStartWatching().
  //
  // The example command line argument consists of "UI:3:18,IO:3:18,FILE:5:90".
  // In that string, the first parameter specifies the thread_id: UI, IO or
  // FILE. The second parameter specifies |live_threads_threshold|. For UI and
  // IO threads, we would crash if the number of threads responding is less than
  // or equal to 3. The third parameter specifies the unresponsive threshold
  // seconds. This number is used to calculate |unresponsive_threshold|. In this
  // example for UI and IO threads, we would crash if those threads don't
  // respond for 18 seconds (or 9 unanswered ping messages) and for FILE thread,
  // crash_seconds is set to 90 seconds (or 45 unanswered ping messages).
  //
  // The following examples explain how the data in |CrashDataThresholds|
  // controls the crashes.
  //
  // Example 1: If the |live_threads_threshold| value for "IO" was 3 and
  // unresponsive threshold seconds is 18 (or |unresponsive_threshold| is 9),
  // then we would crash if the IO thread was hung (9 unanswered ping messages)
  // and if at least one thread is responding and total responding threads is
  // less than or equal to 3 (this thread, plus at least one other thread is
  // unresponsive). We would not crash if none of the threads are responding, as
  // we'd assume such large hang counts mean that the system is generally
  // unresponsive.
  // Example 2: If the |live_threads_threshold| value for "UI" was any number
  // higher than 6 and unresponsive threshold seconds is 18 (or
  // |unresponsive_threshold| is 9), then we would always crash if the UI thread
  // was hung (9 unanswered ping messages), no matter what the other threads are
  // doing.
  // Example 3: If the |live_threads_threshold| value of "FILE" was 5 and
  // unresponsive threshold seconds is 90 (or |unresponsive_threshold| is 45),
  // then we would only crash if the FILE thread was the ONLY hung thread
  // (because we watch 6 threads). If there was another unresponsive thread, we
  // would not consider this a problem worth crashing for. FILE thread would be
  // considered as hung if it didn't respond for 45 ping messages.
  struct CrashDataThresholds {
    CrashDataThresholds(uint32 live_threads_threshold,
                        uint32 unresponsive_threshold);
    CrashDataThresholds();

    uint32 live_threads_threshold;
    uint32 unresponsive_threshold;
  };
  typedef std::map<std::string, CrashDataThresholds> CrashOnHangThreadMap;

  // This method posts a task on WatchDogThread to start watching all browser
  // threads.
  // This method is accessible on UI thread.
  static void StartWatchingAll(const CommandLine& command_line);

  // This method posts a task on WatchDogThread to RevokeAll tasks and to
  // deactivate thread watching of other threads and tell NotificationService to
  // stop calling Observe.
  // This method is accessible on UI thread.
  static void StopWatchingAll();

  // Register() stores a pointer to the given ThreadWatcher in a global map.
  static void Register(ThreadWatcher* watcher);

  // This method returns true if the ThreadWatcher object is registered.
  static bool IsRegistered(const content::BrowserThread::ID thread_id);

  // This method reports the number of responsive and unresponsive watched
  // threads through the |responding_thread_count| and
  // |unresponding_thread_count| out-parameters.
  static void GetStatusOfThreads(uint32* responding_thread_count,
                                 uint32* unresponding_thread_count);

  // This will ensure that the watching is actively taking place, and awaken
  // all thread watchers that are registered.
  static void WakeUpAll();

 private:
  // Allow tests to access our innards for testing purposes.
  friend class CustomThreadWatcher;
  friend class ThreadWatcherTest;
  FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, ThreadNamesOnlyArgs);
  FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, ThreadNamesAndLiveThresholdArgs);
  FRIEND_TEST_ALL_PREFIXES(ThreadWatcherTest, CrashOnHangThreadsAllArgs);

  // This singleton holds the global list of registered ThreadWatchers.
  ThreadWatcherList();

  // Destructor deletes all registered ThreadWatcher instances.
  virtual ~ThreadWatcherList();

  // Parses the command line to get |crash_on_hang_threads| map from
  // switches::kCrashOnHangThreads. |crash_on_hang_threads| is a map of
  // |crash_on_hang| thread's names to |CrashDataThresholds|.
  // NOTE(review): |unresponsive_threshold| is a non-const pointer, so it is
  // presumably also filled in by this call; confirm against the definition.
  static void ParseCommandLine(
      const CommandLine& command_line,
      uint32* unresponsive_threshold,
      CrashOnHangThreadMap* crash_on_hang_threads);

  // Parses the argument |crash_on_hang_thread_names| and creates
  // |crash_on_hang_threads| map of |crash_on_hang| thread's names to
  // |CrashDataThresholds|. If |crash_on_hang_thread_names| doesn't specify
  // |live_threads_threshold|, then it uses |default_live_threads_threshold| as
  // the value. If |crash_on_hang_thread_names| doesn't specify |crash_seconds|,
  // then it uses |default_crash_seconds| as the value.
  static void ParseCommandLineCrashOnHangThreads(
      const std::string& crash_on_hang_thread_names,
      uint32 default_live_threads_threshold,
      uint32 default_crash_seconds,
      CrashOnHangThreadMap* crash_on_hang_threads);

  // This constructs the |ThreadWatcherList| singleton and starts watching
  // browser threads by calling StartWatching() on each browser thread that is
  // watched. It disarms StartupTimeBomb.
  static void InitializeAndStartWatching(
      uint32 unresponsive_threshold,
      const CrashOnHangThreadMap& crash_on_hang_threads);

  // This method calls ThreadWatcher::StartWatching() to perform health check on
  // the given |thread_id|.
  static void StartWatching(
      const content::BrowserThread::ID& thread_id,
      const std::string& thread_name,
      const base::TimeDelta& sleep_time,
      const base::TimeDelta& unresponsive_time,
      uint32 unresponsive_threshold,
      const CrashOnHangThreadMap& crash_on_hang_threads);

  // Delete all thread watcher objects and remove them from global map. It also
  // deletes |g_thread_watcher_list_|.
  static void DeleteAll();

  // The Find() method can be used to test to see if a given ThreadWatcher was
  // already registered, or to retrieve a pointer to it from the global map.
  static ThreadWatcher* Find(const content::BrowserThread::ID& thread_id);

  // The singleton of this class and is used to keep track of information about
  // threads that are being watched.
  static ThreadWatcherList* g_thread_watcher_list_;

  // This is the wait time between ping messages.
  static const int kSleepSeconds;

  // This is the wait time after ping message is sent, to check if we have
  // received pong message or not.
  static const int kUnresponsiveSeconds;

  // Default values for |unresponsive_threshold|.
  static const int kUnresponsiveCount;

  // Default values for |live_threads_threshold|.
  static const int kLiveThreadsThreshold;

  // Map of all registered watched threads, from thread_id to ThreadWatcher.
  RegistrationList registered_;

  DISALLOW_COPY_AND_ASSIGN(ThreadWatcherList);
};

// This class ensures that the thread watching is actively taking place. Only
// one instance of this class exists.
class ThreadWatcherObserver : public content::NotificationObserver {
 public:
  // Registers |g_thread_watcher_observer_| as the Notifications observer.
  // |wakeup_interval| specifies how often to wake up thread watchers. This
  // method is accessible on UI thread.
  static void SetupNotifications(const base::TimeDelta& wakeup_interval);

  // Removes all ints from |registrar_| and deletes
  // |g_thread_watcher_observer_|. This method is accessible on UI thread.
  static void RemoveNotifications();

 private:
  // Constructor of |g_thread_watcher_observer_| singleton.
  explicit ThreadWatcherObserver(const base::TimeDelta& wakeup_interval);

  // Destructor of |g_thread_watcher_observer_| singleton.
  virtual ~ThreadWatcherObserver();

  // This ensures all thread watchers are active because there is some user
  // activity. It will wake up all thread watchers every |wakeup_interval_|
  // seconds. This is the implementation of content::NotificationObserver. When
  // a matching notification is posted to the notification service, this method
  // is called.
  virtual void Observe(int type,
                       const content::NotificationSource& source,
                       const content::NotificationDetails& details) OVERRIDE;

  // The singleton of this class.
  static ThreadWatcherObserver* g_thread_watcher_observer_;

  // The registrar that holds ints to be observed.
  content::NotificationRegistrar registrar_;

  // This is the last time when woke all thread watchers up.
  base::TimeTicks last_wakeup_time_;

  // It is the time interval between wake up calls to thread watchers.
  const base::TimeDelta wakeup_interval_;

  DISALLOW_COPY_AND_ASSIGN(ThreadWatcherObserver);
};

// Class for WatchDogThread and in its Init method, we start watching UI, IO,
// DB, FILE, CACHE threads.
class WatchDogThread : public base::Thread {
 public:
  // Constructor.
  WatchDogThread();

  // Destroys the thread and stops the thread.
  virtual ~WatchDogThread();

  // Callable on any thread. Returns whether you're currently on a
  // WatchDogThread.
  static bool CurrentlyOnWatchDogThread();

  // These are the same methods in message_loop.h, but are guaranteed to either
  // get posted to the MessageLoop if it's still alive, or be deleted otherwise.
  // They return true iff the watchdog thread existed and the task was posted.
  // Note that even if the task is posted, there's no guarantee that it will
  // run, since the target thread may already have a Quit message in its queue.
  static bool PostTask(const tracked_objects::Location& from_here,
                       const base::Closure& task);
  static bool PostDelayedTask(const tracked_objects::Location& from_here,
                              const base::Closure& task,
                              base::TimeDelta delay);

 protected:
  // Overrides of the base::Thread hooks (marked OVERRIDE).
  virtual void Init() OVERRIDE;
  virtual void CleanUp() OVERRIDE;

 private:
  // NOTE(review): shares the (from_here, task, delay) signature of
  // PostDelayedTask(); presumably the common implementation behind PostTask()
  // and PostDelayedTask() -- confirm against the definition.
  static bool PostTaskHelper(
      const tracked_objects::Location& from_here,
      const base::Closure& task,
      base::TimeDelta delay);

  DISALLOW_COPY_AND_ASSIGN(WatchDogThread);
};

// This is a wrapper class for getting the crash dumps of the hangs during
// startup.
class StartupTimeBomb {
 public:
  // This singleton is instantiated when the browser process is launched.
  StartupTimeBomb();

  // Destructor disarms |startup_watchdog_| (if it is armed) so that alarm
  // doesn't go off.
  ~StartupTimeBomb();

  // Constructs |startup_watchdog_| which spawns a thread and starts timer.
  // |duration| specifies how long |startup_watchdog_| will wait before it
  // calls alarm.
  void Arm(const base::TimeDelta& duration);

  // Disarms |startup_watchdog_| thread and then deletes it which stops the
  // Watchdog thread.
  void Disarm();

  // Disarms |g_startup_timebomb_|.
  static void DisarmStartupTimeBomb();

 private:
  // Deletes |startup_watchdog_| if it is joinable. If |startup_watchdog_| is
  // not joinable, then it will post a delayed task to try again.
  void DeleteStartupWatchdog();

  // The singleton of this class.
  static StartupTimeBomb* g_startup_timebomb_;

  // Watches for hangs during startup until it is disarmed.
  base::Watchdog* startup_watchdog_;

  // The |thread_id_| on which this object is constructed.
  const base::PlatformThreadId thread_id_;

  DISALLOW_COPY_AND_ASSIGN(StartupTimeBomb);
};

// This is a wrapper class for detecting hangs during shutdown.
class ShutdownWatcherHelper {
 public:
  // Create an empty holder for |shutdown_watchdog_|.
  ShutdownWatcherHelper();

  // Destructor disarms |shutdown_watchdog_| so that alarm doesn't go off.
  ~ShutdownWatcherHelper();

  // Constructs ShutdownWatchDogThread which spawns a thread and starts timer.
  // |duration| specifies how long it will wait before it calls alarm.
  void Arm(const base::TimeDelta& duration);

 private:
  // |shutdown_watchdog_| watches for hangs during shutdown.
  base::Watchdog* shutdown_watchdog_;

  // The |thread_id_| on which this object is constructed.
  const base::PlatformThreadId thread_id_;

  DISALLOW_COPY_AND_ASSIGN(ShutdownWatcherHelper);
};

#endif  // CHROME_BROWSER_METRICS_THREAD_WATCHER_H_