/* * Copyright (C) 2008 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.android.server; import android.app.IActivityController; import android.os.Binder; import android.os.Build; import android.os.RemoteException; import android.system.ErrnoException; import android.system.OsConstants; import android.system.StructRlimit; import com.android.internal.os.ZygoteConnectionConstants; import com.android.server.am.ActivityManagerService; import android.content.BroadcastReceiver; import android.content.ContentResolver; import android.content.Context; import android.content.Intent; import android.content.IntentFilter; import android.hidl.manager.V1_0.IServiceManager; import android.os.Debug; import android.os.Handler; import android.os.IPowerManager; import android.os.Looper; import android.os.Process; import android.os.ServiceManager; import android.os.SystemClock; import android.os.SystemProperties; import android.util.EventLog; import android.util.Log; import android.util.Slog; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; /** This class calls its monitor every minute. Killing this process if they don't return **/ public class Watchdog extends Thread { static final String TAG = "Watchdog"; // Set this to true to use debug default values. static final boolean DB = false; // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with // timeout in com.android.internal.os.ZygoteConnection, or wrapped applications // can trigger the watchdog. // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped // applications may not work with a debug build. CTS will fail. static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000; static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; // These are temporally ordered: larger values as lateness increases static final int COMPLETED = 0; static final int WAITING = 1; static final int WAITED_HALF = 2; static final int OVERDUE = 3; // Which native processes to dump into dropbox's stack traces public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { "/system/bin/audioserver", "/system/bin/cameraserver", "/system/bin/drmserver", "/system/bin/mediadrmserver", "/system/bin/mediaserver", "/system/bin/sdcard", "/system/bin/surfaceflinger", "media.extractor", // system/bin/mediaextractor "media.metrics", // system/bin/mediametrics "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service "com.android.bluetooth", // Bluetooth service "statsd", // Stats daemon }; public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList( "android.hardware.audio@2.0::IDevicesFactory", "android.hardware.audio@4.0::IDevicesFactory", "android.hardware.bluetooth@1.0::IBluetoothHci", "android.hardware.camera.provider@2.4::ICameraProvider", "android.hardware.graphics.composer@2.1::IComposer", "android.hardware.media.omx@1.0::IOmx", "android.hardware.media.omx@1.0::IOmxStore", "android.hardware.sensors@1.0::ISensors", "android.hardware.vr@1.0::IVr" ); static Watchdog sWatchdog; /* This handler will be used to post message back onto the main thread */ final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>(); final HandlerChecker mMonitorChecker; ContentResolver mResolver; ActivityManagerService mActivity; int mPhonePid; IActivityController mController; boolean mAllowRestart = true; final OpenFdMonitor mOpenFdMonitor; /** * Used for checking status of handle threads and scheduling monitor callbacks. */ public final class HandlerChecker implements Runnable { private final Handler mHandler; private final String mName; private final long mWaitMax; private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); private boolean mCompleted; private Monitor mCurrentMonitor; private long mStartTime; HandlerChecker(Handler handler, String name, long waitMaxMillis) { mHandler = handler; mName = name; mWaitMax = waitMaxMillis; mCompleted = true; } public void addMonitor(Monitor monitor) { mMonitors.add(monitor); } public void scheduleCheckLocked() { if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) { // If the target looper has recently been polling, then // there is no reason to enqueue our checker on it since that // is as good as it not being deadlocked. This avoid having // to do a context switch to check the thread. Note that we // only do this if mCheckReboot is false and we have no // monitors, since those would need to be executed at this point. mCompleted = true; return; } if (!mCompleted) { // we already have a check in flight, so no need return; } mCompleted = false; mCurrentMonitor = null; mStartTime = SystemClock.uptimeMillis(); mHandler.postAtFrontOfQueue(this); } public boolean isOverdueLocked() { return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); } public int getCompletionStateLocked() { if (mCompleted) { return COMPLETED; } else { long latency = SystemClock.uptimeMillis() - mStartTime; if (latency < mWaitMax/2) { return WAITING; } else if (latency < mWaitMax) { return WAITED_HALF; } } return OVERDUE; } public Thread getThread() { return mHandler.getLooper().getThread(); } public String getName() { return mName; } public String describeBlockedStateLocked() { if (mCurrentMonitor == null) { return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; } else { return "Blocked in monitor " + mCurrentMonitor.getClass().getName() + " on " + mName + " (" + getThread().getName() + ")"; } } @Override public void run() { final int size = mMonitors.size(); for (int i = 0 ; i < size ; i++) { synchronized (Watchdog.this) { mCurrentMonitor = mMonitors.get(i); } mCurrentMonitor.monitor(); } synchronized (Watchdog.this) { mCompleted = true; mCurrentMonitor = null; } } } final class RebootRequestReceiver extends BroadcastReceiver { @Override public void onReceive(Context c, Intent intent) { if (intent.getIntExtra("nowait", 0) != 0) { rebootSystem("Received ACTION_REBOOT broadcast"); return; } Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); } } /** Monitor for checking the availability of binder threads. The monitor will block until * there is a binder thread available to process in coming IPCs to make sure other processes * can still communicate with the service. */ private static final class BinderThreadMonitor implements Watchdog.Monitor { @Override public void monitor() { Binder.blockUntilThreadAvailable(); } } public interface Monitor { void monitor(); } public static Watchdog getInstance() { if (sWatchdog == null) { sWatchdog = new Watchdog(); } return sWatchdog; } private Watchdog() { super("watchdog"); // Initialize handler checkers for each common thread we want to check. Note // that we are not currently checking the background thread, since it can // potentially hold longer running operations with no guarantees about the timeliness // of operations there. // The shared foreground thread is the main checker. It is where we // will also dispatch monitor checks and do other work. mMonitorChecker = new HandlerChecker(FgThread.getHandler(), "foreground thread", DEFAULT_TIMEOUT); mHandlerCheckers.add(mMonitorChecker); // Add checker for main thread. We only do a quick check since there // can be UI running on the thread. mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), "main thread", DEFAULT_TIMEOUT)); // Add checker for shared UI thread. mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), "ui thread", DEFAULT_TIMEOUT)); // And also check IO thread. mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), "i/o thread", DEFAULT_TIMEOUT)); // And the display thread. mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(), "display thread", DEFAULT_TIMEOUT)); // Initialize monitor for Binder threads. addMonitor(new BinderThreadMonitor()); mOpenFdMonitor = OpenFdMonitor.create(); // See the notes on DEFAULT_TIMEOUT. assert DB || DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS; } public void init(Context context, ActivityManagerService activity) { mResolver = context.getContentResolver(); mActivity = activity; context.registerReceiver(new RebootRequestReceiver(), new IntentFilter(Intent.ACTION_REBOOT), android.Manifest.permission.REBOOT, null); } public void processStarted(String name, int pid) { synchronized (this) { if ("com.android.phone".equals(name)) { mPhonePid = pid; } } } public void setActivityController(IActivityController controller) { synchronized (this) { mController = controller; } } public void setAllowRestart(boolean allowRestart) { synchronized (this) { mAllowRestart = allowRestart; } } public void addMonitor(Monitor monitor) { synchronized (this) { if (isAlive()) { throw new RuntimeException("Monitors can't be added once the Watchdog is running"); } mMonitorChecker.addMonitor(monitor); } } public void addThread(Handler thread) { addThread(thread, DEFAULT_TIMEOUT); } public void addThread(Handler thread, long timeoutMillis) { synchronized (this) { if (isAlive()) { throw new RuntimeException("Threads can't be added once the Watchdog is running"); } final String name = thread.getLooper().getThread().getName(); mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); } } /** * Perform a full reboot of the system. */ void rebootSystem(String reason) { Slog.i(TAG, "Rebooting system because: " + reason); IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); try { pms.reboot(false, reason, false); } catch (RemoteException ex) { } } private int evaluateCheckerCompletionLocked() { int state = COMPLETED; for (int i=0; i<mHandlerCheckers.size(); i++) { HandlerChecker hc = mHandlerCheckers.get(i); state = Math.max(state, hc.getCompletionStateLocked()); } return state; } private ArrayList<HandlerChecker> getBlockedCheckersLocked() { ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); for (int i=0; i<mHandlerCheckers.size(); i++) { HandlerChecker hc = mHandlerCheckers.get(i); if (hc.isOverdueLocked()) { checkers.add(hc); } } return checkers; } private String describeCheckersLocked(List<HandlerChecker> checkers) { StringBuilder builder = new StringBuilder(128); for (int i=0; i<checkers.size(); i++) { if (builder.length() > 0) { builder.append(", "); } builder.append(checkers.get(i).describeBlockedStateLocked()); } return builder.toString(); } private ArrayList<Integer> getInterestingHalPids() { try { IServiceManager serviceManager = IServiceManager.getService(); ArrayList<IServiceManager.InstanceDebugInfo> dump = serviceManager.debugDump(); HashSet<Integer> pids = new HashSet<>(); for (IServiceManager.InstanceDebugInfo info : dump) { if (info.pid == IServiceManager.PidConstant.NO_PID) { continue; } if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) { continue; } pids.add(info.pid); } return new ArrayList<Integer>(pids); } catch (RemoteException e) { return new ArrayList<Integer>(); } } private ArrayList<Integer> getInterestingNativePids() { ArrayList<Integer> pids = getInterestingHalPids(); int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST); if (nativePids != null) { pids.ensureCapacity(pids.size() + nativePids.length); for (int i : nativePids) { pids.add(i); } } return pids; } @Override public void run() { boolean waitedHalf = false; while (true) { final List<HandlerChecker> blockedCheckers; final String subject; final boolean allowRestart; int debuggerWasConnected = 0; synchronized (this) { long timeout = CHECK_INTERVAL; // Make sure we (re)spin the checkers that have become idle within // this wait-and-check interval for (int i=0; i<mHandlerCheckers.size(); i++) { HandlerChecker hc = mHandlerCheckers.get(i); hc.scheduleCheckLocked(); } if (debuggerWasConnected > 0) { debuggerWasConnected--; } // NOTE: We use uptimeMillis() here because we do not want to increment the time we // wait while asleep. If the device is asleep then the thing that we are waiting // to timeout on is asleep as well and won't have a chance to run, causing a false // positive on when to kill things. long start = SystemClock.uptimeMillis(); while (timeout > 0) { if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } try { wait(timeout); } catch (InterruptedException e) { Log.wtf(TAG, e); } if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); } boolean fdLimitTriggered = false; if (mOpenFdMonitor != null) { fdLimitTriggered = mOpenFdMonitor.monitor(); } if (!fdLimitTriggered) { final int waitState = evaluateCheckerCompletionLocked(); if (waitState == COMPLETED) { // The monitors have returned; reset waitedHalf = false; continue; } else if (waitState == WAITING) { // still waiting but within their configured intervals; back off and recheck continue; } else if (waitState == WAITED_HALF) { if (!waitedHalf) { // We've waited half the deadlock-detection interval. Pull a stack // trace and wait another half. ArrayList<Integer> pids = new ArrayList<Integer>(); pids.add(Process.myPid()); ActivityManagerService.dumpStackTraces(true, pids, null, null, getInterestingNativePids()); waitedHalf = true; } continue; } // something is overdue! blockedCheckers = getBlockedCheckersLocked(); subject = describeCheckersLocked(blockedCheckers); } else { blockedCheckers = Collections.emptyList(); subject = "Open FD high water mark reached"; } allowRestart = mAllowRestart; } // If we got here, that means that the system is most likely hung. // First collect stack traces from all threads of the system process. // Then kill this process so that the system will restart. EventLog.writeEvent(EventLogTags.WATCHDOG, subject); ArrayList<Integer> pids = new ArrayList<>(); pids.add(Process.myPid()); if (mPhonePid > 0) pids.add(mPhonePid); // Pass !waitedHalf so that just in case we somehow wind up here without having // dumped the halfway stacks, we properly re-initialize the trace file. final File stack = ActivityManagerService.dumpStackTraces( !waitedHalf, pids, null, null, getInterestingNativePids()); // Give some extra time to make sure the stack traces get written. // The system's been hanging for a minute, another second or two won't hurt much. SystemClock.sleep(2000); // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log doSysRq('w'); doSysRq('l'); // Try to add the error to the dropbox, but assuming that the ActivityManager // itself may be deadlocked. (which has happened, causing this statement to // deadlock and the watchdog as a whole to be ineffective) Thread dropboxThread = new Thread("watchdogWriteToDropbox") { public void run() { mActivity.addErrorToDropBox( "watchdog", null, "system_server", null, null, subject, null, stack, null); } }; dropboxThread.start(); try { dropboxThread.join(2000); // wait up to 2 seconds for it to return. } catch (InterruptedException ignored) {} IActivityController controller; synchronized (this) { controller = mController; } if (controller != null) { Slog.i(TAG, "Reporting stuck state to activity controller"); try { Binder.setDumpDisabled("Service dumps disabled due to hung system process."); // 1 = keep waiting, -1 = kill system int res = controller.systemNotResponding(subject); if (res >= 0) { Slog.i(TAG, "Activity controller requested to coninue to wait"); waitedHalf = false; continue; } } catch (RemoteException e) { } } // Only kill the process if the debugger is not attached. if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } if (debuggerWasConnected >= 2) { Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); } else if (debuggerWasConnected > 0) { Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); } else if (!allowRestart) { Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); } else { Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); WatchdogDiagnostics.diagnoseCheckers(blockedCheckers); Slog.w(TAG, "*** GOODBYE!"); Process.killProcess(Process.myPid()); System.exit(10); } waitedHalf = false; } } private void doSysRq(char c) { try { FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); sysrq_trigger.write(c); sysrq_trigger.close(); } catch (IOException e) { Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); } } public static final class OpenFdMonitor { /** * Number of FDs below the soft limit that we trigger a runtime restart at. This was * chosen arbitrarily, but will need to be at least 6 in order to have a sufficient number * of FDs in reserve to complete a dump. */ private static final int FD_HIGH_WATER_MARK = 12; private final File mDumpDir; private final File mFdHighWaterMark; public static OpenFdMonitor create() { // Only run the FD monitor on debuggable builds (such as userdebug and eng builds). if (!Build.IS_DEBUGGABLE) { return null; } // Don't run the FD monitor on builds that have a global ANR trace file. We're using // the ANR trace directory as a quick hack in order to get these traces in bugreports // and we wouldn't want to overwrite something important. final String dumpDirStr = SystemProperties.get("dalvik.vm.stack-trace-dir", ""); if (dumpDirStr.isEmpty()) { return null; } final StructRlimit rlimit; try { rlimit = android.system.Os.getrlimit(OsConstants.RLIMIT_NOFILE); } catch (ErrnoException errno) { Slog.w(TAG, "Error thrown from getrlimit(RLIMIT_NOFILE)", errno); return null; } // The assumption we're making here is that FD numbers are allocated (more or less) // sequentially, which is currently (and historically) true since open is currently // specified to always return the lowest-numbered non-open file descriptor for the // current process. // // We do this to avoid having to enumerate the contents of /proc/self/fd in order to // count the number of descriptors open in the process. final File fdThreshold = new File("/proc/self/fd/" + (rlimit.rlim_cur - FD_HIGH_WATER_MARK)); return new OpenFdMonitor(new File(dumpDirStr), fdThreshold); } OpenFdMonitor(File dumpDir, File fdThreshold) { mDumpDir = dumpDir; mFdHighWaterMark = fdThreshold; } private void dumpOpenDescriptors() { try { File dumpFile = File.createTempFile("anr_fd_", "", mDumpDir); java.lang.Process proc = new ProcessBuilder() .command("/system/bin/lsof", "-p", String.valueOf(Process.myPid())) .redirectErrorStream(true) .redirectOutput(dumpFile) .start(); int returnCode = proc.waitFor(); if (returnCode != 0) { Slog.w(TAG, "Unable to dump open descriptors, lsof return code: " + returnCode); dumpFile.delete(); } } catch (IOException | InterruptedException ex) { Slog.w(TAG, "Unable to dump open descriptors: " + ex); } } /** * @return {@code true} if the high water mark was breached and a dump was written, * {@code false} otherwise. */ public boolean monitor() { if (mFdHighWaterMark.exists()) { dumpOpenDescriptors(); return true; } return false; } } }