Home | History | Annotate | Download | only in server
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.server;
     18 
     19 import android.app.IActivityController;
     20 import android.os.Binder;
     21 import android.os.RemoteException;
     22 import com.android.server.am.ActivityManagerService;
     23 
     24 import android.content.BroadcastReceiver;
     25 import android.content.ContentResolver;
     26 import android.content.Context;
     27 import android.content.Intent;
     28 import android.content.IntentFilter;
     29 import android.hidl.manager.V1_0.IServiceManager;
     30 import android.os.Debug;
     31 import android.os.Handler;
     32 import android.os.IPowerManager;
     33 import android.os.Looper;
     34 import android.os.Process;
     35 import android.os.ServiceManager;
     36 import android.os.SystemClock;
     37 import android.os.SystemProperties;
     38 import android.util.EventLog;
     39 import android.util.Log;
     40 import android.util.Slog;
     41 
     42 import java.io.File;
     43 import java.io.FileWriter;
     44 import java.io.IOException;
     45 import java.util.ArrayList;
     46 import java.util.Arrays;
     47 import java.util.HashSet;
     48 import java.util.List;
     49 
/**
 * This class periodically runs its monitors and checks its handler threads, killing this
 * process if they don't return in time.
 */
     51 public class Watchdog extends Thread {
    // Tag used for all log output of this class.
    static final String TAG = "Watchdog";

    // Set this to true to use debug default values.
    static final boolean DB = false;

    // Set this to true to have the watchdog record kernel thread stacks when it fires
    static final boolean RECORD_KERNEL_THREADS = true;

    // Default maximum time, in milliseconds, that a checked handler may stay blocked:
    // 10 seconds when DB is set, 60 seconds otherwise.
    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
    // How long, in milliseconds, the watchdog thread waits between two evaluations of
    // its checkers (half of the default timeout).
    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

    // Completion states reported by HandlerChecker.getCompletionStateLocked().
    // These are temporally ordered: larger values as lateness increases
    static final int COMPLETED = 0;
    static final int WAITING = 1;
    static final int WAITED_HALF = 2;
    static final int OVERDUE = 3;

    // Which native processes to dump into dropbox's stack traces
    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/audioserver",
        "/system/bin/cameraserver",
        "/system/bin/drmserver",
        "/system/bin/mediadrmserver",
        "/system/bin/mediaserver",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger",
        "media.extractor", // system/bin/mediaextractor
        "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
        "com.android.bluetooth",  // Bluetooth service
    };
     82 
     83     public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
     84         "android.hardware.audio (at) 2.0::IDevicesFactory",
     85         "android.hardware.bluetooth (at) 1.0::IBluetoothHci",
     86         "android.hardware.camera.provider (at) 2.4::ICameraProvider",
     87         "android.hardware.graphics.composer (at) 2.1::IComposer",
     88         "android.hardware.vr (at) 1.0::IVr",
     89         "android.hardware.media.omx (at) 1.0::IOmx"
     90     );
     91 
    // Singleton instance, created lazily by getInstance().
    static Watchdog sWatchdog;

    /* All handler checkers evaluated by the watchdog thread, including mMonitorChecker. */
    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
    // Checker for the foreground thread; monitors registered via addMonitor() run on it.
    final HandlerChecker mMonitorChecker;
    // Both set by init().
    ContentResolver mResolver;
    ActivityManagerService mActivity;

    // Pid of the "com.android.phone" process as reported to processStarted(); 0 until then.
    int mPhonePid;
    // Optional controller that is told about a hang before the process is killed.
    IActivityController mController;
    // When false, the watchdog logs the hang but does not kill the process.
    boolean mAllowRestart = true;
    103 
    104     /**
    105      * Used for checking status of handle threads and scheduling monitor callbacks.
    106      */
    107     public final class HandlerChecker implements Runnable {
    108         private final Handler mHandler;
    109         private final String mName;
    110         private final long mWaitMax;
    111         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
    112         private boolean mCompleted;
    113         private Monitor mCurrentMonitor;
    114         private long mStartTime;
    115 
    116         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
    117             mHandler = handler;
    118             mName = name;
    119             mWaitMax = waitMaxMillis;
    120             mCompleted = true;
    121         }
    122 
    123         public void addMonitor(Monitor monitor) {
    124             mMonitors.add(monitor);
    125         }
    126 
    127         public void scheduleCheckLocked() {
    128             if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
    129                 // If the target looper has recently been polling, then
    130                 // there is no reason to enqueue our checker on it since that
    131                 // is as good as it not being deadlocked.  This avoid having
    132                 // to do a context switch to check the thread.  Note that we
    133                 // only do this if mCheckReboot is false and we have no
    134                 // monitors, since those would need to be executed at this point.
    135                 mCompleted = true;
    136                 return;
    137             }
    138 
    139             if (!mCompleted) {
    140                 // we already have a check in flight, so no need
    141                 return;
    142             }
    143 
    144             mCompleted = false;
    145             mCurrentMonitor = null;
    146             mStartTime = SystemClock.uptimeMillis();
    147             mHandler.postAtFrontOfQueue(this);
    148         }
    149 
    150         public boolean isOverdueLocked() {
    151             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
    152         }
    153 
    154         public int getCompletionStateLocked() {
    155             if (mCompleted) {
    156                 return COMPLETED;
    157             } else {
    158                 long latency = SystemClock.uptimeMillis() - mStartTime;
    159                 if (latency < mWaitMax/2) {
    160                     return WAITING;
    161                 } else if (latency < mWaitMax) {
    162                     return WAITED_HALF;
    163                 }
    164             }
    165             return OVERDUE;
    166         }
    167 
    168         public Thread getThread() {
    169             return mHandler.getLooper().getThread();
    170         }
    171 
    172         public String getName() {
    173             return mName;
    174         }
    175 
    176         public String describeBlockedStateLocked() {
    177             if (mCurrentMonitor == null) {
    178                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
    179             } else {
    180                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
    181                         + " on " + mName + " (" + getThread().getName() + ")";
    182             }
    183         }
    184 
    185         @Override
    186         public void run() {
    187             final int size = mMonitors.size();
    188             for (int i = 0 ; i < size ; i++) {
    189                 synchronized (Watchdog.this) {
    190                     mCurrentMonitor = mMonitors.get(i);
    191                 }
    192                 mCurrentMonitor.monitor();
    193             }
    194 
    195             synchronized (Watchdog.this) {
    196                 mCompleted = true;
    197                 mCurrentMonitor = null;
    198             }
    199         }
    200     }
    201 
    202     final class RebootRequestReceiver extends BroadcastReceiver {
    203         @Override
    204         public void onReceive(Context c, Intent intent) {
    205             if (intent.getIntExtra("nowait", 0) != 0) {
    206                 rebootSystem("Received ACTION_REBOOT broadcast");
    207                 return;
    208             }
    209             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
    210         }
    211     }
    212 
    /**
     * Monitor for checking the availability of binder threads. The monitor will block until
     * there is a binder thread available to process incoming IPCs to make sure other processes
     * can still communicate with the service.
     */
    private static final class BinderThreadMonitor implements Watchdog.Monitor {
        @Override
        public void monitor() {
            // Does not return while every binder thread is busy, which the watchdog
            // then detects as a blocked monitor.
            Binder.blockUntilThreadAvailable();
        }
    }
    223 
    /**
     * Callback that the watchdog invokes on every check of the handler it is attached to.
     * An implementation that does not return within the handler's timeout is reported as
     * blocked.
     */
    public interface Monitor {
        void monitor();
    }
    227 
    228     public static Watchdog getInstance() {
    229         if (sWatchdog == null) {
    230             sWatchdog = new Watchdog();
    231         }
    232 
    233         return sWatchdog;
    234     }
    235 
    236     private Watchdog() {
    237         super("watchdog");
    238         // Initialize handler checkers for each common thread we want to check.  Note
    239         // that we are not currently checking the background thread, since it can
    240         // potentially hold longer running operations with no guarantees about the timeliness
    241         // of operations there.
    242 
    243         // The shared foreground thread is the main checker.  It is where we
    244         // will also dispatch monitor checks and do other work.
    245         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
    246                 "foreground thread", DEFAULT_TIMEOUT);
    247         mHandlerCheckers.add(mMonitorChecker);
    248         // Add checker for main thread.  We only do a quick check since there
    249         // can be UI running on the thread.
    250         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
    251                 "main thread", DEFAULT_TIMEOUT));
    252         // Add checker for shared UI thread.
    253         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
    254                 "ui thread", DEFAULT_TIMEOUT));
    255         // And also check IO thread.
    256         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
    257                 "i/o thread", DEFAULT_TIMEOUT));
    258         // And the display thread.
    259         mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
    260                 "display thread", DEFAULT_TIMEOUT));
    261 
    262         // Initialize monitor for Binder threads.
    263         addMonitor(new BinderThreadMonitor());
    264     }
    265 
    266     public void init(Context context, ActivityManagerService activity) {
    267         mResolver = context.getContentResolver();
    268         mActivity = activity;
    269 
    270         context.registerReceiver(new RebootRequestReceiver(),
    271                 new IntentFilter(Intent.ACTION_REBOOT),
    272                 android.Manifest.permission.REBOOT, null);
    273     }
    274 
    275     public void processStarted(String name, int pid) {
    276         synchronized (this) {
    277             if ("com.android.phone".equals(name)) {
    278                 mPhonePid = pid;
    279             }
    280         }
    281     }
    282 
    283     public void setActivityController(IActivityController controller) {
    284         synchronized (this) {
    285             mController = controller;
    286         }
    287     }
    288 
    289     public void setAllowRestart(boolean allowRestart) {
    290         synchronized (this) {
    291             mAllowRestart = allowRestart;
    292         }
    293     }
    294 
    295     public void addMonitor(Monitor monitor) {
    296         synchronized (this) {
    297             if (isAlive()) {
    298                 throw new RuntimeException("Monitors can't be added once the Watchdog is running");
    299             }
    300             mMonitorChecker.addMonitor(monitor);
    301         }
    302     }
    303 
    /**
     * Adds a handler thread to be checked, using DEFAULT_TIMEOUT as its timeout.
     */
    public void addThread(Handler thread) {
        addThread(thread, DEFAULT_TIMEOUT);
    }
    307 
    308     public void addThread(Handler thread, long timeoutMillis) {
    309         synchronized (this) {
    310             if (isAlive()) {
    311                 throw new RuntimeException("Threads can't be added once the Watchdog is running");
    312             }
    313             final String name = thread.getLooper().getThread().getName();
    314             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
    315         }
    316     }
    317 
    318     /**
    319      * Perform a full reboot of the system.
    320      */
    321     void rebootSystem(String reason) {
    322         Slog.i(TAG, "Rebooting system because: " + reason);
    323         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
    324         try {
    325             pms.reboot(false, reason, false);
    326         } catch (RemoteException ex) {
    327         }
    328     }
    329 
    330     private int evaluateCheckerCompletionLocked() {
    331         int state = COMPLETED;
    332         for (int i=0; i<mHandlerCheckers.size(); i++) {
    333             HandlerChecker hc = mHandlerCheckers.get(i);
    334             state = Math.max(state, hc.getCompletionStateLocked());
    335         }
    336         return state;
    337     }
    338 
    339     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
    340         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
    341         for (int i=0; i<mHandlerCheckers.size(); i++) {
    342             HandlerChecker hc = mHandlerCheckers.get(i);
    343             if (hc.isOverdueLocked()) {
    344                 checkers.add(hc);
    345             }
    346         }
    347         return checkers;
    348     }
    349 
    350     private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
    351         StringBuilder builder = new StringBuilder(128);
    352         for (int i=0; i<checkers.size(); i++) {
    353             if (builder.length() > 0) {
    354                 builder.append(", ");
    355             }
    356             builder.append(checkers.get(i).describeBlockedStateLocked());
    357         }
    358         return builder.toString();
    359     }
    360 
    361     private ArrayList<Integer> getInterestingHalPids() {
    362         try {
    363             IServiceManager serviceManager = IServiceManager.getService();
    364             ArrayList<IServiceManager.InstanceDebugInfo> dump =
    365                     serviceManager.debugDump();
    366             HashSet<Integer> pids = new HashSet<>();
    367             for (IServiceManager.InstanceDebugInfo info : dump) {
    368                 if (info.pid == IServiceManager.PidConstant.NO_PID) {
    369                     continue;
    370                 }
    371 
    372                 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
    373                     continue;
    374                 }
    375 
    376                 pids.add(info.pid);
    377             }
    378             return new ArrayList<Integer>(pids);
    379         } catch (RemoteException e) {
    380             return new ArrayList<Integer>();
    381         }
    382     }
    383 
    384     private ArrayList<Integer> getInterestingNativePids() {
    385         ArrayList<Integer> pids = getInterestingHalPids();
    386 
    387         int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
    388         if (nativePids != null) {
    389             pids.ensureCapacity(pids.size() + nativePids.length);
    390             for (int i : nativePids) {
    391                 pids.add(i);
    392             }
    393         }
    394 
    395         return pids;
    396     }
    397 
    398     @Override
    399     public void run() {
    400         boolean waitedHalf = false;
    401         while (true) {
    402             final ArrayList<HandlerChecker> blockedCheckers;
    403             final String subject;
    404             final boolean allowRestart;
    405             int debuggerWasConnected = 0;
    406             synchronized (this) {
    407                 long timeout = CHECK_INTERVAL;
    408                 // Make sure we (re)spin the checkers that have become idle within
    409                 // this wait-and-check interval
    410                 for (int i=0; i<mHandlerCheckers.size(); i++) {
    411                     HandlerChecker hc = mHandlerCheckers.get(i);
    412                     hc.scheduleCheckLocked();
    413                 }
    414 
    415                 if (debuggerWasConnected > 0) {
    416                     debuggerWasConnected--;
    417                 }
    418 
    419                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
    420                 // wait while asleep. If the device is asleep then the thing that we are waiting
    421                 // to timeout on is asleep as well and won't have a chance to run, causing a false
    422                 // positive on when to kill things.
    423                 long start = SystemClock.uptimeMillis();
    424                 while (timeout > 0) {
    425                     if (Debug.isDebuggerConnected()) {
    426                         debuggerWasConnected = 2;
    427                     }
    428                     try {
    429                         wait(timeout);
    430                     } catch (InterruptedException e) {
    431                         Log.wtf(TAG, e);
    432                     }
    433                     if (Debug.isDebuggerConnected()) {
    434                         debuggerWasConnected = 2;
    435                     }
    436                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
    437                 }
    438 
    439                 final int waitState = evaluateCheckerCompletionLocked();
    440                 if (waitState == COMPLETED) {
    441                     // The monitors have returned; reset
    442                     waitedHalf = false;
    443                     continue;
    444                 } else if (waitState == WAITING) {
    445                     // still waiting but within their configured intervals; back off and recheck
    446                     continue;
    447                 } else if (waitState == WAITED_HALF) {
    448                     if (!waitedHalf) {
    449                         // We've waited half the deadlock-detection interval.  Pull a stack
    450                         // trace and wait another half.
    451                         ArrayList<Integer> pids = new ArrayList<Integer>();
    452                         pids.add(Process.myPid());
    453                         ActivityManagerService.dumpStackTraces(true, pids, null, null,
    454                             getInterestingNativePids());
    455                         waitedHalf = true;
    456                     }
    457                     continue;
    458                 }
    459 
    460                 // something is overdue!
    461                 blockedCheckers = getBlockedCheckersLocked();
    462                 subject = describeCheckersLocked(blockedCheckers);
    463                 allowRestart = mAllowRestart;
    464             }
    465 
    466             // If we got here, that means that the system is most likely hung.
    467             // First collect stack traces from all threads of the system process.
    468             // Then kill this process so that the system will restart.
    469             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
    470 
    471             ArrayList<Integer> pids = new ArrayList<>();
    472             pids.add(Process.myPid());
    473             if (mPhonePid > 0) pids.add(mPhonePid);
    474             // Pass !waitedHalf so that just in case we somehow wind up here without having
    475             // dumped the halfway stacks, we properly re-initialize the trace file.
    476             final File stack = ActivityManagerService.dumpStackTraces(
    477                     !waitedHalf, pids, null, null, getInterestingNativePids());
    478 
    479             // Give some extra time to make sure the stack traces get written.
    480             // The system's been hanging for a minute, another second or two won't hurt much.
    481             SystemClock.sleep(2000);
    482 
    483             // Pull our own kernel thread stacks as well if we're configured for that
    484             if (RECORD_KERNEL_THREADS) {
    485                 dumpKernelStackTraces();
    486             }
    487 
    488             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
    489             doSysRq('w');
    490             doSysRq('l');
    491 
    492             // Try to add the error to the dropbox, but assuming that the ActivityManager
    493             // itself may be deadlocked.  (which has happened, causing this statement to
    494             // deadlock and the watchdog as a whole to be ineffective)
    495             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
    496                     public void run() {
    497                         mActivity.addErrorToDropBox(
    498                                 "watchdog", null, "system_server", null, null,
    499                                 subject, null, stack, null);
    500                     }
    501                 };
    502             dropboxThread.start();
    503             try {
    504                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
    505             } catch (InterruptedException ignored) {}
    506 
    507             IActivityController controller;
    508             synchronized (this) {
    509                 controller = mController;
    510             }
    511             if (controller != null) {
    512                 Slog.i(TAG, "Reporting stuck state to activity controller");
    513                 try {
    514                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
    515                     // 1 = keep waiting, -1 = kill system
    516                     int res = controller.systemNotResponding(subject);
    517                     if (res >= 0) {
    518                         Slog.i(TAG, "Activity controller requested to coninue to wait");
    519                         waitedHalf = false;
    520                         continue;
    521                     }
    522                 } catch (RemoteException e) {
    523                 }
    524             }
    525 
    526             // Only kill the process if the debugger is not attached.
    527             if (Debug.isDebuggerConnected()) {
    528                 debuggerWasConnected = 2;
    529             }
    530             if (debuggerWasConnected >= 2) {
    531                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
    532             } else if (debuggerWasConnected > 0) {
    533                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
    534             } else if (!allowRestart) {
    535                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
    536             } else {
    537                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
    538                 for (int i=0; i<blockedCheckers.size(); i++) {
    539                     Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
    540                     StackTraceElement[] stackTrace
    541                             = blockedCheckers.get(i).getThread().getStackTrace();
    542                     for (StackTraceElement element: stackTrace) {
    543                         Slog.w(TAG, "    at " + element);
    544                     }
    545                 }
    546                 Slog.w(TAG, "*** GOODBYE!");
    547                 Process.killProcess(Process.myPid());
    548                 System.exit(10);
    549             }
    550 
    551             waitedHalf = false;
    552         }
    553     }
    554 
    555     private void doSysRq(char c) {
    556         try {
    557             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
    558             sysrq_trigger.write(c);
    559             sysrq_trigger.close();
    560         } catch (IOException e) {
    561             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
    562         }
    563     }
    564 
    565     private File dumpKernelStackTraces() {
    566         String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
    567         if (tracesPath == null || tracesPath.length() == 0) {
    568             return null;
    569         }
    570 
    571         native_dumpKernelStacks(tracesPath);
    572         return new File(tracesPath);
    573     }
    574 
    // Native counterpart of dumpKernelStackTraces(); receives the trace file path.
    private native void native_dumpKernelStacks(String tracesPath);
    576 }
    577