Home | History | Annotate | Download | only in server
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.server;
     18 
     19 import android.app.IActivityController;
     20 import android.os.Binder;
     21 import android.os.RemoteException;
     22 import com.android.server.am.ActivityManagerService;
     23 
     24 import android.content.BroadcastReceiver;
     25 import android.content.ContentResolver;
     26 import android.content.Context;
     27 import android.content.Intent;
     28 import android.content.IntentFilter;
     29 import android.os.Debug;
     30 import android.os.Handler;
     31 import android.os.IPowerManager;
     32 import android.os.Looper;
     33 import android.os.Process;
     34 import android.os.ServiceManager;
     35 import android.os.SystemClock;
     36 import android.os.SystemProperties;
     37 import android.util.EventLog;
     38 import android.util.Log;
     39 import android.util.Slog;
     40 
     41 import java.io.File;
     42 import java.io.FileWriter;
     43 import java.io.IOException;
     44 import java.util.ArrayList;
     45 
     46 /** This class periodically calls its monitors, killing this process if they don't return in time. **/
     47 public class Watchdog extends Thread {
     48     static final String TAG = "Watchdog";
     49     static final boolean localLOGV = false || false;
     50 
     51     // Set this to true to use debug default values.
     52     static final boolean DB = false;
     53 
     54     // Set this to true to have the watchdog record kernel thread stacks when it fires
     55     static final boolean RECORD_KERNEL_THREADS = true;
     56 
     57     static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
     58     static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
     59 
     60     // These are temporally ordered: larger values as lateness increases
     61     static final int COMPLETED = 0;
     62     static final int WAITING = 1;
     63     static final int WAITED_HALF = 2;
     64     static final int OVERDUE = 3;
     65 
     66     // Which native processes to dump into dropbox's stack traces
     67     public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
     68         "/system/bin/mediaserver",
     69         "/system/bin/sdcard",
     70         "/system/bin/surfaceflinger"
     71     };
     72 
     73     static Watchdog sWatchdog;
     74 
     75     /* This handler will be used to post message back onto the main thread */
     76     final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>();
     77     final HandlerChecker mMonitorChecker;
     78     ContentResolver mResolver;
     79     ActivityManagerService mActivity;
     80 
     81     int mPhonePid;
     82     IActivityController mController;
     83     boolean mAllowRestart = true;
     84 
     85     /**
     86      * Used for checking status of handle threads and scheduling monitor callbacks.
     87      */
     88     public final class HandlerChecker implements Runnable {
     89         private final Handler mHandler;
     90         private final String mName;
     91         private final long mWaitMax;
     92         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
     93         private boolean mCompleted;
     94         private Monitor mCurrentMonitor;
     95         private long mStartTime;
     96 
     97         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
     98             mHandler = handler;
     99             mName = name;
    100             mWaitMax = waitMaxMillis;
    101             mCompleted = true;
    102         }
    103 
    104         public void addMonitor(Monitor monitor) {
    105             mMonitors.add(monitor);
    106         }
    107 
    108         public void scheduleCheckLocked() {
    109             if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) {
    110                 // If the target looper is or just recently was idling, then
    111                 // there is no reason to enqueue our checker on it since that
    112                 // is as good as it not being deadlocked.  This avoid having
    113                 // to do a context switch to check the thread.  Note that we
    114                 // only do this if mCheckReboot is false and we have no
    115                 // monitors, since those would need to be executed at this point.
    116                 mCompleted = true;
    117                 return;
    118             }
    119 
    120             if (!mCompleted) {
    121                 // we already have a check in flight, so no need
    122                 return;
    123             }
    124 
    125             mCompleted = false;
    126             mCurrentMonitor = null;
    127             mStartTime = SystemClock.uptimeMillis();
    128             mHandler.postAtFrontOfQueue(this);
    129         }
    130 
    131         public boolean isOverdueLocked() {
    132             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
    133         }
    134 
    135         public int getCompletionStateLocked() {
    136             if (mCompleted) {
    137                 return COMPLETED;
    138             } else {
    139                 long latency = SystemClock.uptimeMillis() - mStartTime;
    140                 if (latency < mWaitMax/2) {
    141                     return WAITING;
    142                 } else if (latency < mWaitMax) {
    143                     return WAITED_HALF;
    144                 }
    145             }
    146             return OVERDUE;
    147         }
    148 
    149         public Thread getThread() {
    150             return mHandler.getLooper().getThread();
    151         }
    152 
    153         public String getName() {
    154             return mName;
    155         }
    156 
    157         public String describeBlockedStateLocked() {
    158             if (mCurrentMonitor == null) {
    159                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
    160             } else {
    161                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
    162                         + " on " + mName + " (" + getThread().getName() + ")";
    163             }
    164         }
    165 
    166         @Override
    167         public void run() {
    168             final int size = mMonitors.size();
    169             for (int i = 0 ; i < size ; i++) {
    170                 synchronized (Watchdog.this) {
    171                     mCurrentMonitor = mMonitors.get(i);
    172                 }
    173                 mCurrentMonitor.monitor();
    174             }
    175 
    176             synchronized (Watchdog.this) {
    177                 mCompleted = true;
    178                 mCurrentMonitor = null;
    179             }
    180         }
    181     }
    182 
    183     final class RebootRequestReceiver extends BroadcastReceiver {
    184         @Override
    185         public void onReceive(Context c, Intent intent) {
    186             if (intent.getIntExtra("nowait", 0) != 0) {
    187                 rebootSystem("Received ACTION_REBOOT broadcast");
    188                 return;
    189             }
    190             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
    191         }
    192     }
    193 
    /**
     * Callback checked by the watchdog. Registered monitors are invoked from a
     * handler thread by a HandlerChecker; a call that does not return in time is
     * reported as a blocked monitor.
     */
    public interface Monitor {
        /** Invoked on each check; expected to return once the monitored state is reachable. */
        void monitor();
    }
    197 
    198     public static Watchdog getInstance() {
    199         if (sWatchdog == null) {
    200             sWatchdog = new Watchdog();
    201         }
    202 
    203         return sWatchdog;
    204     }
    205 
    206     private Watchdog() {
    207         super("watchdog");
    208         // Initialize handler checkers for each common thread we want to check.  Note
    209         // that we are not currently checking the background thread, since it can
    210         // potentially hold longer running operations with no guarantees about the timeliness
    211         // of operations there.
    212 
    213         // The shared foreground thread is the main checker.  It is where we
    214         // will also dispatch monitor checks and do other work.
    215         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
    216                 "foreground thread", DEFAULT_TIMEOUT);
    217         mHandlerCheckers.add(mMonitorChecker);
    218         // Add checker for main thread.  We only do a quick check since there
    219         // can be UI running on the thread.
    220         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
    221                 "main thread", DEFAULT_TIMEOUT));
    222         // Add checker for shared UI thread.
    223         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
    224                 "ui thread", DEFAULT_TIMEOUT));
    225         // And also check IO thread.
    226         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
    227                 "i/o thread", DEFAULT_TIMEOUT));
    228         // And the display thread.
    229         mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
    230                 "display thread", DEFAULT_TIMEOUT));
    231     }
    232 
    233     public void init(Context context, ActivityManagerService activity) {
    234         mResolver = context.getContentResolver();
    235         mActivity = activity;
    236 
    237         context.registerReceiver(new RebootRequestReceiver(),
    238                 new IntentFilter(Intent.ACTION_REBOOT),
    239                 android.Manifest.permission.REBOOT, null);
    240     }
    241 
    242     public void processStarted(String name, int pid) {
    243         synchronized (this) {
    244             if ("com.android.phone".equals(name)) {
    245                 mPhonePid = pid;
    246             }
    247         }
    248     }
    249 
    250     public void setActivityController(IActivityController controller) {
    251         synchronized (this) {
    252             mController = controller;
    253         }
    254     }
    255 
    256     public void setAllowRestart(boolean allowRestart) {
    257         synchronized (this) {
    258             mAllowRestart = allowRestart;
    259         }
    260     }
    261 
    262     public void addMonitor(Monitor monitor) {
    263         synchronized (this) {
    264             if (isAlive()) {
    265                 throw new RuntimeException("Monitors can't be added once the Watchdog is running");
    266             }
    267             mMonitorChecker.addMonitor(monitor);
    268         }
    269     }
    270 
    271     public void addThread(Handler thread) {
    272         addThread(thread, DEFAULT_TIMEOUT);
    273     }
    274 
    275     public void addThread(Handler thread, long timeoutMillis) {
    276         synchronized (this) {
    277             if (isAlive()) {
    278                 throw new RuntimeException("Threads can't be added once the Watchdog is running");
    279             }
    280             final String name = thread.getLooper().getThread().getName();
    281             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
    282         }
    283     }
    284 
    285     /**
    286      * Perform a full reboot of the system.
    287      */
    288     void rebootSystem(String reason) {
    289         Slog.i(TAG, "Rebooting system because: " + reason);
    290         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
    291         try {
    292             pms.reboot(false, reason, false);
    293         } catch (RemoteException ex) {
    294         }
    295     }
    296 
    297     private int evaluateCheckerCompletionLocked() {
    298         int state = COMPLETED;
    299         for (int i=0; i<mHandlerCheckers.size(); i++) {
    300             HandlerChecker hc = mHandlerCheckers.get(i);
    301             state = Math.max(state, hc.getCompletionStateLocked());
    302         }
    303         return state;
    304     }
    305 
    306     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
    307         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
    308         for (int i=0; i<mHandlerCheckers.size(); i++) {
    309             HandlerChecker hc = mHandlerCheckers.get(i);
    310             if (hc.isOverdueLocked()) {
    311                 checkers.add(hc);
    312             }
    313         }
    314         return checkers;
    315     }
    316 
    317     private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
    318         StringBuilder builder = new StringBuilder(128);
    319         for (int i=0; i<checkers.size(); i++) {
    320             if (builder.length() > 0) {
    321                 builder.append(", ");
    322             }
    323             builder.append(checkers.get(i).describeBlockedStateLocked());
    324         }
    325         return builder.toString();
    326     }
    327 
    328     @Override
    329     public void run() {
    330         boolean waitedHalf = false;
    331         while (true) {
    332             final ArrayList<HandlerChecker> blockedCheckers;
    333             final String subject;
    334             final boolean allowRestart;
    335             int debuggerWasConnected = 0;
    336             synchronized (this) {
    337                 long timeout = CHECK_INTERVAL;
    338                 // Make sure we (re)spin the checkers that have become idle within
    339                 // this wait-and-check interval
    340                 for (int i=0; i<mHandlerCheckers.size(); i++) {
    341                     HandlerChecker hc = mHandlerCheckers.get(i);
    342                     hc.scheduleCheckLocked();
    343                 }
    344 
    345                 if (debuggerWasConnected > 0) {
    346                     debuggerWasConnected--;
    347                 }
    348 
    349                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
    350                 // wait while asleep. If the device is asleep then the thing that we are waiting
    351                 // to timeout on is asleep as well and won't have a chance to run, causing a false
    352                 // positive on when to kill things.
    353                 long start = SystemClock.uptimeMillis();
    354                 while (timeout > 0) {
    355                     if (Debug.isDebuggerConnected()) {
    356                         debuggerWasConnected = 2;
    357                     }
    358                     try {
    359                         wait(timeout);
    360                     } catch (InterruptedException e) {
    361                         Log.wtf(TAG, e);
    362                     }
    363                     if (Debug.isDebuggerConnected()) {
    364                         debuggerWasConnected = 2;
    365                     }
    366                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
    367                 }
    368 
    369                 final int waitState = evaluateCheckerCompletionLocked();
    370                 if (waitState == COMPLETED) {
    371                     // The monitors have returned; reset
    372                     waitedHalf = false;
    373                     continue;
    374                 } else if (waitState == WAITING) {
    375                     // still waiting but within their configured intervals; back off and recheck
    376                     continue;
    377                 } else if (waitState == WAITED_HALF) {
    378                     if (!waitedHalf) {
    379                         // We've waited half the deadlock-detection interval.  Pull a stack
    380                         // trace and wait another half.
    381                         ArrayList<Integer> pids = new ArrayList<Integer>();
    382                         pids.add(Process.myPid());
    383                         ActivityManagerService.dumpStackTraces(true, pids, null, null,
    384                                 NATIVE_STACKS_OF_INTEREST);
    385                         waitedHalf = true;
    386                     }
    387                     continue;
    388                 }
    389 
    390                 // something is overdue!
    391                 blockedCheckers = getBlockedCheckersLocked();
    392                 subject = describeCheckersLocked(blockedCheckers);
    393                 allowRestart = mAllowRestart;
    394             }
    395 
    396             // If we got here, that means that the system is most likely hung.
    397             // First collect stack traces from all threads of the system process.
    398             // Then kill this process so that the system will restart.
    399             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
    400 
    401             ArrayList<Integer> pids = new ArrayList<Integer>();
    402             pids.add(Process.myPid());
    403             if (mPhonePid > 0) pids.add(mPhonePid);
    404             // Pass !waitedHalf so that just in case we somehow wind up here without having
    405             // dumped the halfway stacks, we properly re-initialize the trace file.
    406             final File stack = ActivityManagerService.dumpStackTraces(
    407                     !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
    408 
    409             // Give some extra time to make sure the stack traces get written.
    410             // The system's been hanging for a minute, another second or two won't hurt much.
    411             SystemClock.sleep(2000);
    412 
    413             // Pull our own kernel thread stacks as well if we're configured for that
    414             if (RECORD_KERNEL_THREADS) {
    415                 dumpKernelStackTraces();
    416             }
    417 
    418             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
    419             doSysRq('w');
    420             doSysRq('l');
    421 
    422             // Try to add the error to the dropbox, but assuming that the ActivityManager
    423             // itself may be deadlocked.  (which has happened, causing this statement to
    424             // deadlock and the watchdog as a whole to be ineffective)
    425             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
    426                     public void run() {
    427                         mActivity.addErrorToDropBox(
    428                                 "watchdog", null, "system_server", null, null,
    429                                 subject, null, stack, null);
    430                     }
    431                 };
    432             dropboxThread.start();
    433             try {
    434                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
    435             } catch (InterruptedException ignored) {}
    436 
    437             IActivityController controller;
    438             synchronized (this) {
    439                 controller = mController;
    440             }
    441             if (controller != null) {
    442                 Slog.i(TAG, "Reporting stuck state to activity controller");
    443                 try {
    444                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
    445                     // 1 = keep waiting, -1 = kill system
    446                     int res = controller.systemNotResponding(subject);
    447                     if (res >= 0) {
    448                         Slog.i(TAG, "Activity controller requested to coninue to wait");
    449                         waitedHalf = false;
    450                         continue;
    451                     }
    452                 } catch (RemoteException e) {
    453                 }
    454             }
    455 
    456             // Only kill the process if the debugger is not attached.
    457             if (Debug.isDebuggerConnected()) {
    458                 debuggerWasConnected = 2;
    459             }
    460             if (debuggerWasConnected >= 2) {
    461                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
    462             } else if (debuggerWasConnected > 0) {
    463                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
    464             } else if (!allowRestart) {
    465                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
    466             } else {
    467                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
    468                 for (int i=0; i<blockedCheckers.size(); i++) {
    469                     Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
    470                     StackTraceElement[] stackTrace
    471                             = blockedCheckers.get(i).getThread().getStackTrace();
    472                     for (StackTraceElement element: stackTrace) {
    473                         Slog.w(TAG, "    at " + element);
    474                     }
    475                 }
    476                 Slog.w(TAG, "*** GOODBYE!");
    477                 Process.killProcess(Process.myPid());
    478                 System.exit(10);
    479             }
    480 
    481             waitedHalf = false;
    482         }
    483     }
    484 
    485     private void doSysRq(char c) {
    486         try {
    487             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
    488             sysrq_trigger.write(c);
    489             sysrq_trigger.close();
    490         } catch (IOException e) {
    491             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
    492         }
    493     }
    494 
    495     private File dumpKernelStackTraces() {
    496         String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
    497         if (tracesPath == null || tracesPath.length() == 0) {
    498             return null;
    499         }
    500 
    501         native_dumpKernelStacks(tracesPath);
    502         return new File(tracesPath);
    503     }
    504 
    505     private native void native_dumpKernelStacks(String tracesPath);
    506 }
    507