Home | History | Annotate | Download | only in server
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.server;
     18 
     19 import android.app.IActivityController;
     20 import android.os.Binder;
     21 import android.os.RemoteException;
     22 import com.android.server.am.ActivityManagerService;
     23 import com.android.server.power.PowerManagerService;
     24 
     25 import android.app.AlarmManager;
     26 import android.app.PendingIntent;
     27 import android.content.BroadcastReceiver;
     28 import android.content.ContentResolver;
     29 import android.content.Context;
     30 import android.content.Intent;
     31 import android.content.IntentFilter;
     32 import android.os.BatteryManager;
     33 import android.os.Debug;
     34 import android.os.Handler;
     35 import android.os.Looper;
     36 import android.os.Process;
     37 import android.os.ServiceManager;
     38 import android.os.SystemClock;
     39 import android.os.SystemProperties;
     40 import android.util.EventLog;
     41 import android.util.Log;
     42 import android.util.Slog;
     43 
     44 import java.io.File;
     45 import java.io.FileWriter;
     46 import java.io.IOException;
     47 import java.util.ArrayList;
     48 import java.util.Calendar;
     49 
     50 /** This class calls its monitor every minute. Killing this process if they don't return **/
     51 public class Watchdog extends Thread {
     52     static final String TAG = "Watchdog";
     53     static final boolean localLOGV = false || false;
     54 
     55     // Set this to true to use debug default values.
     56     static final boolean DB = false;
     57 
     58     // Set this to true to have the watchdog record kernel thread stacks when it fires
     59     static final boolean RECORD_KERNEL_THREADS = true;
     60 
     61     static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
     62     static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
     63 
     64     // These are temporally ordered: larger values as lateness increases
     65     static final int COMPLETED = 0;
     66     static final int WAITING = 1;
     67     static final int WAITED_HALF = 2;
     68     static final int OVERDUE = 3;
     69 
     70     // Which native processes to dump into dropbox's stack traces
     71     public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
     72         "/system/bin/mediaserver",
     73         "/system/bin/sdcard",
     74         "/system/bin/surfaceflinger"
     75     };
     76 
     77     static Watchdog sWatchdog;
     78 
     79     /* This handler will be used to post message back onto the main thread */
     80     final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>();
     81     final HandlerChecker mMonitorChecker;
     82     ContentResolver mResolver;
     83     BatteryService mBattery;
     84     PowerManagerService mPower;
     85     AlarmManagerService mAlarm;
     86     ActivityManagerService mActivity;
     87 
     88     int mPhonePid;
     89     IActivityController mController;
     90     boolean mAllowRestart = true;
     91 
     92     /**
     93      * Used for checking status of handle threads and scheduling monitor callbacks.
     94      */
     95     public final class HandlerChecker implements Runnable {
     96         private final Handler mHandler;
     97         private final String mName;
     98         private final long mWaitMax;
     99         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
    100         private boolean mCompleted;
    101         private Monitor mCurrentMonitor;
    102         private long mStartTime;
    103 
    104         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
    105             mHandler = handler;
    106             mName = name;
    107             mWaitMax = waitMaxMillis;
    108             mCompleted = true;
    109         }
    110 
    111         public void addMonitor(Monitor monitor) {
    112             mMonitors.add(monitor);
    113         }
    114 
    115         public void scheduleCheckLocked() {
    116             if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) {
    117                 // If the target looper is or just recently was idling, then
    118                 // there is no reason to enqueue our checker on it since that
    119                 // is as good as it not being deadlocked.  This avoid having
    120                 // to do a context switch to check the thread.  Note that we
    121                 // only do this if mCheckReboot is false and we have no
    122                 // monitors, since those would need to be executed at this point.
    123                 mCompleted = true;
    124                 return;
    125             }
    126 
    127             if (!mCompleted) {
    128                 // we already have a check in flight, so no need
    129                 return;
    130             }
    131 
    132             mCompleted = false;
    133             mCurrentMonitor = null;
    134             mStartTime = SystemClock.uptimeMillis();
    135             mHandler.postAtFrontOfQueue(this);
    136         }
    137 
    138         public boolean isOverdueLocked() {
    139             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
    140         }
    141 
    142         public int getCompletionStateLocked() {
    143             if (mCompleted) {
    144                 return COMPLETED;
    145             } else {
    146                 long latency = SystemClock.uptimeMillis() - mStartTime;
    147                 if (latency < mWaitMax/2) {
    148                     return WAITING;
    149                 } else if (latency < mWaitMax) {
    150                     return WAITED_HALF;
    151                 }
    152             }
    153             return OVERDUE;
    154         }
    155 
    156         public Thread getThread() {
    157             return mHandler.getLooper().getThread();
    158         }
    159 
    160         public String getName() {
    161             return mName;
    162         }
    163 
    164         public String describeBlockedStateLocked() {
    165             if (mCurrentMonitor == null) {
    166                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
    167             } else {
    168                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
    169                         + " on " + mName + " (" + getThread().getName() + ")";
    170             }
    171         }
    172 
    173         @Override
    174         public void run() {
    175             final int size = mMonitors.size();
    176             for (int i = 0 ; i < size ; i++) {
    177                 synchronized (Watchdog.this) {
    178                     mCurrentMonitor = mMonitors.get(i);
    179                 }
    180                 mCurrentMonitor.monitor();
    181             }
    182 
    183             synchronized (Watchdog.this) {
    184                 mCompleted = true;
    185                 mCurrentMonitor = null;
    186             }
    187         }
    188     }
    189 
    190     final class RebootRequestReceiver extends BroadcastReceiver {
    191         @Override
    192         public void onReceive(Context c, Intent intent) {
    193             if (intent.getIntExtra("nowait", 0) != 0) {
    194                 rebootSystem("Received ACTION_REBOOT broadcast");
    195                 return;
    196             }
    197             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
    198         }
    199     }
    200 
    201     public interface Monitor {
    202         void monitor();
    203     }
    204 
    205     public static Watchdog getInstance() {
    206         if (sWatchdog == null) {
    207             sWatchdog = new Watchdog();
    208         }
    209 
    210         return sWatchdog;
    211     }
    212 
    213     private Watchdog() {
    214         super("watchdog");
    215         // Initialize handler checkers for each common thread we want to check.  Note
    216         // that we are not currently checking the background thread, since it can
    217         // potentially hold longer running operations with no guarantees about the timeliness
    218         // of operations there.
    219 
    220         // The shared foreground thread is the main checker.  It is where we
    221         // will also dispatch monitor checks and do other work.
    222         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
    223                 "foreground thread", DEFAULT_TIMEOUT);
    224         mHandlerCheckers.add(mMonitorChecker);
    225         // Add checker for main thread.  We only do a quick check since there
    226         // can be UI running on the thread.
    227         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
    228                 "main thread", DEFAULT_TIMEOUT));
    229         // Add checker for shared UI thread.
    230         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
    231                 "ui thread", DEFAULT_TIMEOUT));
    232         // And also check IO thread.
    233         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
    234                 "i/o thread", DEFAULT_TIMEOUT));
    235     }
    236 
    237     public void init(Context context, BatteryService battery,
    238             PowerManagerService power, AlarmManagerService alarm,
    239             ActivityManagerService activity) {
    240         mResolver = context.getContentResolver();
    241         mBattery = battery;
    242         mPower = power;
    243         mAlarm = alarm;
    244         mActivity = activity;
    245 
    246         context.registerReceiver(new RebootRequestReceiver(),
    247                 new IntentFilter(Intent.ACTION_REBOOT),
    248                 android.Manifest.permission.REBOOT, null);
    249     }
    250 
    251     public void processStarted(String name, int pid) {
    252         synchronized (this) {
    253             if ("com.android.phone".equals(name)) {
    254                 mPhonePid = pid;
    255             }
    256         }
    257     }
    258 
    259     public void setActivityController(IActivityController controller) {
    260         synchronized (this) {
    261             mController = controller;
    262         }
    263     }
    264 
    265     public void setAllowRestart(boolean allowRestart) {
    266         synchronized (this) {
    267             mAllowRestart = allowRestart;
    268         }
    269     }
    270 
    271     public void addMonitor(Monitor monitor) {
    272         synchronized (this) {
    273             if (isAlive()) {
    274                 throw new RuntimeException("Monitors can't be added once the Watchdog is running");
    275             }
    276             mMonitorChecker.addMonitor(monitor);
    277         }
    278     }
    279 
    280     public void addThread(Handler thread, String name) {
    281         addThread(thread, name, DEFAULT_TIMEOUT);
    282     }
    283 
    284     public void addThread(Handler thread, String name, long timeoutMillis) {
    285         synchronized (this) {
    286             if (isAlive()) {
    287                 throw new RuntimeException("Threads can't be added once the Watchdog is running");
    288             }
    289             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
    290         }
    291     }
    292 
    293     /**
    294      * Perform a full reboot of the system.
    295      */
    296     void rebootSystem(String reason) {
    297         Slog.i(TAG, "Rebooting system because: " + reason);
    298         PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
    299         pms.reboot(false, reason, false);
    300     }
    301 
    302     private int evaluateCheckerCompletionLocked() {
    303         int state = COMPLETED;
    304         for (int i=0; i<mHandlerCheckers.size(); i++) {
    305             HandlerChecker hc = mHandlerCheckers.get(i);
    306             state = Math.max(state, hc.getCompletionStateLocked());
    307         }
    308         return state;
    309     }
    310 
    311     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
    312         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
    313         for (int i=0; i<mHandlerCheckers.size(); i++) {
    314             HandlerChecker hc = mHandlerCheckers.get(i);
    315             if (hc.isOverdueLocked()) {
    316                 checkers.add(hc);
    317             }
    318         }
    319         return checkers;
    320     }
    321 
    322     private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
    323         StringBuilder builder = new StringBuilder(128);
    324         for (int i=0; i<checkers.size(); i++) {
    325             if (builder.length() > 0) {
    326                 builder.append(", ");
    327             }
    328             builder.append(checkers.get(i).describeBlockedStateLocked());
    329         }
    330         return builder.toString();
    331     }
    332 
    333     @Override
    334     public void run() {
    335         boolean waitedHalf = false;
    336         while (true) {
    337             final ArrayList<HandlerChecker> blockedCheckers;
    338             final String subject;
    339             final boolean allowRestart;
    340             synchronized (this) {
    341                 long timeout = CHECK_INTERVAL;
    342                 // Make sure we (re)spin the checkers that have become idle within
    343                 // this wait-and-check interval
    344                 for (int i=0; i<mHandlerCheckers.size(); i++) {
    345                     HandlerChecker hc = mHandlerCheckers.get(i);
    346                     hc.scheduleCheckLocked();
    347                 }
    348 
    349                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
    350                 // wait while asleep. If the device is asleep then the thing that we are waiting
    351                 // to timeout on is asleep as well and won't have a chance to run, causing a false
    352                 // positive on when to kill things.
    353                 long start = SystemClock.uptimeMillis();
    354                 while (timeout > 0) {
    355                     try {
    356                         wait(timeout);
    357                     } catch (InterruptedException e) {
    358                         Log.wtf(TAG, e);
    359                     }
    360                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
    361                 }
    362 
    363                 final int waitState = evaluateCheckerCompletionLocked();
    364                 if (waitState == COMPLETED) {
    365                     // The monitors have returned; reset
    366                     waitedHalf = false;
    367                     continue;
    368                 } else if (waitState == WAITING) {
    369                     // still waiting but within their configured intervals; back off and recheck
    370                     continue;
    371                 } else if (waitState == WAITED_HALF) {
    372                     if (!waitedHalf) {
    373                         // We've waited half the deadlock-detection interval.  Pull a stack
    374                         // trace and wait another half.
    375                         ArrayList<Integer> pids = new ArrayList<Integer>();
    376                         pids.add(Process.myPid());
    377                         ActivityManagerService.dumpStackTraces(true, pids, null, null,
    378                                 NATIVE_STACKS_OF_INTEREST);
    379                         waitedHalf = true;
    380                     }
    381                     continue;
    382                 }
    383 
    384                 // something is overdue!
    385                 blockedCheckers = getBlockedCheckersLocked();
    386                 subject = describeCheckersLocked(blockedCheckers);
    387                 allowRestart = mAllowRestart;
    388             }
    389 
    390             // If we got here, that means that the system is most likely hung.
    391             // First collect stack traces from all threads of the system process.
    392             // Then kill this process so that the system will restart.
    393             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
    394 
    395             ArrayList<Integer> pids = new ArrayList<Integer>();
    396             pids.add(Process.myPid());
    397             if (mPhonePid > 0) pids.add(mPhonePid);
    398             // Pass !waitedHalf so that just in case we somehow wind up here without having
    399             // dumped the halfway stacks, we properly re-initialize the trace file.
    400             final File stack = ActivityManagerService.dumpStackTraces(
    401                     !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
    402 
    403             // Give some extra time to make sure the stack traces get written.
    404             // The system's been hanging for a minute, another second or two won't hurt much.
    405             SystemClock.sleep(2000);
    406 
    407             // Pull our own kernel thread stacks as well if we're configured for that
    408             if (RECORD_KERNEL_THREADS) {
    409                 dumpKernelStackTraces();
    410             }
    411 
    412             // Trigger the kernel to dump all blocked threads to the kernel log
    413             try {
    414                 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
    415                 sysrq_trigger.write("w");
    416                 sysrq_trigger.close();
    417             } catch (IOException e) {
    418                 Slog.e(TAG, "Failed to write to /proc/sysrq-trigger");
    419                 Slog.e(TAG, e.getMessage());
    420             }
    421 
    422             // Try to add the error to the dropbox, but assuming that the ActivityManager
    423             // itself may be deadlocked.  (which has happened, causing this statement to
    424             // deadlock and the watchdog as a whole to be ineffective)
    425             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
    426                     public void run() {
    427                         mActivity.addErrorToDropBox(
    428                                 "watchdog", null, "system_server", null, null,
    429                                 subject, null, stack, null);
    430                     }
    431                 };
    432             dropboxThread.start();
    433             try {
    434                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
    435             } catch (InterruptedException ignored) {}
    436 
    437             IActivityController controller;
    438             synchronized (this) {
    439                 controller = mController;
    440             }
    441             if (controller != null) {
    442                 Slog.i(TAG, "Reporting stuck state to activity controller");
    443                 try {
    444                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
    445                     // 1 = keep waiting, -1 = kill system
    446                     int res = controller.systemNotResponding(subject);
    447                     if (res >= 0) {
    448                         Slog.i(TAG, "Activity controller requested to coninue to wait");
    449                         waitedHalf = false;
    450                         continue;
    451                     }
    452                 } catch (RemoteException e) {
    453                 }
    454             }
    455 
    456             // Only kill the process if the debugger is not attached.
    457             if (Debug.isDebuggerConnected()) {
    458                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
    459             } else if (!allowRestart) {
    460                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
    461             } else {
    462                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
    463                 for (int i=0; i<blockedCheckers.size(); i++) {
    464                     Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
    465                     StackTraceElement[] stackTrace
    466                             = blockedCheckers.get(i).getThread().getStackTrace();
    467                     for (StackTraceElement element: stackTrace) {
    468                         Slog.w(TAG, "    at " + element);
    469                     }
    470                 }
    471                 Slog.w(TAG, "*** GOODBYE!");
    472                 Process.killProcess(Process.myPid());
    473                 System.exit(10);
    474             }
    475 
    476             waitedHalf = false;
    477         }
    478     }
    479 
    480     private File dumpKernelStackTraces() {
    481         String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
    482         if (tracesPath == null || tracesPath.length() == 0) {
    483             return null;
    484         }
    485 
    486         native_dumpKernelStacks(tracesPath);
    487         return new File(tracesPath);
    488     }
    489 
    490     private native void native_dumpKernelStacks(String tracesPath);
    491 }
    492