Home | History | Annotate | Download | only in server
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.server;
     18 
     19 import android.app.IActivityController;
     20 import android.os.Binder;
     21 import android.os.RemoteException;
     22 import com.android.server.am.ActivityManagerService;
     23 import com.android.server.power.PowerManagerService;
     24 
     25 import android.app.AlarmManager;
     26 import android.app.PendingIntent;
     27 import android.content.BroadcastReceiver;
     28 import android.content.ContentResolver;
     29 import android.content.Context;
     30 import android.content.Intent;
     31 import android.content.IntentFilter;
     32 import android.os.BatteryManager;
     33 import android.os.Debug;
     34 import android.os.Handler;
     35 import android.os.Looper;
     36 import android.os.Message;
     37 import android.os.Process;
     38 import android.os.ServiceManager;
     39 import android.os.SystemClock;
     40 import android.os.SystemProperties;
     41 import android.util.EventLog;
     42 import android.util.Log;
     43 import android.util.Slog;
     44 
     45 import java.io.File;
     46 import java.io.FileWriter;
     47 import java.io.IOException;
     48 import java.util.ArrayList;
     49 import java.util.Calendar;
     50 
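      /**
       * Watchdog thread. It posts a check to its registered monitors every
       * {@code TIME_TO_WAIT} milliseconds and kills this process if they have not
       * returned after {@code TIME_TO_RESTART} milliseconds.
       */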
     52 public class Watchdog extends Thread {
     53     static final String TAG = "Watchdog";
     54     static final boolean localLOGV = false || false;
     55 
     56     // Set this to true to use debug default values.
     57     static final boolean DB = false;
     58 
     59     // Set this to true to have the watchdog record kernel thread stacks when it fires
     60     static final boolean RECORD_KERNEL_THREADS = true;
     61 
     62     static final int MONITOR = 2718;
     63 
     64     static final int TIME_TO_RESTART = DB ? 15*1000 : 60*1000;
     65     static final int TIME_TO_WAIT = TIME_TO_RESTART / 2;
     66 
     67     static final int MEMCHECK_DEFAULT_MIN_SCREEN_OFF = DB ? 1*60 : 5*60;   // 5 minutes
     68     static final int MEMCHECK_DEFAULT_MIN_ALARM = DB ? 1*60 : 3*60;        // 3 minutes
     69     static final int MEMCHECK_DEFAULT_RECHECK_INTERVAL = DB ? 1*60 : 5*60; // 5 minutes
     70 
     71     static final int REBOOT_DEFAULT_INTERVAL = DB ? 1 : 0;                 // never force reboot
     72     static final int REBOOT_DEFAULT_START_TIME = 3*60*60;                  // 3:00am
     73     static final int REBOOT_DEFAULT_WINDOW = 60*60;                        // within 1 hour
     74 
     75     static final String REBOOT_ACTION = "com.android.service.Watchdog.REBOOT";
     76 
     77     static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
     78         "/system/bin/mediaserver",
     79         "/system/bin/sdcard",
     80         "/system/bin/surfaceflinger"
     81     };
     82 
     83     static Watchdog sWatchdog;
     84 
     85     /* This handler will be used to post message back onto the main thread */
     86     final Handler mHandler;
     87     final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
     88     ContentResolver mResolver;
     89     BatteryService mBattery;
     90     PowerManagerService mPower;
     91     AlarmManagerService mAlarm;
     92     ActivityManagerService mActivity;
     93     boolean mCompleted;
     94     Monitor mCurrentMonitor;
     95 
     96     int mPhonePid;
     97     IActivityController mController;
     98     boolean mAllowRestart = true;
     99 
    100     final Calendar mCalendar = Calendar.getInstance();
    101     int mMinScreenOff = MEMCHECK_DEFAULT_MIN_SCREEN_OFF;
    102     int mMinAlarm = MEMCHECK_DEFAULT_MIN_ALARM;
    103     boolean mNeedScheduledCheck;
    104     PendingIntent mCheckupIntent;
    105     PendingIntent mRebootIntent;
    106 
    107     long mBootTime;
    108     int mRebootInterval;
    109 
    110     boolean mReqRebootNoWait;     // should wait for one interval before reboot?
    111     int mReqRebootInterval = -1;  // >= 0 if a reboot has been requested
    112     int mReqRebootStartTime = -1; // >= 0 if a specific start time has been requested
    113     int mReqRebootWindow = -1;    // >= 0 if a specific window has been requested
    114     int mReqMinScreenOff = -1;    // >= 0 if a specific screen off time has been requested
    115     int mReqMinNextAlarm = -1;    // >= 0 if specific time to next alarm has been requested
    116     int mReqRecheckInterval= -1;  // >= 0 if a specific recheck interval has been requested
    117 
    118     /**
    119      * Used for scheduling monitor callbacks and checking memory usage.
    120      */
    121     final class HeartbeatHandler extends Handler {
    122         HeartbeatHandler(Looper looper) {
    123             super(looper);
    124         }
    125 
    126         @Override
    127         public void handleMessage(Message msg) {
    128             switch (msg.what) {
    129                 case MONITOR: {
    130                     // See if we should force a reboot.
    131                     int rebootInterval = mReqRebootInterval >= 0
    132                             ? mReqRebootInterval : REBOOT_DEFAULT_INTERVAL;
    133                     if (mRebootInterval != rebootInterval) {
    134                         mRebootInterval = rebootInterval;
    135                         // We have been running long enough that a reboot can
    136                         // be considered...
    137                         checkReboot(false);
    138                     }
    139 
    140                     final int size = mMonitors.size();
    141                     for (int i = 0 ; i < size ; i++) {
    142                         synchronized (Watchdog.this) {
    143                             mCurrentMonitor = mMonitors.get(i);
    144                         }
    145                         mCurrentMonitor.monitor();
    146                     }
    147 
    148                     synchronized (Watchdog.this) {
    149                         mCompleted = true;
    150                         mCurrentMonitor = null;
    151                     }
    152                 } break;
    153             }
    154         }
    155     }
    156 
    157     final class RebootReceiver extends BroadcastReceiver {
    158         @Override
    159         public void onReceive(Context c, Intent intent) {
    160             if (localLOGV) Slog.v(TAG, "Alarm went off, checking reboot.");
    161             checkReboot(true);
    162         }
    163     }
    164 
    165     final class RebootRequestReceiver extends BroadcastReceiver {
    166         @Override
    167         public void onReceive(Context c, Intent intent) {
    168             mReqRebootNoWait = intent.getIntExtra("nowait", 0) != 0;
    169             mReqRebootInterval = intent.getIntExtra("interval", -1);
    170             mReqRebootStartTime = intent.getIntExtra("startTime", -1);
    171             mReqRebootWindow = intent.getIntExtra("window", -1);
    172             mReqMinScreenOff = intent.getIntExtra("minScreenOff", -1);
    173             mReqMinNextAlarm = intent.getIntExtra("minNextAlarm", -1);
    174             mReqRecheckInterval = intent.getIntExtra("recheckInterval", -1);
    175             EventLog.writeEvent(EventLogTags.WATCHDOG_REQUESTED_REBOOT,
    176                     mReqRebootNoWait ? 1 : 0, mReqRebootInterval,
    177                             mReqRecheckInterval, mReqRebootStartTime,
    178                     mReqRebootWindow, mReqMinScreenOff, mReqMinNextAlarm);
    179             checkReboot(true);
    180         }
    181     }
    182 
    183     public interface Monitor {
    184         void monitor();
    185     }
    186 
    187     public static Watchdog getInstance() {
    188         if (sWatchdog == null) {
    189             sWatchdog = new Watchdog();
    190         }
    191 
    192         return sWatchdog;
    193     }
    194 
    195     private Watchdog() {
    196         super("watchdog");
    197         // Explicitly bind the HeartbeatHandler to run on the ServerThread, so
    198         // that it can't get accidentally bound to another thread.
    199         mHandler = new HeartbeatHandler(Looper.getMainLooper());
    200     }
    201 
    202     public void init(Context context, BatteryService battery,
    203             PowerManagerService power, AlarmManagerService alarm,
    204             ActivityManagerService activity) {
    205         mResolver = context.getContentResolver();
    206         mBattery = battery;
    207         mPower = power;
    208         mAlarm = alarm;
    209         mActivity = activity;
    210 
    211         context.registerReceiver(new RebootReceiver(),
    212                 new IntentFilter(REBOOT_ACTION));
    213         mRebootIntent = PendingIntent.getBroadcast(context,
    214                 0, new Intent(REBOOT_ACTION), 0);
    215 
    216         context.registerReceiver(new RebootRequestReceiver(),
    217                 new IntentFilter(Intent.ACTION_REBOOT),
    218                 android.Manifest.permission.REBOOT, null);
    219 
    220         mBootTime = System.currentTimeMillis();
    221     }
    222 
    223     public void processStarted(String name, int pid) {
    224         synchronized (this) {
    225             if ("com.android.phone".equals(name)) {
    226                 mPhonePid = pid;
    227             }
    228         }
    229     }
    230 
    231     public void setActivityController(IActivityController controller) {
    232         synchronized (this) {
    233             mController = controller;
    234         }
    235     }
    236 
    237     public void setAllowRestart(boolean allowRestart) {
    238         synchronized (this) {
    239             mAllowRestart = allowRestart;
    240         }
    241     }
    242 
    243     public void addMonitor(Monitor monitor) {
    244         synchronized (this) {
    245             if (isAlive()) {
    246                 throw new RuntimeException("Monitors can't be added while the Watchdog is running");
    247             }
    248             mMonitors.add(monitor);
    249         }
    250     }
    251 
    252     void checkReboot(boolean fromAlarm) {
    253         int rebootInterval = mReqRebootInterval >= 0 ? mReqRebootInterval
    254                 : REBOOT_DEFAULT_INTERVAL;
    255         mRebootInterval = rebootInterval;
    256         if (rebootInterval <= 0) {
    257             // No reboot interval requested.
    258             if (localLOGV) Slog.v(TAG, "No need to schedule a reboot alarm!");
    259             mAlarm.remove(mRebootIntent);
    260             return;
    261         }
    262 
    263         long rebootStartTime = mReqRebootStartTime >= 0 ? mReqRebootStartTime
    264                 : REBOOT_DEFAULT_START_TIME;
    265         long rebootWindowMillis = (mReqRebootWindow >= 0 ? mReqRebootWindow
    266                 : REBOOT_DEFAULT_WINDOW) * 1000;
    267         long recheckInterval = (mReqRecheckInterval >= 0 ? mReqRecheckInterval
    268                 : MEMCHECK_DEFAULT_RECHECK_INTERVAL) * 1000;
    269 
    270         retrieveBrutalityAmount();
    271 
    272         long realStartTime;
    273         long now;
    274 
    275         synchronized (this) {
    276             now = System.currentTimeMillis();
    277             realStartTime = computeCalendarTime(mCalendar, now,
    278                     rebootStartTime);
    279 
    280             long rebootIntervalMillis = rebootInterval*24*60*60*1000;
    281             if (DB || mReqRebootNoWait ||
    282                     (now-mBootTime) >= (rebootIntervalMillis-rebootWindowMillis)) {
    283                 if (fromAlarm && rebootWindowMillis <= 0) {
    284                     // No reboot window -- just immediately reboot.
    285                     EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
    286                             (int)rebootIntervalMillis, (int)rebootStartTime*1000,
    287                             (int)rebootWindowMillis, "");
    288                     rebootSystem("Checkin scheduled forced");
    289                     return;
    290                 }
    291 
    292                 // Are we within the reboot window?
    293                 if (now < realStartTime) {
    294                     // Schedule alarm for next check interval.
    295                     realStartTime = computeCalendarTime(mCalendar,
    296                             now, rebootStartTime);
    297                 } else if (now < (realStartTime+rebootWindowMillis)) {
    298                     String doit = shouldWeBeBrutalLocked(now);
    299                     EventLog.writeEvent(EventLogTags.WATCHDOG_SCHEDULED_REBOOT, now,
    300                             (int)rebootInterval, (int)rebootStartTime*1000,
    301                             (int)rebootWindowMillis, doit != null ? doit : "");
    302                     if (doit == null) {
    303                         rebootSystem("Checked scheduled range");
    304                         return;
    305                     }
    306 
    307                     // Schedule next alarm either within the window or in the
    308                     // next interval.
    309                     if ((now+recheckInterval) >= (realStartTime+rebootWindowMillis)) {
    310                         realStartTime = computeCalendarTime(mCalendar,
    311                                 now + rebootIntervalMillis, rebootStartTime);
    312                     } else {
    313                         realStartTime = now + recheckInterval;
    314                     }
    315                 } else {
    316                     // Schedule alarm for next check interval.
    317                     realStartTime = computeCalendarTime(mCalendar,
    318                             now + rebootIntervalMillis, rebootStartTime);
    319                 }
    320             }
    321         }
    322 
    323         if (localLOGV) Slog.v(TAG, "Scheduling next reboot alarm for "
    324                 + ((realStartTime-now)/1000/60) + "m from now");
    325         mAlarm.remove(mRebootIntent);
    326         mAlarm.set(AlarmManager.RTC_WAKEUP, realStartTime, mRebootIntent);
    327     }
    328 
    329     /**
    330      * Perform a full reboot of the system.
    331      */
    332     void rebootSystem(String reason) {
    333         Slog.i(TAG, "Rebooting system because: " + reason);
    334         PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
    335         pms.reboot(false, reason, false);
    336     }
    337 
    338     /**
    339      * Load the current Gservices settings for when
    340      * {@link #shouldWeBeBrutalLocked} will allow the brutality to happen.
    341      * Must not be called with the lock held.
    342      */
    343     void retrieveBrutalityAmount() {
    344         mMinScreenOff = (mReqMinScreenOff >= 0 ? mReqMinScreenOff
    345                 : MEMCHECK_DEFAULT_MIN_SCREEN_OFF) * 1000;
    346         mMinAlarm = (mReqMinNextAlarm >= 0 ? mReqMinNextAlarm
    347                 : MEMCHECK_DEFAULT_MIN_ALARM) * 1000;
    348     }
    349 
    350     /**
    351      * Determine whether it is a good time to kill, crash, or otherwise
    352      * plunder the current situation for the overall long-term benefit of
    353      * the world.
    354      *
    355      * @param curTime The current system time.
    356      * @return Returns null if this is a good time, else a String with the
    357      * text of why it is not a good time.
    358      */
    359     String shouldWeBeBrutalLocked(long curTime) {
    360         if (mBattery == null || !mBattery.isPowered(BatteryManager.BATTERY_PLUGGED_ANY)) {
    361             return "battery";
    362         }
    363 
    364         if (mMinScreenOff >= 0 && (mPower == null ||
    365                 mPower.timeSinceScreenWasLastOn() < mMinScreenOff)) {
    366             return "screen";
    367         }
    368 
    369         if (mMinAlarm >= 0 && (mAlarm == null ||
    370                 mAlarm.timeToNextAlarm() < mMinAlarm)) {
    371             return "alarm";
    372         }
    373 
    374         return null;
    375     }
    376 
    377     static long computeCalendarTime(Calendar c, long curTime,
    378             long secondsSinceMidnight) {
    379 
    380         // start with now
    381         c.setTimeInMillis(curTime);
    382 
    383         int val = (int)secondsSinceMidnight / (60*60);
    384         c.set(Calendar.HOUR_OF_DAY, val);
    385         secondsSinceMidnight -= val * (60*60);
    386         val = (int)secondsSinceMidnight / 60;
    387         c.set(Calendar.MINUTE, val);
    388         c.set(Calendar.SECOND, (int)secondsSinceMidnight - (val*60));
    389         c.set(Calendar.MILLISECOND, 0);
    390 
    391         long newTime = c.getTimeInMillis();
    392         if (newTime < curTime) {
    393             // The given time (in seconds since midnight) has already passed for today, so advance
    394             // by one day (due to daylight savings, etc., the delta may differ from 24 hours).
    395             c.add(Calendar.DAY_OF_MONTH, 1);
    396             newTime = c.getTimeInMillis();
    397         }
    398 
    399         return newTime;
    400     }
    401 
    402     @Override
    403     public void run() {
    404         boolean waitedHalf = false;
    405         while (true) {
    406             mCompleted = false;
    407             mHandler.sendEmptyMessage(MONITOR);
    408 
    409 
    410             final String name;
    411             final boolean allowRestart;
    412             synchronized (this) {
    413                 long timeout = TIME_TO_WAIT;
    414 
    415                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
    416                 // wait while asleep. If the device is asleep then the thing that we are waiting
    417                 // to timeout on is asleep as well and won't have a chance to run, causing a false
    418                 // positive on when to kill things.
    419                 long start = SystemClock.uptimeMillis();
    420                 while (timeout > 0) {
    421                     try {
    422                         wait(timeout);
    423                     } catch (InterruptedException e) {
    424                         Log.wtf(TAG, e);
    425                     }
    426                     timeout = TIME_TO_WAIT - (SystemClock.uptimeMillis() - start);
    427                 }
    428 
    429                 if (mCompleted) {
    430                     // The monitors have returned.
    431                     waitedHalf = false;
    432                     continue;
    433                 }
    434 
    435                 if (!waitedHalf) {
    436                     // We've waited half the deadlock-detection interval.  Pull a stack
    437                     // trace and wait another half.
    438                     ArrayList<Integer> pids = new ArrayList<Integer>();
    439                     pids.add(Process.myPid());
    440                     ActivityManagerService.dumpStackTraces(true, pids, null, null,
    441                             NATIVE_STACKS_OF_INTEREST);
    442                     waitedHalf = true;
    443                     continue;
    444                 }
    445 
    446                 name = (mCurrentMonitor != null) ?
    447                     mCurrentMonitor.getClass().getName() : "null";
    448                 allowRestart = mAllowRestart;
    449             }
    450 
    451             // If we got here, that means that the system is most likely hung.
    452             // First collect stack traces from all threads of the system process.
    453             // Then kill this process so that the system will restart.
    454             EventLog.writeEvent(EventLogTags.WATCHDOG, name);
    455 
    456             ArrayList<Integer> pids = new ArrayList<Integer>();
    457             pids.add(Process.myPid());
    458             if (mPhonePid > 0) pids.add(mPhonePid);
    459             // Pass !waitedHalf so that just in case we somehow wind up here without having
    460             // dumped the halfway stacks, we properly re-initialize the trace file.
    461             final File stack = ActivityManagerService.dumpStackTraces(
    462                     !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
    463 
    464             // Give some extra time to make sure the stack traces get written.
    465             // The system's been hanging for a minute, another second or two won't hurt much.
    466             SystemClock.sleep(2000);
    467 
    468             // Pull our own kernel thread stacks as well if we're configured for that
    469             if (RECORD_KERNEL_THREADS) {
    470                 dumpKernelStackTraces();
    471             }
    472 
    473             // Trigger the kernel to dump all blocked threads to the kernel log
    474             try {
    475                 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
    476                 sysrq_trigger.write("w");
    477                 sysrq_trigger.close();
    478             } catch (IOException e) {
    479                 Slog.e(TAG, "Failed to write to /proc/sysrq-trigger");
    480                 Slog.e(TAG, e.getMessage());
    481             }
    482 
    483             // Try to add the error to the dropbox, but assuming that the ActivityManager
    484             // itself may be deadlocked.  (which has happened, causing this statement to
    485             // deadlock and the watchdog as a whole to be ineffective)
    486             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
    487                     public void run() {
    488                         mActivity.addErrorToDropBox(
    489                                 "watchdog", null, "system_server", null, null,
    490                                 name, null, stack, null);
    491                     }
    492                 };
    493             dropboxThread.start();
    494             try {
    495                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
    496             } catch (InterruptedException ignored) {}
    497 
    498             IActivityController controller;
    499             synchronized (this) {
    500                 controller = mController;
    501             }
    502             if (controller != null) {
    503                 Slog.i(TAG, "Reporting stuck state to activity controller");
    504                 try {
    505                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
    506                     // 1 = keep waiting, -1 = kill system
    507                     int res = controller.systemNotResponding(name);
    508                     if (res >= 0) {
    509                         Slog.i(TAG, "Activity controller requested to coninue to wait");
    510                         waitedHalf = false;
    511                         continue;
    512                     }
    513                 } catch (RemoteException e) {
    514                 }
    515             }
    516 
    517             // Only kill the process if the debugger is not attached.
    518             if (Debug.isDebuggerConnected()) {
    519                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
    520             } else if (!allowRestart) {
    521                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
    522             } else {
    523                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + name);
    524                 Process.killProcess(Process.myPid());
    525                 System.exit(10);
    526             }
    527 
    528             waitedHalf = false;
    529         }
    530     }
    531 
    532     private File dumpKernelStackTraces() {
    533         String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
    534         if (tracesPath == null || tracesPath.length() == 0) {
    535             return null;
    536         }
    537 
    538         native_dumpKernelStacks(tracesPath);
    539         return new File(tracesPath);
    540     }
    541 
    542     private native void native_dumpKernelStacks(String tracesPath);
    543 }
    544