Home | History | Annotate | Download | only in ANRdaemon
      1 /*
      2  * Copyright (c) 2015, The Android Open Source Project
      3  * All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  *   * Redistributions of source code must retain the above copyright
      9  *     notice, this list of conditions and the following disclaimer.
     10  *   * Redistributions in binary form must reproduce the above copyright
     11  *     notice, this list of conditions and the following disclaimer
     12  *     in the documentation and/or other materials provided with the
     13  *     distribution.
     14  *   * Neither the name of Google, Inc. nor the names of its contributors
     15  *     may be used to endorse or promote products derived from this
     16  *     software without specific prior written permission.
     17  *
     18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
     21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
     22  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
     24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
     25  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
     26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
     28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     29  * SUCH DAMAGE.
     30  */
     31 
     32 #include <binder/IBinder.h>
     33 #include <binder/IServiceManager.h>
     34 #include <binder/Parcel.h>
     35 
     36 #include <ctime>
     37 #include <cutils/properties.h>
     38 #include <signal.h>
     39 #include <stdbool.h>
     40 #include <stdio.h>
     41 #include <stdlib.h>
     42 #include <string.h>
     43 
     44 #include <sys/resource.h>
     45 #include <sys/stat.h>
     46 #include <sys/time.h>
     47 #include <sys/types.h>
     48 #include <unistd.h>
     49 
     50 #include <utils/Log.h>
     51 #include <utils/String8.h>
     52 #include <utils/Trace.h>
     53 #include <zlib.h>
     54 
     55 using namespace android;
     56 
     57 #ifdef LOG_TAG
     58 #undef LOG_TAG
     59 #endif
     60 
     61 #define LOG_TAG "anrdaemon"
     62 
     63 static const int check_period = 1;              // in sec
     64 static const int tracing_check_period = 500000; // in micro sec
     65 static const int cpu_stat_entries = 7;          // number of cpu stat entries
     66 static const int min_buffer_size = 16;
     67 static const int max_buffer_size = 2048;
     68 static const char *min_buffer_size_str = "16";
     69 static const char *max_buffer_size_str = "2048";
     70 static const int time_buf_size = 20;
     71 static const int path_buf_size = 60;
     72 
     73 typedef struct cpu_stat {
     74     unsigned long utime, ntime, stime, itime;
     75     unsigned long iowtime, irqtime, sirqtime, steal;
     76     unsigned long total;
     77 } cpu_stat_t;
     78 
     79 /*
     80  * Logging on/off threshold.
     81  * Uint: 0.01%; default to 99.90% cpu.
     82  */
     83 static int idle_threshold = 10;
     84 
     85 static bool quit = false;
     86 static bool suspend= false;
     87 static bool dump_requested = false;
     88 static bool err = false;
     89 static char err_msg[100];
     90 static bool tracing = false;
     91 
     92 static const char *buf_size_kb = "2048";
     93 static const char *apps = "";
     94 static uint64_t tag = 0;
     95 
     96 static cpu_stat_t new_cpu;
     97 static cpu_stat_t old_cpu;
     98 
     99 /* Log certain kernel activity when enabled */
    100 static bool log_sched = false;
    101 static bool log_stack = false;
    102 static bool log_irq   = false;
    103 static bool log_sync  = false;
    104 static bool log_workq = false;
    105 
    106 /* Paths for debugfs controls*/
    107 static const char* dfs_trace_output_path =
    108     "/d/tracing/trace";
    109 static const char* dfs_irq_path =
    110     "/d/tracing/events/irq/enable";
    111 static const char* dfs_sync_path =
    112     "/d/tracing/events/sync/enable";
    113 static const char* dfs_workq_path =
    114     "/d/tracing/events/workqueue/enable";
    115 static const char* dfs_stack_path =
    116     "/d/tracing/options/stacktrace";
    117 static const char* dfs_sched_switch_path =
    118     "/d/tracing/events/sched/sched_switch/enable";
    119 static const char* dfs_sched_wakeup_path =
    120     "/d/tracing/events/sched/sched_wakeup/enable";
    121 static const char* dfs_control_path =
    122     "/d/tracing/tracing_on";
    123 static const char* dfs_buffer_size_path =
    124     "/d/tracing/buffer_size_kb";
    125 static const char* dfs_tags_property = "debug.atrace.tags.enableflags";
    126 static const char* dfs_apps_property = "debug.atrace.app_cmdlines";
    127 
    128 /*
    129  * Read accumulated cpu data from /proc/stat
    130  */
    131 static void get_cpu_stat(cpu_stat_t *cpu) {
    132     FILE *fp = NULL;
    133     const char *params = "cpu  %lu %lu %lu %lu %lu %lu %lu %*d %*d %*d\n";
    134 
    135     if ((fp = fopen("/proc/stat", "r")) == NULL) {
    136         err = true;
    137         snprintf(err_msg, sizeof(err_msg), "can't read from /proc/stat with errno %d", errno);
    138     } else {
    139         if (fscanf(fp, params, &cpu->utime, &cpu->ntime,
    140                 &cpu->stime, &cpu->itime, &cpu->iowtime, &cpu->irqtime,
    141                 &cpu->sirqtime) != cpu_stat_entries) {
    142             /*
    143              * If failed in getting status, new_cpu won't be updated and
    144              * is_heavy_loaded() will return false.
    145              */
    146             ALOGE("Error in getting cpu status. Skipping this check.");
    147             fclose(fp);
    148             return;
    149         }
    150 
    151         cpu->total = cpu->utime + cpu->ntime + cpu->stime + cpu->itime
    152             + cpu->iowtime + cpu->irqtime + cpu->sirqtime;
    153 
    154         fclose(fp);
    155     }
    156 }
    157 
    158 /*
    159  * Calculate cpu usage in the past interval.
    160  * If tracing is on, increase the idle threshold by 1.00% so that we do not
    161  * turn on and off tracing frequently when the cpu load is right close to
    162  * threshold.
    163  */
    164 static bool is_heavy_load(void) {
    165     unsigned long diff_idle, diff_total;
    166     int threshold = idle_threshold + (tracing?100:0);
    167     get_cpu_stat(&new_cpu);
    168     diff_idle = new_cpu.itime - old_cpu.itime;
    169     diff_total = new_cpu.total - old_cpu.total;
    170     old_cpu = new_cpu;
    171     return (diff_idle * 10000 < diff_total * threshold);
    172 }
    173 
    174 /*
    175  * Force the userland processes to refresh their property for logging.
    176  */
    177 static void dfs_poke_binder(void) {
    178     sp<IServiceManager> sm = defaultServiceManager();
    179     Vector<String16> services = sm->listServices();
    180     for (size_t i = 0; i < services.size(); i++) {
    181         sp<IBinder> obj = sm->checkService(services[i]);
    182         if (obj != NULL) {
    183             Parcel data;
    184             obj->transact(IBinder::SYSPROPS_TRANSACTION, data, NULL, 0);
    185         }
    186     }
    187 }
    188 
    189 /*
    190  * Enable/disable a debugfs property by writing 0/1 to its path.
    191  */
    192 static int dfs_enable(bool enable, const char* path) {
    193     int fd = open(path, O_WRONLY);
    194     if (fd == -1) {
    195         err = true;
    196         snprintf(err_msg, sizeof(err_msg), "Can't open %s. Error: %d", path, errno);
    197         return -1;
    198     }
    199     const char* control = (enable?"1":"0");
    200     ssize_t len = strlen(control);
    201     int max_try = 10; // Fail if write was interrupted for 10 times
    202     while (write(fd, control, len) != len) {
    203         if (errno == EINTR && max_try-- > 0) {
    204             usleep(100);
    205             continue;
    206         }
    207 
    208         err = true;
    209         snprintf(err_msg, sizeof(err_msg), "Error %d in writing to %s.", errno, path);
    210     }
    211     close(fd);
    212     return (err?-1:0);
    213 }
    214 
    215 /*
    216  * Set the userland tracing properties.
    217  */
    218 static void dfs_set_property(uint64_t mtag, const char* mapp, bool enable) {
    219     char buf[64];
    220     snprintf(buf, sizeof(buf), "%#" PRIx64, mtag);
    221     if (property_set(dfs_tags_property, buf) < 0) {
    222         err = true;
    223         snprintf(err_msg, sizeof(err_msg), "Failed to set debug tags system properties.");
    224     }
    225 
    226     if (strlen(mapp) > 0
    227             && property_set(dfs_apps_property, mapp) < 0) {
    228         err = true;
    229         snprintf(err_msg, sizeof(err_msg), "Failed to set debug applications.");
    230     }
    231 
    232     if (log_sched) {
    233         dfs_enable(enable, dfs_sched_switch_path);
    234         dfs_enable(enable, dfs_sched_wakeup_path);
    235     }
    236     if (log_stack) {
    237         dfs_enable(enable, dfs_stack_path);
    238     }
    239     if (log_irq) {
    240         dfs_enable(enable, dfs_irq_path);
    241     }
    242     if (log_sync) {
    243         dfs_enable(enable, dfs_sync_path);
    244     }
    245     if (log_workq) {
    246         dfs_enable(enable, dfs_workq_path);
    247     }
    248 }
    249 
    250 /*
    251  * Dump the log in a compressed format for systrace to visualize.
    252  * Create a dump file "dump_of_anrdaemon.<current_time>" under /data/misc/anrd
    253  */
    254 static void dump_trace()
    255 {
    256     time_t now = time(0);
    257     struct tm  tstruct;
    258     char time_buf[time_buf_size];
    259     char path_buf[path_buf_size];
    260     const char* header = " done\nTRACE:\n";
    261     ssize_t header_len = strlen(header);
    262 
    263     ALOGI("Started to dump ANRdaemon trace.");
    264 
    265     tstruct = *localtime(&now);
    266     strftime(time_buf, time_buf_size, "%Y-%m-%d.%X", &tstruct);
    267     snprintf(path_buf, path_buf_size, "/data/misc/anrd/dump_of_anrdaemon.%s", time_buf);
    268     int output_fd = creat(path_buf, S_IRWXU);
    269     if (output_fd == -1) {
    270         ALOGE("Failed to create %s. Dump aborted.", path_buf);
    271         return;
    272     }
    273 
    274     if (write(output_fd, header, strlen(header)) != header_len) {
    275         ALOGE("Failed to write the header.");
    276         close(output_fd);
    277         return;
    278     }
    279 
    280     int trace_fd = open(dfs_trace_output_path, O_RDWR);
    281     if (trace_fd == -1) {
    282         ALOGE("Failed to open %s. Dump aborted.", dfs_trace_output_path);
    283         close(output_fd);
    284         return;
    285     }
    286 
    287     z_stream zs;
    288     uint8_t *in, *out;
    289     int result, flush;
    290 
    291     memset(&zs, 0, sizeof(zs));
    292     result = deflateInit(&zs, Z_DEFAULT_COMPRESSION);
    293     if (result != Z_OK) {
    294         ALOGE("error initializing zlib: %d\n", result);
    295         close(trace_fd);
    296         close(output_fd);
    297         return;
    298     }
    299 
    300     const size_t bufSize = 64*1024;
    301     in = (uint8_t*)malloc(bufSize);
    302     out = (uint8_t*)malloc(bufSize);
    303     flush = Z_NO_FLUSH;
    304 
    305     zs.next_out = out;
    306     zs.avail_out = bufSize;
    307 
    308     do {
    309         if (zs.avail_in == 0) {
    310             result = read(trace_fd, in, bufSize);
    311             if (result < 0) {
    312                 ALOGE("error reading trace: %s", strerror(errno));
    313                 result = Z_STREAM_END;
    314                 break;
    315             } else if (result == 0) {
    316                 flush = Z_FINISH;
    317             } else {
    318                 zs.next_in = in;
    319                 zs.avail_in = result;
    320             }
    321         }
    322 
    323         if (zs.avail_out == 0) {
    324             result = write(output_fd, out, bufSize);
    325             if ((size_t)result < bufSize) {
    326                 ALOGE("error writing deflated trace: %s", strerror(errno));
    327                 result = Z_STREAM_END;
    328                 zs.avail_out = bufSize;
    329                 break;
    330             }
    331             zs.next_out = out;
    332             zs.avail_out = bufSize;
    333         }
    334 
    335     } while ((result = deflate(&zs, flush)) == Z_OK);
    336 
    337     if (result != Z_STREAM_END) {
    338         ALOGE("error deflating trace: %s\n", zs.msg);
    339     }
    340 
    341     if (zs.avail_out < bufSize) {
    342         size_t bytes = bufSize - zs.avail_out;
    343         result = write(output_fd, out, bytes);
    344         if ((size_t)result < bytes) {
    345             ALOGE("error writing deflated trace: %s", strerror(errno));
    346         }
    347     }
    348 
    349     result = deflateEnd(&zs);
    350     if (result != Z_OK) {
    351         ALOGE("error cleaning up zlib: %d\n", result);
    352     }
    353 
    354     free(in);
    355     free(out);
    356 
    357     close(trace_fd);
    358     close(output_fd);
    359 
    360     ALOGI("Finished dump. Output file stored at: %s", path_buf);
    361 }
    362 
    363 /*
    364  * Start logging when cpu usage is high. Meanwhile, moniter the cpu usage and
    365  * stop logging when it drops down.
    366  */
    367 static void start_tracing(void) {
    368     ALOGD("High cpu usage, start logging.");
    369 
    370     if (dfs_enable(true, dfs_control_path) != 0) {
    371         ALOGE("Failed to start tracing.");
    372         return;
    373     }
    374     tracing = true;
    375 
    376     /* Stop logging when cpu usage drops or the daemon is suspended.*/
    377     do {
    378         usleep(tracing_check_period);
    379     } while (!suspend && !dump_requested && is_heavy_load());
    380 
    381     if (dfs_enable(false, dfs_control_path) != 0) {
    382         ALOGE("Failed to stop tracing.");
    383         err = true;
    384         return;
    385     }
    386     tracing = false;
    387 
    388     if (suspend) {
    389         ALOGI("trace stopped due to suspend. Send SIGCONT to resume.");
    390     } else if (dump_requested) {
    391         ALOGI("trace stopped due to dump request.");
    392         dump_trace();
    393         dump_requested = false;
    394     } else {
    395         ALOGD("Usage back to low, stop logging.");
    396     }
    397 }
    398 
    399 /*
    400  * Set the tracing log buffer size.
    401  * Note the actual buffer size will be buf_size_kb * number of cores.
    402  */
    403 static int set_tracing_buffer_size(void) {
    404     int fd = open(dfs_buffer_size_path, O_WRONLY);
    405     if (fd == -1) {
    406         err = true;
    407         snprintf(err_msg, sizeof(err_msg), "Can't open atrace buffer size file under /d/tracing.");
    408         return -1;
    409     }
    410     ssize_t len = strlen(buf_size_kb);
    411     if (write(fd, buf_size_kb, len) != len) {
    412         err = true;
    413         snprintf(err_msg, sizeof(err_msg), "Error in writing to atrace buffer size file.");
    414     }
    415     close(fd);
    416     return (err?-1:0);
    417 
    418 }
    419 
    420 /*
    421  * Main loop to moniter the cpu usage and decided whether to start logging.
    422  */
    423 static void start(void) {
    424     if ((set_tracing_buffer_size()) != 0)
    425         return;
    426 
    427     dfs_set_property(tag, apps, true);
    428     dfs_poke_binder();
    429 
    430     get_cpu_stat(&old_cpu);
    431     sleep(check_period);
    432 
    433     while (!quit && !err) {
    434         if (!suspend && is_heavy_load()) {
    435             /*
    436              * Increase process priority to make sure we can stop logging when
    437              * necessary and do not overwrite the buffer
    438              */
    439             setpriority(PRIO_PROCESS, 0, -20);
    440             start_tracing();
    441             setpriority(PRIO_PROCESS, 0, 0);
    442         }
    443         sleep(check_period);
    444     }
    445     return;
    446 }
    447 
    448 /*
    449  * If trace is not running, dump trace right away.
    450  * If trace is running, request to dump trace.
    451  */
    452 static void request_dump_trace()
    453 {
    454     if (!tracing) {
    455         dump_trace();
    456     } else if (!dump_requested) {
    457         dump_requested = true;
    458     }
    459 }
    460 
    461 static void handle_signal(int signo)
    462 {
    463     switch (signo) {
    464         case SIGQUIT:
    465             suspend = true;
    466             quit = true;
    467             break;
    468         case SIGSTOP:
    469             suspend = true;
    470             break;
    471         case SIGCONT:
    472             suspend = false;
    473             break;
    474         case SIGUSR1:
    475             request_dump_trace();
    476     }
    477 }
    478 
    479 /*
    480  * Set the signal handler:
    481  * SIGQUIT: Reset debugfs and tracing property and terminate the daemon.
    482  * SIGSTOP: Stop logging and suspend the daemon.
    483  * SIGCONT: Resume the daemon as normal.
    484  * SIGUSR1: Dump the logging to a compressed format for systrace to visualize.
    485  */
    486 static void register_sighandler(void)
    487 {
    488     struct sigaction sa;
    489     sigset_t block_mask;
    490 
    491     sigemptyset(&block_mask);
    492     sigaddset (&block_mask, SIGQUIT);
    493     sigaddset (&block_mask, SIGSTOP);
    494     sigaddset (&block_mask, SIGCONT);
    495     sigaddset (&block_mask, SIGUSR1);
    496 
    497     sa.sa_flags = 0;
    498     sa.sa_mask = block_mask;
    499     sa.sa_handler = handle_signal;
    500     sigaction(SIGQUIT, &sa, NULL);
    501     sigaction(SIGSTOP, &sa, NULL);
    502     sigaction(SIGCONT, &sa, NULL);
    503     sigaction(SIGUSR1, &sa, NULL);
    504 }
    505 
    506 static void show_help(void) {
    507 
    508     fprintf(stderr, "usage: ANRdaemon [options] [categoris...]\n");
    509     fprintf(stdout, "Options includes:\n"
    510                     "   -a appname  enable app-level tracing for a comma "
    511                        "separated list of cmdlines\n"
    512                     "   -t N        cpu threshold for logging to start "
    513                         "(uint = 0.01%%, min = 5000, max = 9999, default = 9990)\n"
    514                     "   -s N        use a trace buffer size of N KB "
    515                         "default to 2048KB\n"
    516                     "   -h          show helps\n");
    517     fprintf(stdout, "Categoris includes:\n"
    518                     "   am         - activity manager\n"
    519                     "   sm         - sync manager\n"
    520                     "   input      - input\n"
    521                     "   dalvik     - dalvik VM\n"
    522                     "   audio      - Audio\n"
    523                     "   gfx        - Graphics\n"
    524                     "   rs         - RenderScript\n"
    525                     "   hal        - Hardware Modules\n"
    526                     "   irq        - kernel irq events\n"
    527                     "   sched      - kernel scheduler activity\n"
    528                     "   stack      - kernel stack\n"
    529                     "   sync       - kernel sync activity\n"
    530                     "   workq      - kernel work queues\n");
    531     fprintf(stdout, "Control includes:\n"
    532                     "   SIGQUIT: terminate the process\n"
    533                     "   SIGSTOP: suspend all function of the daemon\n"
    534                     "   SIGCONT: resume the normal function\n"
    535                     "   SIGUSR1: dump the current logging in a compressed form\n");
    536     exit(0);
    537 }
    538 
    539 static int get_options(int argc, char *argv[]) {
    540     int opt = 0;
    541     int threshold;
    542     while ((opt = getopt(argc, argv, "a:s:t:h")) >= 0) {
    543         switch(opt) {
    544             case 'a':
    545                 apps = optarg;
    546                 break;
    547             case 's':
    548                 if (atoi(optarg) > max_buffer_size)
    549                     buf_size_kb = max_buffer_size_str;
    550                 else if (atoi(optarg) < min_buffer_size)
    551                     buf_size_kb = min_buffer_size_str;
    552                 else
    553                     buf_size_kb = optarg;
    554                 break;
    555             case 't':
    556                 threshold = atoi(optarg);
    557                 if (threshold > 9999 || threshold < 5000) {
    558                     fprintf(stderr, "logging threshold should be 5000-9999\n");
    559                     return 1;
    560                 }
    561                 idle_threshold = 10000 - threshold;
    562                 break;
    563             case 'h':
    564                 show_help();
    565                 break;
    566             default:
    567                 fprintf(stderr, "Error in getting options.\n"
    568                         "run \"%s -h\" for usage.\n", argv[0]);
    569                 return 1;
    570         }
    571     }
    572 
    573     for (int i = optind; i < argc; i++) {
    574         if (strcmp(argv[i], "am") == 0) {
    575             tag |= ATRACE_TAG_ACTIVITY_MANAGER;
    576         } else if (strcmp(argv[i], "input") == 0) {
    577             tag |= ATRACE_TAG_INPUT;
    578         } else if (strcmp(argv[i], "sm") == 0) {
    579             tag |= ATRACE_TAG_SYNC_MANAGER;
    580         } else if (strcmp(argv[i], "dalvik") == 0) {
    581             tag |= ATRACE_TAG_DALVIK;
    582         } else if (strcmp(argv[i], "gfx") == 0) {
    583             tag |= ATRACE_TAG_GRAPHICS;
    584         } else if (strcmp(argv[i], "audio") == 0) {
    585             tag |= ATRACE_TAG_AUDIO;
    586         } else if (strcmp(argv[i], "hal") == 0) {
    587             tag |= ATRACE_TAG_HAL;
    588         } else if (strcmp(argv[i], "rs") == 0) {
    589             tag |= ATRACE_TAG_RS;
    590         } else if (strcmp(argv[i], "sched") == 0) {
    591             log_sched = true;
    592         } else if (strcmp(argv[i], "stack") == 0) {
    593             log_stack = true;
    594         } else if (strcmp(argv[i], "workq") == 0) {
    595             log_workq = true;
    596         } else if (strcmp(argv[i], "irq") == 0) {
    597             log_irq = true;
    598         } else if (strcmp(argv[i], "sync") == 0) {
    599             log_sync = true;
    600         } else {
    601             fprintf(stderr, "invalid category: %s\n"
    602                     "run \"%s -h\" for usage.\n", argv[i], argv[0]);
    603             return 1;
    604         }
    605     }
    606 
    607     /* If nothing is enabled, don't run */
    608     if (!tag && !log_sched && !log_stack && !log_workq && !log_irq && !log_sync) {
    609         ALOGE("Specify at least one category to trace.");
    610         return 1;
    611     }
    612 
    613     return 0;
    614 }
    615 
    616 int main(int argc, char *argv[])
    617 {
    618     if(get_options(argc, argv) != 0)
    619         return 1;
    620 
    621     if (daemon(0, 0) != 0)
    622         return 1;
    623 
    624     register_sighandler();
    625 
    626     /* Clear any the trace log file by overwrite it with a new file */
    627     int fd = creat(dfs_trace_output_path, 0);
    628     if (fd == -1) {
    629         ALOGE("Faield to open and cleaup previous log");
    630         return 1;
    631     }
    632     close(fd);
    633 
    634     ALOGI("ANRdaemon starting");
    635     start();
    636 
    637     if (err)
    638         ALOGE("ANRdaemon stopped due to Error: %s", err_msg);
    639 
    640     ALOGI("ANRdaemon terminated.");
    641 
    642     return (err?1:0);
    643 }
    644