#!/usr/bin/env python3
#
# Copyright 2018, The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Perform statistical analysis on measurements produced by app_startup_runner.py

Install:
$> sudo apt-get install python3-scipy

Usage:
$> ./analyze_metrics.py <filename.csv> [<filename2.csv> ...]
$> ./analyze_metrics.py --help
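
Example (hypothetical file names; see parse_options below for all flags):
$> ./analyze_metrics.py run1.csv run2.csv -d -os samples.csv -oc comparable.csv -pt 0.05 -dt 100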
"""

import argparse
import csv
import itertools
import os
import subprocess
import sys
import tempfile
from typing import Any, List, Dict, Iterable, TextIO, Tuple

from scipy import stats as sc
import numpy as np


# These CSV columns are considered labels. Everything after them in the same row is a metric.
_LABEL_COLUMNS = ['packages', 'readaheads', 'compiler_filters']
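
# The input CSV is therefore expected to look roughly like this (hypothetical
# metric names and values, for illustration only):
#
#   packages,readaheads,compiler_filters,TotalTime,Displayed
#   com.example.app,cold,speed-profile,812,820
#   com.example.app,cold,speed-profile,799,805
#   com.example.app,warm,speed-profile,645,650
#
# i.e. the first len(_LABEL_COLUMNS) columns are labels and every remaining
# column is an integer metric; rows starting with ';' are treated as comments.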
# The metric series with the 'cold' readahead is the baseline.
# All others (warm, jit, etc) are the potential improvements.

# FIXME: this should probably be an option
_BASELINE = ('readaheads', 'cold')
# Ignore this pair for some of the statistics calculations.
_IGNORE_PAIR = ('readaheads', 'warm')
_PLOT_SUBKEY = 'readaheads'
_PLOT_GROUPKEY = 'packages'
_PLOT_DATA_INDEX = 0
_DELTA = 50
_DELTA2 = 100
_PVALUE_THRESHOLD = 0.10
_debug = False  # See -d/--debug flag.

def parse_options(argv: List[str] = None):
  """Parse command line arguments and return an argparse Namespace object."""
  parser = argparse.ArgumentParser(description="Perform statistical analysis on measurements produced by app_startup_runner.py.")
  parser.add_argument('input_files', metavar='file.csv', nargs='+', help='CSV file produced by app_startup_runner.py')

  parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='Add extra debugging output')
  parser.add_argument('-os', '--output-samples', dest='output_samples', default='/dev/null', action='store', help='Output CSV of per-sample data')
  parser.add_argument('-oc', '--output-comparable', dest='output_comparable', default='/dev/null', action='store', help='Output CSV of comparisons against the baseline')
  parser.add_argument('-ocs', '--output-comparable-significant', dest='output_comparable_significant', default='/dev/null', action='store', help='Output CSV of comparisons against the baseline (significant only)')
  parser.add_argument('-pt', '--pvalue-threshold', dest='pvalue_threshold', type=float, default=_PVALUE_THRESHOLD, action='store')
  parser.add_argument('-dt', '--delta-threshold', dest='delta_threshold', type=int, default=_DELTA, action='store')

  return parser.parse_args(argv)

def _debug_print(*args, **kwargs):
  """Print the args to sys.stderr if the --debug/-d flag was passed in."""
  global _debug
  if _debug:
    print(*args, **kwargs, file=sys.stderr)

def _expand_gen_repr(args):
  new_args_list = []
  for i in args:
    # detect iterable objects that do not have their own override of __str__
    if hasattr(i, '__iter__'):
      to_str = getattr(i, '__str__')
      if to_str.__objclass__ == object:
        # the repr for a generator is just type+address, expand it out instead.
        new_args_list.append([_expand_gen_repr([j])[0] for j in i])
        continue
    # normal case: uses the built-in to-string
    new_args_list.append(i)
  return new_args_list

def _debug_print_gen(*args, **kwargs):
  """Like _debug_print but will turn any iterable args into a list."""
  if not _debug:
    return

  new_args_list = _expand_gen_repr(args)
  _debug_print(*new_args_list, **kwargs)

def read_headers(input_file: TextIO) -> Tuple[List[str], List[str]]:
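  """Split the CSV header row into (label_column_names, metric_column_names).

  A minimal sketch of the expected behaviour (the metric name is hypothetical):
  a header row ['packages', 'readaheads', 'compiler_filters', 'TotalTime'] is
  returned as (['packages', 'readaheads', 'compiler_filters'], ['TotalTime']).
  Returns (None, None) if no header row is present.
  """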
  _debug_print("read_headers for file: ", input_file.name)
  csv_reader = csv.reader(input_file)

  label_num_columns = len(_LABEL_COLUMNS)

  try:
    header = next(csv_reader)
  except StopIteration:
    header = None
  _debug_print('header', header)

  if not header:
    return (None, None)

  labels = header[0:label_num_columns]
  data = header[label_num_columns:]

  return (labels, data)

def read_labels_and_data(input_file: TextIO) -> Iterable[Tuple[List[str], List[int]]]:
  _debug_print("read_labels_and_data for file: ", input_file.name)
  csv_reader = csv.reader(input_file)

  # Skip the header row because it doesn't contain any data.
  # To get the header, see the read_headers function.
  try:
    header = next(csv_reader)
  except StopIteration:
    header = None

  label_num_columns = len(_LABEL_COLUMNS)

  for row in csv_reader:
    if len(row) > 0 and row[0][0] == ';':
      _debug_print("skip comment line", row)
      continue

    labels = row[0:label_num_columns]
    data = [int(i) for i in row[label_num_columns:]]

#    _debug_print("labels:", labels)
#    _debug_print("data:", data)

    yield (labels, data)

def group_metrics_by_label(it: Iterable[Tuple[List[str], List[int]]]):
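  """Group consecutive rows that share the same labels into one 2D data list.

  Sketch with hypothetical rows: the input
    (['app', 'cold', 'speed'], [100]), (['app', 'cold', 'speed'], [110]),
    (['app', 'warm', 'speed'], [50])
  yields
    (['app', 'cold', 'speed'], [[100], [110]]) and then (['app', 'warm', 'speed'], [[50]]).
  Rows with equal labels are assumed to be adjacent in the input.
  """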
  prev_labels = None
  data_2d = []

  for label_list, data_list in it:
    if prev_labels != label_list:
      if prev_labels:
#        _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
        yield (prev_labels, data_2d)
      data_2d = []

    data_2d.append(data_list)
    prev_labels = label_list

  if prev_labels:
#    _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
    yield (prev_labels, data_2d)

def data_to_numpy(it: Iterable[Tuple[List[str], List[List[int]]]]) -> Iterable[Tuple[List[str], Any]]:
  for label_list, data_2d in it:
    yield (label_list, np.asarray(data_2d, dtype=int))

def iterate_columns(np_data_2d):
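  """Yield each column of a 2D numpy array, e.g. [[a, b], [c, d]] yields [a, c] then [b, d]."""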
  for col in range(np_data_2d.shape[1]):
    col_as_array = np_data_2d[:, col]
    yield col_as_array

def confidence_interval(np_data_2d, percent=0.95):
  """
  Given some data [[a,b,c],[d,e,f],...]

  We assume each column (e.g. [a,d]) holds repeated samples of the same metric,
  and each row (e.g. [a,b,c]) holds the different metrics for one sample.

  We then calculate the CI for each metric individually, returning them as a list of tuples.
  """
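  # Example with hypothetical samples: for a single-metric column [100, 110, 105],
  # the normal-approximation 95% CI below works out to roughly (100.4, 109.6).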
  arr = []
  for col_2d in iterate_columns(np_data_2d):
    mean = col_2d.mean()
    sigma = col_2d.std()

    ci = sc.norm.interval(percent, loc=mean, scale=sigma / np.sqrt(len(col_2d)))
    arr.append(ci)

  # TODO: This seems to be returning NaN when all the samples have the same exact value
  # (e.g. stddev=0, which can trivially happen when sample count = 1).

  return arr

def print_analysis(it, label_header: List[str], data_header: List[str], output_samples: str):
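  """Print per-group summary statistics to stdout and write one CSV row per label group
  (mean, std, and a mean +/- SEM interval for the _PLOT_DATA_INDEX metric) to output_samples."""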
  print(label_header)

  with open(output_samples, "w") as output_file:

    csv_writer = csv.writer(output_file)
    csv_writer.writerow(label_header + ['mean', 'std', 'confidence_interval_a', 'confidence_interval_b'])

    for label_list, np_data_2d in it:
      print("**********************")
      print(label_list)
      print()
      print("      ", data_header)
      # aggregate computation column-wise
      print("Mean: ", np_data_2d.mean(axis=0))
      print("Std:  ", np_data_2d.std(axis=0))
      print("CI95%:", confidence_interval(np_data_2d))
      print("SEM:  ", stats_standard_error_one(np_data_2d, axis=0))

      #ci = confidence_interval(np_data_2d)[_PLOT_DATA_INDEX]
      sem = stats_standard_error_one(np_data_2d, axis=0)[_PLOT_DATA_INDEX]
      mean = np_data_2d.mean(axis=0)[_PLOT_DATA_INDEX]

      ci = (mean - sem, mean + sem)

      csv_writer.writerow(label_list + [mean, np_data_2d.std(axis=0)[_PLOT_DATA_INDEX], ci[0], ci[1]])

def from_file_group_by_labels(input_file):
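  """Read one CSV file and return (iterator of (labels, 2d numpy data), label_header, data_header)."""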
  (label_header, data_header) = read_headers(input_file)
  label_data_iter = read_labels_and_data(input_file)
  grouped_iter = group_metrics_by_label(label_data_iter)
  grouped_numpy_iter = data_to_numpy(grouped_iter)

  return grouped_numpy_iter, label_header, data_header

def list_without_index(lst, index):
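  """Return a copy of lst without the element at index, e.g. (['a', 'b', 'c'], 1) -> ['a', 'c']."""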
  return lst[:index] + lst[index+1:]

def group_by_without_baseline_key(grouped_numpy_iter, label_header):
  """
  Data is considered comparable if the only difference is the baseline key
  (i.e. the readahead is different but the package, compilation filter, etc, are the same).

  Returns an iterator grouped by the non-baseline labels; each item is itself
  an iterator of (label_list, data_2d) tuples.
  """
  baseline_index = label_header.index(_BASELINE[0])

  def get_label_without_baseline(tpl):
    label_list, _ = tpl
    return list_without_index(label_list, baseline_index)
  # [['pkgname', 'compfilter', 'warm'], [data]]
  # [['pkgname', 'compfilter', 'cold'], [data2]]
  # [['pkgname2', 'compfilter', 'warm'], [data3]]
  #
  #   ->
  # ( [['pkgname', 'compfilter', 'warm'], [data]]      # ignore baseline label change.
  #   [['pkgname', 'compfilter', 'cold'], [data2]] ),  # split here because the pkgname changed.
  # ( [['pkgname2', 'compfilter', 'warm'], [data3]] )
  for group_info, it in itertools.groupby(grouped_numpy_iter, key=get_label_without_baseline):
    yield it

  # TODO: replace this messy manual iteration/grouping with pandas

def iterate_comparable_metrics(without_baseline_iter, label_header):
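  """Pair the baseline data with every non-baseline group that shares the other labels.

  For each comparable group this yields an iterator of
  ((baseline_labels, baseline_data), (other_labels, other_data)) pairs,
  e.g. cold vs warm, cold vs jit (readahead values per the comments at the top).
  """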
  baseline_index = label_header.index(_BASELINE[0])
  baseline_value = _BASELINE[1]

  _debug_print("iterate comparables")

  def is_baseline_fun(tp):
    ll, dat = tp
    return ll[baseline_index] == baseline_value

  # iterating here when everything but the baseline key is the same.
  for it in without_baseline_iter:
    it1, it2 = itertools.tee(it)

    # find all the baseline data.
    baseline_filter_it = filter(is_baseline_fun, it1)

    # find non-baseline data.
    nonbaseline_filter_it = itertools.filterfalse(is_baseline_fun, it2)

    yield itertools.product(baseline_filter_it, nonbaseline_filter_it)

def stats_standard_error_one(a, axis):
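  """Standard error of the mean along `axis`: std(ddof=0) / sqrt(n)."""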
  a_std = a.std(axis=axis, ddof=0)
  a_len = a.shape[axis]

  return a_std / np.sqrt(a_len)

def stats_standard_error(a, b, axis):
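  """Standard error of the difference of two sample means:
  sqrt(s_a^2/n_a + s_b^2/n_b), using the population std (ddof=0).
  """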
  a_std = a.std(axis=axis, ddof=0)
  b_std = b.std(axis=axis, ddof=0)

  a_len = a.shape[axis]
  b_len = b.shape[axis]

  temp1 = a_std*a_std/a_len
  temp2 = b_std*b_std/b_len

  return np.sqrt(temp1 + temp2)

def stats_tvalue(a, b, axis, delta=0):
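  """t statistic for testing mean(a) - mean(b) against delta:
  (mean(a) - mean(b) - delta) / stats_standard_error(a, b, axis).
  """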
  a_mean = a.mean(axis=axis)
  b_mean = b.mean(axis=axis)

  return (a_mean - b_mean - delta) / stats_standard_error(a, b, axis)

def stats_pvalue(a, b, axis, delta, left: bool = False):
  """
  Single-tailed 2-sample t-test.

  Returns the p-value for testing whether mean(a) - mean(b) >= delta
  (a small p-value supports that hypothesis); with left=True the test
  is for mean(a) - mean(b) <= delta instead.
  :param a: numpy 2d array
  :param b: numpy 2d array
  :param axis: which axis to do the calculations across
  :param delta: test value of the mean difference
  :param left: if true then test <= delta instead of >= delta
  :return: p-value
  """
  # Implement our own p-value calculation because the built-in t-test (t, p values)
  # only offers delta=0, e.g. m1-m2 ? 0.
  # We, however, are interested in m1-m2 ? delta.
  t_value = stats_tvalue(a, b, axis, delta)

  # 2-sample degrees of freedom: the combined sample sizes minus 2.
  dof = a.shape[axis] + b.shape[axis] - 2

  if left:
    # left-tailed test, e.g. m1-m2 <= delta
    return sc.t.cdf(t_value, dof)
  else:
    # right-tailed test, e.g. m1-m2 >= delta
    return sc.t.sf(t_value, dof)
  # a left+right tailed test is a 2-tail t-test and can be done using ttest_ind for delta=0

def print_comparable_analysis(comparable_metrics_iter, label_header, data_header, output_comparable: str, output_comparable_significant: str):
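  """Print comparisons of every non-baseline group against its baseline and write
  per-comparison rows to output_comparable.

  Note: output_comparable_significant is accepted but not currently written to.
  """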
  baseline_value = _BASELINE[1]
  baseline_index = label_header.index(_BASELINE[0])

  old_baseline_label_list = None
  delta = _DELTA
  filter_value = _IGNORE_PAIR[1]
  filter_index = label_header.index(_IGNORE_PAIR[0])

  pvalue_threshold = _PVALUE_THRESHOLD
  ci_threshold = (1 - _PVALUE_THRESHOLD) * 100.0

  with open(output_comparable, "w") as output_file:

    csv_writer = csv.writer(output_file)
    csv_writer.writerow(label_header + ['mean', 'mean_diff', 'sem', 'pvalue_2tailed', 'pvalue_gt%d' %(_DELTA), 'pvalue_gt%d' %(_DELTA2)])

    print("------------------------------------------------------------------")
    print("Comparison against the baseline %s = %s" %(_BASELINE, baseline_value))
    print("--- Right-tailed t-test checks if the baseline >= current %s by at least %d" %(_BASELINE[0], delta))
    print()

    global_stats = {'better_than_delta': [], 'better_than_delta_p95': []}

    for nested_it in comparable_metrics_iter:
      print("************************")

      better_than_delta = []
      better_than_delta_p95 = []

      saw_baseline_once = False

      for ((baseline_label_list, baseline_np_data_2d), (rest_label_list, rest_np_data_2d)) in nested_it:
        _debug_print("baseline_label_list:", baseline_label_list)
        _debug_print("baseline_np_data_2d:", baseline_np_data_2d)
        _debug_print("rest_label_list:", rest_label_list)
        _debug_print("rest_np_data_2d:", rest_np_data_2d)

        mean_diff = baseline_np_data_2d.mean(axis=0) - rest_np_data_2d.mean(axis=0)
        # 2-sample 2-tailed t-test with delta=0
        # e.g. "Is it true that usually the two sample means are different?"
        t_statistic, t_pvalue = sc.ttest_ind(baseline_np_data_2d, rest_np_data_2d, axis=0)

        # 2-sample 1-tailed t-test with delta=50
        # e.g. "Is the baseline mean better than the current mean by at least 50ms?"
        t2 = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)
        p2 = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)

        t2_b = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)
        p2_b = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)

        print("%s vs %s" %(rest_label_list, baseline_value))
        print("                           ", data_header)
        print("Mean Difference:           ", mean_diff)
        print("T-test (2-tailed) != 0:      t=%s, p=%s" %(t_statistic, t_pvalue))
        print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA, t2, p2))
        print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA2, t2_b, p2_b))

        def write_out_values(label_list, *args):
          csv_writer.writerow(label_list + [i[_PLOT_DATA_INDEX] for i in args])

        sem = stats_standard_error(baseline_np_data_2d, rest_np_data_2d, axis=0)
        if not saw_baseline_once:
          saw_baseline_once = True
          base_sem = stats_standard_error_one(baseline_np_data_2d, axis=0)
          write_out_values(baseline_label_list, baseline_np_data_2d.mean(axis=0), [0], base_sem, [None], [None], [None])
        write_out_values(rest_label_list, rest_np_data_2d.mean(axis=0), mean_diff, sem, t_pvalue, p2, p2_b)

        # now do the global statistics aggregation

        if rest_label_list[filter_index] == filter_value:
          continue

        if mean_diff[_PLOT_DATA_INDEX] > delta:
          better_than_delta.append((mean_diff, p2, rest_label_list))

          if p2[_PLOT_DATA_INDEX] <= pvalue_threshold:
            better_than_delta_p95.append((mean_diff, rest_label_list))

      if better_than_delta:
        global_stats['better_than_delta'].append(better_than_delta)
      if better_than_delta_p95:
        global_stats['better_than_delta_p95'].append(better_than_delta_p95)

    print("------------------------")
    print("Global statistics:")
    print("//// Rows with %s=%s are ignored here." %_IGNORE_PAIR)
    print("- # of results with mean diff better than delta(%d)       = %d" %(delta, len(global_stats['better_than_delta'])))
    print("    > (meandiff, pvalue, labels)")
    for i in global_stats['better_than_delta']:
      print("    > %s" %i)
    print("- # of results with mean diff better than delta(%d) CI%d%% = %d" %(delta, ci_threshold, len(global_stats['better_than_delta_p95'])))
    print("    > (meandiff, labels)")
    for i in global_stats['better_than_delta_p95']:
      print("    > %s" %i)

def main():
  global _debug
  global _DELTA
  global _PVALUE_THRESHOLD

  opts = parse_options()
  _debug = opts.debug
  _debug_print("parsed options: ", opts)

  _PVALUE_THRESHOLD = opts.pvalue_threshold or _PVALUE_THRESHOLD
  _DELTA = opts.delta_threshold or _DELTA

  for file_name in opts.input_files:
    with open(file_name, 'r') as input_file:
      (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
      print_analysis(grouped_numpy_iter, label_header, data_header, opts.output_samples)

    with open(file_name, 'r') as input_file:
      (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
      without_baseline_iter = group_by_without_baseline_key(grouped_numpy_iter, label_header)
      #_debug_print_gen(without_baseline_iter)

      comparable_metrics_iter = iterate_comparable_metrics(without_baseline_iter, label_header)
      print_comparable_analysis(comparable_metrics_iter, label_header, data_header, opts.output_comparable, opts.output_comparable_significant)

  return 0


if __name__ == '__main__':
  sys.exit(main())