#!/usr/bin/env python2.7
"""
    Utility for converting *_clat_hist* files generated by fio into latency statistics.

    Example usage:

            $ fiologparser_hist.py *_clat_hist*
            end-time, samples, min, avg, median, 90%, 95%, 99%, max
            1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000
            2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000
            4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744
            ...

    @author Karl Cronburg <karl.cronburg@gmail.com>
"""
import os
import sys
import pandas
import numpy as np

err = sys.stderr.write

def weighted_percentile(percs, vs, ws):
    """ Use linear interpolation to calculate the weighted percentile.

        Value and weight arrays are first sorted by value. The cumulative
        distribution function (cdf) is then computed, after which np.interp
        finds the two values closest to our desired weighted percentile(s)
        and linearly interpolates them.

        percs  :: List of percentiles we want to calculate
        vs     :: Array of values we are computing the percentile of
        ws     :: Array of weights for our corresponding values
        return :: Array of percentiles
    """
    idx = np.argsort(vs)
    vs, ws = vs[idx], ws[idx] # weights and values sorted by value
    cdf = 100 * (ws.cumsum() - ws / 2.0) / ws.sum()
    return np.interp(percs, cdf, vs) # linear interpolation

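# Illustration (the numbers follow from the cdf formula above): with three
# equally weighted values the cdf works out to [16.7, 50.0, 83.3], so the
# 50th percentile interpolates exactly onto the middle value:
#
#     weighted_percentile([50], np.array([1., 2., 3.]), np.array([1., 1., 1.]))
#     # => array([ 2.])
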
def weights(start_ts, end_ts, start, end):
    """ Calculate weights based on fraction of sample falling in the
        given interval [start,end]. Weights computed using vector / array
        computation instead of for-loops.

        Note that samples with zero time length are effectively ignored
        (we set their weight to zero).

        start_ts :: Array of start times for a set of samples
        end_ts   :: Array of end times for a set of samples
        start    :: int
        end      :: int
        return   :: Array of weights
    """
    sbounds = np.maximum(start_ts, start).astype(float)
    ebounds = np.minimum(end_ts,   end).astype(float)
    ws = (ebounds - sbounds) / (end_ts - start_ts)
    if np.any(np.isnan(ws)):
      err("WARNING: zero-length sample(s) detected. Log file corrupt"
          " / bad time values? Ignoring these samples.\n")
    ws[np.where(np.isnan(ws))] = 0.0
    return ws

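# Illustration: a sample spanning [0, 1000) ms overlaps the interval
# [500, 1500) for half of its duration, so it receives weight 0.5:
#
#     weights(np.array([0]), np.array([1000]), 500, 1500)
#     # => array([ 0.5])
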
def weighted_average(vs, ws):
    return np.sum(vs * ws) / np.sum(ws)

columns = ["end-time", "samples", "min", "avg", "median", "90%", "95%", "99%", "max"]
percs   = [50, 90, 95, 99]

def fmt_float_list(ctx, num=1):
  """ Return a comma separated list of float formatters to the required number
      of decimal places. For instance, with ctx.decimals == 4:

        fmt_float_list(ctx, num=3) == "%.4f, %.4f, %.4f"
  """
  return ', '.join(["%%.%df" % ctx.decimals] * num)

# Default values - see beginning of main() for how we detect the number of
# columns in the input files:
__HIST_COLUMNS = 1216
__NON_HIST_COLUMNS = 3
__TOTAL_COLUMNS = __HIST_COLUMNS + __NON_HIST_COLUMNS

def read_chunk(rdr, sz):
    """ Read the next chunk of size sz from the given reader. """
    try:
        # StopIteration occurs when the pandas reader is empty, and
        # AttributeError occurs if rdr is None because the file was empty.
        new_arr = rdr.read().values
    except (StopIteration, AttributeError):
        return None

    # Extract an array of just the times, and the histogram matrix without
    # the times column. (rws and szs are parsed but currently unused.)
    times, rws, szs = new_arr[:,0], new_arr[:,1], new_arr[:,2]
    hists = new_arr[:,__NON_HIST_COLUMNS:]
    times = times.reshape((len(times),1))
    arr = np.append(times, hists, axis=1)

    return arr

def get_min(fps, arrs):
    """ Find the file whose current first row has the smallest start time. """
    return min([fp for fp in fps if arrs[fp] is not None], key=lambda fp: arrs.get(fp)[0][0])

def histogram_generator(ctx, fps, sz):

    # Create a chunked pandas reader for each of the files:
    rdrs = {}
    for fp in fps:
        try:
            rdrs[fp] = pandas.read_csv(fp, dtype=int, header=None, chunksize=sz)
        except ValueError as e:
            if e.message == 'No columns to parse from file':
                if ctx.warn: sys.stderr.write("WARNING: Empty input file encountered.\n")
                rdrs[fp] = None
            else:
                raise(e)

    # Initial histograms from disk:
    arrs = {fp: read_chunk(rdr, sz) for fp,rdr in rdrs.items()}
    while True:

        try:
            # ValueError occurs when there is nothing more to read:
            fp = get_min(fps, arrs)
        except ValueError:
            return
        arr = arrs[fp]
        yield np.insert(arr[0], 1, fps.index(fp))
        arrs[fp] = arr[1:]

        if arrs[fp].shape[0] == 0:
            arrs[fp] = read_chunk(rdrs[fp], sz)

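# Each row yielded above has the form [end_time, file_index, bin_0, bin_1, ...]:
# rows from all input files are merged in order of increasing timestamp. A
# hypothetical two-file session (the fp names are illustrative only):
#
#     gen = histogram_generator(ctx, [fp_read, fp_write], 10000)
#     next(gen)    # => array([1000, 0, 0, 3, 1, ...])  (earliest row, from file 0)
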
def _plat_idx_to_val(idx, edge=0.5, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64):
    """ Taken from fio's stat.c for calculating the latency value of a bin
        from that bin's index.

            idx  : the value of the index into the histogram bins
            edge : fractional value in the range [0,1]** indicating how far
                   into the bin we wish to compute the latency value of.

        ** edge = 0.0 and 1.0 computes the lower and upper latency bounds
           respectively of the given bin index. """

    # MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
    # all bits of the sample as index
    if (idx < (FIO_IO_U_PLAT_VAL << 1)):
        return idx

    # Find the group and compute the minimum value of that group
    error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1
    base = 1 << (error_bits + FIO_IO_U_PLAT_BITS)

    # Find the bucket number within the group
    k = idx % FIO_IO_U_PLAT_VAL

    # Return the mean (if edge=0.5) of the range of the bucket
    return base + ((k + edge) * (1 << error_bits))

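# Illustration with the default parameters (values follow from the code above):
# indices below 128 map to themselves with 1 usec resolution, while index 128
# begins the first group of 2 usec wide bins:
#
#     _plat_idx_to_val(100)                   # => 100
#     _plat_idx_to_val(128, edge=0.0)         # => 128.0
#     _plat_idx_to_val(128, edge=1.0)         # => 130.0
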
def plat_idx_to_val_coarse(idx, coarseness, edge=0.5):
    """ Converts the given *coarse* index into a non-coarse index as used by fio
        in stat.h:plat_idx_to_val(), subsequently computing the appropriate
        latency value for that bin.
        """

    # Multiply the index by the power-of-2 coarseness to get the non-coarse
    # bin index, with a max of 1536 bins (FIO_IO_U_PLAT_GROUP_NR = 24 in stat.h)
    stride = 1 << coarseness
    idx = idx * stride
    lower = _plat_idx_to_val(idx, edge=0.0)
    upper = _plat_idx_to_val(idx + stride, edge=1.0)
    return lower + (upper - lower) * edge

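# Illustration: at coarseness 0 each coarse bin is a single fio bin, so the
# midpoint (edge=0.5) of bin 10 -- the range [10, 11) usec in the identity
# region -- comes out as:
#
#     plat_idx_to_val_coarse(10, 0)           # => 10.5
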
def print_all_stats(ctx, end, mn, ss_cnt, vs, ws, mx):
    ps = weighted_percentile(percs, vs, ws)

    avg = weighted_average(vs, ws)
    values = [mn, avg] + list(ps) + [mx]
    row = [end, ss_cnt] + map(lambda x: float(x) / ctx.divisor, values)
    fmt = "%d, %d, %d, " + fmt_float_list(ctx, 5) + ", %d"
    print (fmt % tuple(row))

def update_extreme(val, fncn, new_val):
    """ Calculate min / max in the presence of None values """
    if val is None: return new_val
    else: return fncn(val, new_val)

# See beginning of main() for how bin_vals are computed
bin_vals = []
lower_bin_vals = [] # lower edge of each bin
upper_bin_vals = [] # upper edge of each bin

def process_interval(ctx, samples, iStart, iEnd):
    """ Construct the weighted histogram for the given interval by scanning
        through all the histograms and figuring out which of their bins have
        samples with latencies which overlap with the given interval
        [iStart,iEnd].
    """

    times, files, hists = samples[:,0], samples[:,1], samples[:,2:]
    iHist = np.zeros(__HIST_COLUMNS)
    ss_cnt = 0 # number of samples affecting this interval
    mn_bin_val, mx_bin_val = None, None

    for end_time,file,hist in zip(times,files,hists):

        # Only look at bins of the current histogram sample which
        # started before the end of the current time interval [iStart,iEnd]:
        start_times = (end_time - 0.5 * ctx.interval) - bin_vals / 1000.0
        idx = np.where(start_times < iEnd)
        s_ts, l_bvs, u_bvs, hs = start_times[idx], lower_bin_vals[idx], upper_bin_vals[idx], hist[idx]

        # Increment current interval histogram by weighted values of future histogram:
        ws = hs * weights(s_ts, end_time, iStart, iEnd)
        iHist[idx] += ws

        # Update total number of samples affecting current interval histogram:
        ss_cnt += np.sum(hs)

        # Update min and max bin values seen if necessary:
        idx = np.where(hs != 0)[0]
        if idx.size > 0:
            mn_bin_val = update_extreme(mn_bin_val, min, l_bvs[max(0,           idx[0]  - 1)])
            mx_bin_val = update_extreme(mx_bin_val, max, u_bvs[min(len(hs) - 1, idx[-1] + 1)])

    if ss_cnt > 0: print_all_stats(ctx, iEnd, mn_bin_val, ss_cnt, bin_vals, iHist, mx_bin_val)

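# Worked example of the spreading logic above, assuming --interval 1000:
# a histogram row logged at end_time=2000 ms with a bin latency of 1,000,000
# usec is treated as spanning [500, 2000) ms, so weights() assigns 1/3 of that
# bin's count to the interval [0, 1000) and the remaining 2/3 to [1000, 2000).
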
def guess_max_from_bins(ctx, hist_cols):
    """ Try to guess the GROUP_NR from the number of histogram
        columns seen in an input file. """
    max_coarse = 8
    if ctx.group_nr < 19 or ctx.group_nr > 26:
        bins = [ctx.group_nr * (1 << 6)]
    else:
        bins = [1216,1280,1344,1408,1472,1536,1600,1664]
    coarses = range(max_coarse + 1)
    fncn = lambda z: list(map(lambda x: z/2**x if z % 2**x == 0 else -10, coarses))

    arr = np.transpose(list(map(fncn, bins)))
    idx = np.where(arr == hist_cols)
    if len(idx[1]) == 0:
        table = repr(arr.astype(int)).replace('-10', 'N/A').replace('array','     ')
        err("Unable to determine bin values from input clat_hist files. Namely \n"
            "the first line of file '%s' " % ctx.FILE[0] + "has %d \n" % (__TOTAL_COLUMNS,) +
            "columns of which we assume %d " % (hist_cols,) + "correspond to histogram bins. \n"
            "This number needs to be equal to one of the following numbers:\n\n"
            + table + "\n\n"
            "Possible reasons and corresponding solutions:\n"
            "  - Input file(s) does not contain histograms.\n"
            "  - You recompiled fio with a different GROUP_NR. If so please specify this\n"
            "    new GROUP_NR on the command line with --group_nr\n")
        exit(1)
    return bins[idx[1][0]]

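# Illustration: with the default --group_nr 19 the uncoarsened bin count is
# 19 * 64 = 1216, so an input file with 608 histogram columns matches
# 1216 / 2**1 and guess_max_from_bins() returns 1216; main() then derives
# coarseness = log2(1216 / 608) = 1.
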
def main(ctx):

    if ctx.job_file:
        try:
            from configparser import SafeConfigParser, NoOptionError
        except ImportError:
            from ConfigParser import SafeConfigParser, NoOptionError

        cp = SafeConfigParser(allow_no_value=True)
        with open(ctx.job_file, 'r') as fp:
            cp.readfp(fp)

        if ctx.interval is None:
            # Auto detect --interval value
            for s in cp.sections():
                try:
                    hist_msec = cp.get(s, 'log_hist_msec')
                    if hist_msec is not None:
                        ctx.interval = int(hist_msec)
                except NoOptionError:
                    pass

    if ctx.interval is None:
        ctx.interval = 1000

    # Automatically detect how many columns are in the input files,
    # calculate the corresponding 'coarseness' parameter used to generate
    # those files, and calculate the appropriate bin latency values:
    with open(ctx.FILE[0], 'r') as fp:
        global bin_vals,lower_bin_vals,upper_bin_vals,__HIST_COLUMNS,__TOTAL_COLUMNS
        __TOTAL_COLUMNS = len(fp.readline().split(','))
        __HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS

        max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS)
        coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS))
        bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness), np.arange(__HIST_COLUMNS)), dtype=float)
        lower_bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 0.0), np.arange(__HIST_COLUMNS)), dtype=float)
        upper_bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 1.0), np.arange(__HIST_COLUMNS)), dtype=float)

    fps = [open(f, 'r') for f in ctx.FILE]
    gen = histogram_generator(ctx, fps, ctx.buff_size)

    print(', '.join(columns))

    try:
        start, end = 0, ctx.interval
        arr = np.empty(shape=(0,__TOTAL_COLUMNS - 1))
        more_data = True
        while more_data or len(arr) > 0:

            # Read up to ctx.max_latency (default 20 seconds) of data past the
            # end of the current interval:
            while len(arr) == 0 or arr[-1][0] < ctx.max_latency * 1000 + end:
                try:
                    new_arr = next(gen)
                except StopIteration:
                    more_data = False
                    break
                arr = np.append(arr, new_arr.reshape((1,__TOTAL_COLUMNS - 1)), axis=0)
            arr = arr.astype(int)

            if arr.size > 0:
                # Jump immediately to the start of the input, rounding
                # down to the nearest multiple of the interval (useful when --log_unix_epoch
                # was used to create these histograms):
                if start == 0 and arr[0][0] - ctx.max_latency > end:
                    start = arr[0][0] - ctx.max_latency
                    start = start - (start % ctx.interval)
                    end = start + ctx.interval

                process_interval(ctx, arr, start, end)

                # Update arr to throw away samples we no longer need - samples which
                # end before the start of the next interval, i.e. the end of the
                # current interval:
                idx = np.where(arr[:,0] > end)
                arr = arr[idx]

            start += ctx.interval
            end = start + ctx.interval
    finally:
        map(lambda f: f.close(), fps)

if __name__ == '__main__':
    import argparse
    p = argparse.ArgumentParser()
    arg = p.add_argument
    arg("FILE", help='space separated list of latency log filenames', nargs='+')
    arg('--buff_size',
        default=10000,
        type=int,
        help='number of samples to buffer into numpy at a time')

    arg('--max_latency',
        default=20,
        type=float,
        help='number of seconds of data to process at a time')

    arg('-i', '--interval',
        type=int,
        help='interval width (ms), default 1000 ms')

    arg('-d', '--divisor',
        required=False,
        type=int,
        default=1,
        help='divide the results by this value.')

    arg('--decimals',
        default=3,
        type=int,
        help='number of decimal places to print floats to')

    arg('--warn',
        dest='warn',
        action='store_true',
        default=False,
        help='print warning messages to stderr')

    arg('--group_nr',
        default=19,
        type=int,
        help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h')

    arg('--job-file',
        default=None,
        type=str,
        help='Optional argument pointing to the job file used to create the '
             'given histogram files. Useful for auto-detecting --log_hist_msec and '
             '--log_unix_epoch (in fio) values.')

    main(p.parse_args())