# Home | History | Annotate | Download | only in trappy
      1 #    Copyright 2015-2017 ARM Limited
      2 #
      3 # Licensed under the Apache License, Version 2.0 (the "License");
      4 # you may not use this file except in compliance with the License.
      5 # You may obtain a copy of the License at
      6 #
      7 #     http://www.apache.org/licenses/LICENSE-2.0
      8 #
      9 # Unless required by applicable law or agreed to in writing, software
     10 # distributed under the License is distributed on an "AS IS" BASIS,
     11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 # See the License for the specific language governing permissions and
     13 # limitations under the License.
     14 #
     15 
     16 """Base class to parse trace.dat dumps"""
     17 
     18 import re
     19 import pandas as pd
     20 import warnings
     21 
     22 from resource import getrusage, RUSAGE_SELF
     23 
     24 def _get_free_memory_kb():
     25     try:
     26         with open("/proc/meminfo") as f:
     27             memfree_line = [l for l in f.readlines() if "MemFree" in l][0]
     28             _, num_kb, _ = memfree_line.split()
     29             return int(num_kb)
     30     except:
     31         # Probably either not running on Linux (no /proc/meminfo), or format has
     32         # changed (we didn't find num_kb).
     33         return None
     34 
     35 def trace_parser_explode_array(string, array_lengths):
     36     """Explode an array in the trace into individual elements for easy parsing
     37 
     38     Basically, turn :code:`load={1 1 2 2}` into :code:`load0=1 load1=1 load2=2
     39     load3=2`.
     40 
     41     :param string: Input string from the trace
     42     :type string: str
     43 
     44     :param array_lengths: A dictionary of array names and their
     45         expected length.  If we get array that's shorter than the expected
     46         length, additional keys have to be introduced with value 0 to
     47         compensate.
     48     :type array_lengths: dict
     49 
     50     For example:
     51     ::
     52 
     53         trace_parser_explode_array(string="load={1 2}",
     54                                    array_lengths={"load": 4})
     55         "load0=1 load1=2 load2=0 load3=0"
     56     """
     57 
     58     while True:
     59         match = re.search(r"[^ ]+={[^}]+}", string)
     60         if match is None:
     61             break
     62 
     63         to_explode = match.group()
     64         col_basename = re.match(r"([^=]+)=", to_explode).groups()[0]
     65         vals_str = re.search(r"{(.+)}", to_explode).groups()[0]
     66         vals_array = vals_str.split(' ')
     67 
     68         exploded_str = ""
     69         for (idx, val) in enumerate(vals_array):
     70             exploded_str += "{}{}={} ".format(col_basename, idx, val)
     71 
     72         vals_added = len(vals_array)
     73         if vals_added < array_lengths[col_basename]:
     74             for idx in range(vals_added, array_lengths[col_basename]):
     75                 exploded_str += "{}{}=0 ".format(col_basename, idx)
     76 
     77         exploded_str = exploded_str[:-1]
     78         begin_idx = match.start()
     79         end_idx = match.end()
     80 
     81         string = string[:begin_idx] + exploded_str + string[end_idx:]
     82 
     83     return string
     84 
     85 class Base(object):
     86     """Base class to parse trace.dat dumps.
     87 
     88     Don't use directly, create a subclass that has a unique_word class
     89     variable.  unique_word is a string that can uniquely identify
     90     lines in the trace that correspond to this event.  This is usually
     91     the trace_name (optionally followed by a semicolong,
     92     e.g. "sched_switch:") but it can be anything else for trace points
     93     generated using trace_printk().
     94 
     95     :param parse_raw: If :code:`True`, raw trace data (-r option) to
     96         trace-cmd will be used
     97 
     98     :param fallback: If :code:`True`, the parsing class will be used
     99         only if no other candidate class's unique_word matched. subclasses
    100         should override this (for ex. TracingMarkWrite uses it)
    101 
    102     This class acts as a base class for all TRAPpy events
    103 
    104     """
    105     def __init__(self, parse_raw=False, fallback=False):
    106         self.fallback = fallback
    107         self.tracer = None
    108         self.data_frame = pd.DataFrame()
    109         self.line_array = []
    110         self.data_array = []
    111         self.time_array = []
    112         self.comm_array = []
    113         self.pid_array = []
    114         self.tgid_array = []
    115         self.cpu_array = []
    116         self.parse_raw = parse_raw
    117         self.cached = False
    118 
    119     def finalize_object(self):
    120         pass
    121 
    122     def __get_trace_array_lengths(self):
    123         """Calculate the lengths of all arrays in the trace
    124 
    125         Returns a dict with the name of each array found in the trace
    126         as keys and their corresponding length as value
    127 
    128         """
    129         from collections import defaultdict
    130 
    131         pat_array = re.compile(r"([A-Za-z0-9_]+)={([^}]+)}")
    132 
    133         ret = defaultdict(int)
    134 
    135         for line in self.data_array:
    136             while True:
    137                 match = re.search(pat_array, line)
    138                 if not match:
    139                     break
    140 
    141                 (array_name, array_elements) = match.groups()
    142 
    143                 array_len = len(array_elements.split(' '))
    144 
    145                 if array_len > ret[array_name]:
    146                     ret[array_name] = array_len
    147 
    148                 line = line[match.end():]
    149 
    150             # Stop scanning if the trace doesn't have arrays
    151             if len(ret) == 0:
    152                 break
    153 
    154         return ret
    155 
    156     def append_data(self, time, comm, pid, tgid, cpu, line, data):
    157         """Append data parsed from a line to the corresponding arrays
    158 
    159         The :mod:`DataFrame` will be created from this when the whole trace
    160         has been parsed.
    161 
    162         :param time: The time for the line that was printed in the trace
    163         :type time: float
    164 
    165         :param comm: The command name or the execname from which the trace
    166             line originated
    167         :type comm: str
    168 
    169         :param pid: The PID of the process from which the trace
    170             line originated
    171         :type pid: int
    172 
    173         :param data: The data for matching line in the trace
    174         :type data: str
    175         """
    176 
    177         self.time_array.append(time)
    178         self.comm_array.append(comm)
    179         self.pid_array.append(pid)
    180         self.tgid_array.append(tgid)
    181         self.cpu_array.append(cpu)
    182         self.line_array.append(line)
    183         self.data_array.append(data)
    184 
    185     def string_cast(self, string, type):
    186         """ Attempt to convert string to another type
    187 
    188         Here we attempt to cast string to a type. Currently only
    189         integer conversion is supported with future expansion
    190         left open to other types.
    191 
    192         :param string: The value to convert.
    193         :type string: str
    194 
    195         :param type: The type to convert to.
    196         :type type: type
    197         """
    198         # Currently this function only supports int conversion
    199         if type != int:
    200             return
    201         # Handle false-positives for negative numbers
    202         if not string.lstrip("-").isdigit():
    203             return string
    204         return int(string)
    205 
    206     def generate_data_dict(self, data_str):
    207         data_dict = {}
    208         prev_key = None
    209         for field in data_str.split():
    210             if "=" not in field:
    211                 # Concatenation is supported only for "string" values
    212                 if type(data_dict[prev_key]) is not str:
    213                     continue
    214                 data_dict[prev_key] += ' ' + field
    215                 continue
    216             (key, value) = field.split('=', 1)
    217             value = self.string_cast(value, int)
    218             data_dict[key] = value
    219             prev_key = key
    220         return data_dict
    221 
    222     def generate_parsed_data(self):
    223 
    224         # Get a rough idea of how much memory we have to play with
    225         CHECK_MEM_COUNT = 10000
    226         kb_free = _get_free_memory_kb()
    227         starting_maxrss = getrusage(RUSAGE_SELF).ru_maxrss
    228         check_memory_usage = True
    229         check_memory_count = 1
    230 
    231         for (comm, pid, tgid, cpu, line, data_str) in zip(self.comm_array, self.pid_array,
    232                                               self.tgid_array, self.cpu_array,
    233                                               self.line_array, self.data_array):
    234             data_dict = {"__comm": comm, "__pid": pid, "__tgid": tgid, "__cpu": cpu, "__line": line}
    235             data_dict.update(self.generate_data_dict(data_str))
    236 
    237             # When running out of memory, Pandas has been observed to segfault
    238             # rather than throwing a proper Python error.
    239             # Look at how much memory our process is using and warn if we seem
    240             # to be getting close to the system's limit, check it only once
    241             # in the beginning and then every CHECK_MEM_COUNT events
    242             check_memory_count -= 1
    243             if check_memory_usage and check_memory_count == 0:
    244                 kb_used = (getrusage(RUSAGE_SELF).ru_maxrss - starting_maxrss)
    245                 if kb_free and kb_used > kb_free * 0.9:
    246                     warnings.warn("TRAPpy: Appear to be low on memory. "
    247                                   "If errors arise, try providing more RAM")
    248                     check_memory_usage = False
    249                 check_memory_count = CHECK_MEM_COUNT
    250 
    251             yield data_dict
    252 
    253     def create_dataframe(self):
    254         """Create the final :mod:`pandas.DataFrame`"""
    255         if not self.time_array:
    256             return
    257 
    258         trace_arr_lengths = self.__get_trace_array_lengths()
    259 
    260         if trace_arr_lengths.items():
    261             for (idx, val) in enumerate(self.data_array):
    262                 expl_val = trace_parser_explode_array(val, trace_arr_lengths)
    263                 self.data_array[idx] = expl_val
    264 
    265         time_idx = pd.Index(self.time_array, name="Time")
    266         self.data_frame = pd.DataFrame(self.generate_parsed_data(), index=time_idx)
    267 
    268         self.time_array = []
    269         self.line_array = []
    270         self.comm_array = []
    271         self.pid_array = []
    272         self.cpu_array = []
    273         self.data_array = []
    274 
    275     def write_csv(self, fname):
    276         """Write the csv info into a CSV file
    277 
    278         :param fname: The name of the CSV file
    279         :type fname: str
    280         """
    281         self.data_frame.to_csv(fname)
    282 
    283     def read_csv(self, fname):
    284         """Read the csv data into a DataFrame
    285 
    286         :param fname: The name of the CSV file
    287         :type fname: str
    288         """
    289         self.data_frame = pd.read_csv(fname, index_col = 0)
    290 
    291     def normalize_time(self, basetime):
    292         """Substract basetime from the Time of the data frame
    293 
    294         :param basetime: The offset which needs to be subtracted from
    295             the time index
    296         :type basetime: float
    297         """
    298         if basetime and not self.data_frame.empty:
    299             self.data_frame.reset_index(inplace=True)
    300             self.data_frame["Time"] = self.data_frame["Time"] - basetime
    301             self.data_frame.set_index("Time", inplace=True)
    302