This tutorial shows some examples of data analysis and visualization based on a set of trace events collected in a trace file. These analyses are mainly supported by TRAPpy and standard pandas APIs.
In [317]:
    
import logging
from conf import LisaLogging
LisaLogging.setup()
    
In [318]:
    
# Events we are interested in collecting from the trace
my_events = [
    "sched_switch",
    "sched_wakeup",
    "sched_contrib_scale_f",
    "sched_load_avg_cpu",
    "sched_load_avg_task",
    "sched_tune_tasks_update",
    "sched_boost_cpu",
    "sched_boost_task",
    "sched_energy_diff",
    "sched_overutilized",
    "cpu_frequency",
    "cpu_capacity"
]
# Let's use an example trace
trace_file = './example_results/trace.dat'
    
In [319]:
    
# Use TRAPpy support to convert a trace into a pandas DataFrame
import trappy
ftrace = trappy.FTrace(
    name="PandasExamples",
    path=trace_file,
    scope='custom',
    events=my_events,
    window=(0,None))
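
# NOTE: the window parameter selects the time span to parse, in seconds
#       from the beginning of the trace. A minimal sketch (reusing the
#       trace_file and my_events defined above) that restricts parsing
#       to the interval analyzed later in this notebook:
ftrace_slice = trappy.FTrace(
    name="PandasExamplesSlice",
    path=trace_file,
    scope='custom',
    events=my_events,
    window=(21.1, 22.9))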
    
In [320]:
    
# NOTE: The interactive trace visualization is available only if you run
#       the workload to generate a new trace-file
trappy.plotter.plot_trace(ftrace)
    
    
In [321]:
    
# Let's have a look at the generated file
!ls -la example_results
    
    
Notice that the binary trace has been used to generate both a textual trace (i.e. trace.txt) and a raw trace (trace.raw.txt).
In [322]:
    
logging.info("Example of (formatted) 'sched_switch' event:")
with open('./example_results/trace.txt', 'r') as fh:
    for line in fh:
        if line.find('sched_switch') == -1:
            continue
        print(line)
        break
    
    
    
In [323]:
    
logging.info("Example of (raw) 'sched_switch' event:")
with open('./example_results/trace.raw.txt', 'r') as fh:
    for line in fh:
        if line.find('sched_switch') == -1:
            continue
        print(line)
        break
    
    
    
Every event corresponds to a "table" which is exposed as an attribute of the TRAPpy::FTrace object we created by parsing the trace.
We can use TRAPpy::FTrace::class_definitions, which is a dictionary tracking all the collected tables.
In [324]:
    
logging.info("List of events identified in the trace:")
for event in ftrace.class_definitions.keys():
    logging.info("   %s", event)
    
    
In [325]:
    
# Let's get a reference to the PANDAs DataFrame corresponding to the
# "sched_switch" events
logging.info("The 'sched_switch' events are collected into this DataFrame:")
df = ftrace.sched_switch.data_frame
df#.head(2)
    
    
    Out[325]:
NOTE: We can use head() to report only a limited number of events
In [326]:
    
# All events parsed in the trace have an associated DataFrame
logging.info("This is the DataFrame for 'cpu_frequency' events:")
df = ftrace.cpu_frequency.data_frame
df.head(2)
    
    
    Out[326]:
A number of different operations can be executed on DataFrames. Most commonly we are interested in: selecting columns and rows, filtering by conditions, grouping and aggregating events, and joining multiple frames. The following cells show examples of each.
In [330]:
    
df = ftrace.sched_switch.data_frame
df.head(2)
    
    Out[330]:
In [331]:
    
# The "ix" operator allow to select ranges of [rows:columns]
df = df.ix[:,'next_comm':'prev_state']
df.head(2)
    
    Out[331]:
In [332]:
    
# We can also filter by listing the columns we are interested in
tasks = df[['next_pid', 'next_comm']]
logging.info("Found %d sched_switch events: ", len(tasks))
tasks.head(2)
    
    
    Out[332]:
In [333]:
    
# Create a map of {PID: TaskName}
pid_to_task = {int(task[0]): task[1] for task in tasks.drop_duplicates().values}
logging.info("The trace has %s unique PIDs", len(pid_to_task))
pid_to_task
    
    
    Out[333]:
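The map can then be used to resolve a PID into a task name wherever only the numeric ID is available; for example (a sketch, assuming the PID appears in the map):

# Resolve the first observed PID into its task name
pid = int(tasks['next_pid'].iloc[0])
logging.info("PID %d is task [%s]", pid, pid_to_task[pid])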
In [118]:
    
# Group events by "PID" and compute 
most_switching = df.groupby('next_pid').describe(include=['object'])
most_switching.head()
    
    Out[118]:
In [119]:
    
most_switching = most_switching.unstack()
most_switching.head()
    
    Out[119]:
In [124]:
    
most_switching = most_switching['next_comm']\
                 .sort_values(by=['count'], ascending=False)
most_switching.head()
    
    Out[124]:
In [231]:
    
most_switching_pid  = most_switching.index[1]
most_switching_task = most_switching.values[1][2]
task_name = "{}:{}".format(most_switching_pid, most_switching_task)
logging.info("The second most swithing task is: [%s]", task_name)
    
    
In [188]:
    
# Let's use the previous information to filter values of another DataFrame
# Here we use the events reporting the task utilization signal
df = ftrace.sched_load_avg_task.data_frame.ix[:,'comm':'util_sum']
logging.info("Total events: %d", len(df))
df.head()
    
    
    Out[188]:
In [189]:
    
# Select only events for the task of interest
select = (df.pid == most_switching_pid)
task_events = df[select]
logging.info("Selected events: %d", len(task_events))
task_events.head()
    
    
    Out[189]:
In [194]:
    
# We can also compose a query using multiple conditions
select = (((task_events.cpu != 1) & (task_events.cpu != 2)) & ~(task_events.util_avg < 450))
task_events = task_events[select]
logging.info("Selected events: %d", len(task_events))
task_events.head()
    
    
    Out[194]:
In [198]:
    
# Row selection is also possible, which in our case corresponds to limiting
# the events to a specified time frame
task_events.ix[21.1:22.9,].head()
    
    Out[198]:
In [218]:
    
# Let's make use of yet another DataFrame and look at the CPU frequencies
frequencies = ftrace.cpu_frequency.data_frame
select = (frequencies.cpu == 0)
frequencies = frequencies.ix[21.1:22.9,'cpu':'frequency'][select]
logging.info("Selected events: %d", len(frequencies))
frequencies.head()
    
    
    Out[218]:
In [222]:
    
# Let's merge the task signal frame with the CPU frequencies
task_util_and_freq = task_events.join(
    frequencies,
    how='outer',
    lsuffix='lx_')#[['util_avg', 'frequency']]
logging.info("Selected events: %d", len(task_util_and_freq))
task_util_and_freq.tail()
    
    
    Out[222]:
In [223]:
    
# The new table has the union of events from the two original tables
len(task_events) + len(frequencies)
    
    Out[223]:
In [224]:
    
# Let's propagate missing column values...
task_util_and_freq = task_util_and_freq.fillna(method='ffill')
task_util_and_freq.tail()
    
    Out[224]:
In [226]:
    
# NOTE: missing values at the beginning can be fixed as well
#       ...but that's beyond this demo!
# LISA has a set of methods which provide a complete and reliable solution
# to do this kind of composition on data frames
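# As a quick illustration of the first point: ffill cannot fix leading
# NaNs, since there is no earlier value to propagate, but a backward
# fill can. A minimal sketch (not the LISA API):
task_util_and_freq = task_util_and_freq.fillna(method='bfill')
task_util_and_freq.head()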
    
In [227]:
    
# Enable generation of Notebook embedded plots
%pylab inline
    
    
    
In [229]:
    
task_util_and_freq[['util_avg', 'frequency']].plot(drawstyle="steps-post")
    
    Out[229]:
    
In [260]:
    
import matplotlib.gridspec as gridspec
t_min = 21.5
t_max = 23.0
gs = gridspec.GridSpec(2, 1, height_ratios=[2,1]);
plt.figure(figsize=(16, 8));
plt.suptitle("Task Utilization vs Frequency",
             y=.97, fontsize=16, horizontalalignment='center');
# Plot Utilization
data = task_util_and_freq['util_avg']
axes = plt.subplot(gs[0,0]);
data.plot(ax=axes, drawstyle='steps-post');
axes.set_title('Task [{}] Utilization'.format(task_name));
axes.set_ylabel('Utilization')
axes.set_xlim(t_min, t_max);
axes.grid(True);
axes.set_xticklabels([])
axes.set_xlabel('')
# Plot Frequencies
data = task_util_and_freq['frequency']/1000
axes = plt.subplot(gs[1,0]);
data.plot(ax=axes, drawstyle='steps-post');
axes.set_title('Frequencies');
axes.set_ylabel('MHz')
axes.set_xlim(t_min, t_max);
axes.grid(True);
    
    
Plots can be made much better with just a little bit of setup and by using the proper APIs.
That's why LISA aims at providing a predefined set of plot functions for commonly used analyses.
In [306]:
    
# Consider the data frame for the frequency switching events
logging.info("Table for 'cpu_frequency' events:")
ftrace.cpu_frequency.data_frame.head(1)
    
    
    Out[306]:
In [309]:
    
# It is possible to mix multiple events in the same plot.
# The LinePlot module requires a list of signals to plot.
# Each signal is defined as:
#       <event>:<column>
# where:
#  <event>   is one of the events collected from the trace by the FTrace object
#  <column>  is one of the columns of the previously defined event
my_signals = [
    'cpu_frequency:frequency',
]
# These two parameters are passed to the LinePlot call along with the
# TRAPpy FTrace object
trappy.LinePlot(
    
    # FTrace object
    ftrace,
    
    # Signals to be plotted
    signals=my_signals,
    
    # Generate one plot for each value of the specified column
    pivot='cpu',
    
    # Generate only plots which satisfy these filters
    filters = {
        # Column
        # |     Values
        # |     |
        'cpu' : [0, 1]
    },
    
    # Formatting style
    drawstyle='steps-post',
    marker = '+'
).view()
    
    
In [298]:
    
# We can also produce interactive plots
trappy.ILinePlot(
ftrace,                         # TRAPpy FTrace object
    signals=[
      'cpu_frequency:frequency'
    ],
    drawstyle='steps-post',
    per_line=1,
#     sync_zoom=True,
#     group='UtilVsFrequency',
    marker = '+').view()
trappy.ILinePlot(
ftrace,                         # TRAPpy FTrace object
    signals=[
      'sched_load_avg_task:util_avg'
    ],
    filters={
        'pid': [most_switching_pid]
    },
    drawstyle='steps-post',
    per_line=1,
#     sync_zoom=True,
#     group='UtilVsFrequency',
    marker = '+').view()
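
# The commented-out sync_zoom and group parameters above hint at a useful
# feature: interactive plots sharing the same group can be zoomed in sync.
# A minimal sketch, assuming these parameters behave as described in the
# TRAPpy Plotter documentation:
trappy.ILinePlot(
    ftrace,
    signals=['cpu_frequency:frequency'],
    drawstyle='steps-post',
    group='UtilVsFrequency',
    sync_zoom=True).view()
trappy.ILinePlot(
    ftrace,
    signals=['sched_load_avg_task:util_avg'],
    filters={'pid': [most_switching_pid]},
    drawstyle='steps-post',
    group='UtilVsFrequency',
    sync_zoom=True).view()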
    
    
    
You can find many more examples of plotting with TRAPpy in the example
notebook available on GitHub:
https://github.com/ARM-software/trappy/blob/master/doc/Plotter.ipynb
In [274]:
    
df = ftrace.cpu_frequency.data_frame
df.head()
    
    Out[274]:
In [275]:
    
# Let's compute the residency on each OPP
df.loc[:,'start'] = df.index
# For each frequency-switch event, the residency is the time from this
# event to the next one (the last event gets NaN)
df.loc[:,'delta'] = (df['start'] - df['start'].shift()).fillna(0).shift(-1)
df.head()
    
    Out[275]:
In [276]:
    
import pandas as pd
# Now we can group by frequency and sum the partial residency times
freq_residencies = df.groupby('frequency')['delta'].sum()
logging.info("Residency time per OPP:")
df = pd.DataFrame(freq_residencies)
df
    
    
    Out[276]:
In [283]:
    
# Compute the relative residency time
tot = sum(freq_residencies)
#df = df.apply(lambda delta : 100*delta/tot)
for f in freq_residencies.index:
    logging.info("Freq %10dHz : %5.1f%%", f, 100*freq_residencies[f]/tot)
    
    
In [281]:
    
# Plot residency time
fig, axes = plt.subplots(1, 1, figsize=(16, 5));
df.plot(kind='bar', ax=axes)
    
    Out[281]: