This tutorial shows some examples of data analysis and visualization based on a set of events collected in a trace. These analyses are mainly supported by TRAPpy and standard Pandas APIs.
In [317]:
import logging
from conf import LisaLogging
LisaLogging.setup()
In [318]:
# Events we are interested in collecting from the trace
my_events = [
"sched_switch",
"sched_wakeup",
"sched_contrib_scale_f",
"sched_load_avg_cpu",
"sched_load_avg_task",
"sched_tune_tasks_update",
"sched_boost_cpu",
"sched_boost_task",
"sched_energy_diff",
"sched_overutilized",
"cpu_frequency",
"cpu_capacity"
]
# Let's use an example trace
trace_file = './example_results/trace.dat'
In [319]:
# Use TRAPpy support to convert a trace into a Pandas DataFrame
import trappy
ftrace = trappy.FTrace(
name="PandasExamples",
path=trace_file,
scope='custom',
events=my_events,
window=(0,None))
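As a side note, the window parameter can be used to parse only a portion of the trace. For example, a sketch reusing the same events list, if we cared only about the first five seconds:
# Hypothetical example: crop the parsed trace to the first 5 seconds
ftrace_5s = trappy.FTrace(
    path=trace_file,
    scope='custom',
    events=my_events,
    window=(0, 5.0))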
In [320]:
# NOTE: The interactive trace visualization is available only if you run
# the workload to generate a new trace-file
trappy.plotter.plot_trace(ftrace)
In [321]:
# Let's have a look at the generated file
!ls -la example_results
Notice that the binary trace has been used to generate both a textual trace (i.e. trace.txt) and a raw trace (trace.raw.txt).
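As a side note, here is a sketch of how these textual versions could be regenerated from the binary trace, assuming the trace-cmd utility is installed on the host (the -R switch reports events in their raw, unformatted form):
# Hypothetical example, assuming trace-cmd is available on the host
!trace-cmd report -i ./example_results/trace.dat > ./example_results/trace.txt
!trace-cmd report -R -i ./example_results/trace.dat > ./example_results/trace.raw.txt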
In [322]:
logging.info("Example of (formatted) 'sched_switch' event:")
with open('./example_results/trace.txt', 'r') as fh:
    for line in fh:
        if line.find('sched_switch') == -1:
            continue
        print line
        break
In [323]:
logging.info("Example of (raw) 'sched_switch' event:")
with open('./example_results/trace.raw.txt', 'r') as fh:
    for line in fh:
        if line.find('sched_switch') == -1:
            continue
        print line
        break
Every event corresponds to a "table" which is exposed as an attribute of the TRAPpy::FTrace object we created by parsing the trace.
We can use TRAPpy::FTrace::class_definitions, which is a dictionary tracking all the collected tables.
In [324]:
logging.info("List of events identified in the trace:")
for event in ftrace.class_definitions.keys():
    logging.info(" %s", event)
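Each table's DataFrame can also be reached programmatically via getattr. As a quick sketch, assuming every listed event has an associated table attribute, we can count how many events of each type have been parsed:
# Count the parsed events of each type, accessing each table
# through the attribute named after the event
for event in ftrace.class_definitions.keys():
    df = getattr(ftrace, event).data_frame
    logging.info(" %s: %d events", event, len(df))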
In [325]:
# Let's get a reference to the Pandas DataFrame corresponding to the
# "sched_switch" events
logging.info("The 'sched_switch' events are collected into this DataFrame:")
df = ftrace.sched_switch.data_frame
df#.head(2)
Out[325]:
NOTE: We can use head() to report only a limited number of events
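For example, to peek at only the first or last few events:
df.head(5)  # First 5 events only
df.tail(5)  # Last 5 events only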
In [326]:
# All events parsed in the trace have an associated DataFrame
logging.info("This is the DataFrame for 'cpu_frequency' events:")
df = ftrace.cpu_frequency.data_frame
df.head(2)
Out[326]:
A number of different operations can be executed on DataFrames; the following cells demonstrate some of the most common ones.
In [330]:
df = ftrace.sched_switch.data_frame
df.head(2)
Out[330]:
In [331]:
# The "ix" operator allow to select ranges of [rows:columns]
df = df.ix[:,'next_comm':'prev_state']
df.head(2)
Out[331]:
In [332]:
# We can also filter by listing the columns we are interested in
tasks = df[['next_pid', 'next_comm']]
logging.info("Found %d sched_switch events: ", len(tasks))
tasks.head(2)
Out[332]:
In [333]:
# Create a map of {PID: TaskName}
pid_to_task = {int(task[0]): task[1] for task in tasks.drop_duplicates().values}
logging.info("The trace has %s unique PIDs", len(pid_to_task))
pid_to_task
Out[333]:
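The inverse mapping can also be handy. A minimal sketch, noting that one task name can map to multiple PIDs (the helper name is purely illustrative):
# Hypothetical helper: build the inverse {TaskName: [PIDs]} map
task_to_pids = {}
for pid, name in pid_to_task.items():
    task_to_pids.setdefault(name, []).append(pid)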
In [118]:
# Group events by PID and compute descriptive statistics for each group
most_switching = df.groupby('next_pid').describe(include=['object'])
most_switching.head()
Out[118]:
In [119]:
most_switching = most_switching.unstack()
most_switching.head()
Out[119]:
In [124]:
most_switching = most_switching['next_comm']\
.sort_values(by=['count'], ascending=False)
most_switching.head()
Out[124]:
In [231]:
most_switching_pid = most_switching.index[1]
most_switching_task = most_switching.values[1][2]
task_name = "{}:{}".format(most_switching_pid, most_switching_task)
logging.info("The second most swithing task is: [%s]", task_name)
In [188]:
# Let's use the previous information to filter values of another DataFrame
# Here we use the events reporting the task utilization signal
df = ftrace.sched_load_avg_task.data_frame.ix[:,'comm':'util_sum']
logging.info("Total events: %d", len(df))
df.head()
Out[188]:
In [189]:
# Select only events for the task of interest
select = (df.pid == most_switching_pid)
task_events = df[select]
logging.info("Selected events: %d", len(task_events))
task_events.head()
Out[189]:
In [194]:
# We can also compose a query using multiple conditions
select = (((task_events.cpu != 1) & (task_events.cpu != 2)) & ~(task_events.util_avg < 450))
task_events = task_events[select]
logging.info("Selected events: %d", len(task_events))
task_events.head()
Out[194]:
In [198]:
# Row selection is also possible, which in our case corresponds to limiting
# the events to a specified time frame
task_events.ix[21.1:22.9,].head()
Out[198]:
In [218]:
# Let's make use of yet another DataFrame: the CPU frequencies
frequencies = ftrace.cpu_frequency.data_frame
select = (frequencies.cpu == 0)
frequencies = frequencies.ix[21.1:22.9,'cpu':'frequency'][select]
logging.info("Selected events: %d", len(frequencies))
frequencies.head()
Out[218]:
In [222]:
# Let's merge the task signal frame with the CPU frequencies
task_util_and_freq = task_events.join(
frequencies,
how='outer',
lsuffix='lx_')#[['util_avg', 'frequency']]
logging.info("Selected events: %d", len(task_util_and_freq))
task_util_and_freq.tail()
Out[222]:
In [223]:
# The new table has the union of events from the two original tables
len(task_events) + len(frequencies)
Out[223]:
In [224]:
# Let's propagate missing column values...
task_util_and_freq = task_util_and_freq.fillna(method='ffill')
task_util_and_freq.tail()
Out[224]:
In [226]:
# NOTE: missing values at the beginning can be fixed as well
# ...but that's beyond this demo!
# LISA has a set of methods which provide a complete and reliable
# solution for this kind of composition of data frames
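For completeness, a minimal sketch of how the leading missing values could be handled with plain Pandas, a backward fill complementing the forward fill above:
# Backward-fill the NaNs which the forward fill cannot reach
# (i.e. those at the very beginning of the frame)
task_util_and_freq = task_util_and_freq.fillna(method='bfill')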
In [227]:
# Enable generation of Notebook embedded plots
%pylab inline
In [229]:
task_util_and_freq[['util_avg', 'frequency']].plot(drawstyle="steps-post")
Out[229]:
In [260]:
import matplotlib.gridspec as gridspec
t_min = 21.5
t_max = 23.0
gs = gridspec.GridSpec(2, 1, height_ratios=[2,1]);
plt.figure(figsize=(16, 8));
plt.suptitle("Task Utilization vs Frequency",
y=.97, fontsize=16, horizontalalignment='center');
# Plot Utilization
data = task_util_and_freq['util_avg']
axes = plt.subplot(gs[0,0]);
data.plot(ax=axes, drawstyle='steps-post');
axes.set_title('Task [{}] Utilization'.format(task_name));
axes.set_ylabel('Utilization')
axes.set_xlim(t_min, t_max);
axes.grid(True);
axes.set_xticklabels([])
axes.set_xlabel('')
# Plot Frequencies
data = task_util_and_freq['frequency']/1000
axes = plt.subplot(gs[1,0]);
data.plot(ax=axes, drawstyle='steps-post');
axes.set_title('Frequencies');
axes.set_ylabel('MHz')
axes.set_xlim(t_min, t_max);
axes.grid(True);
Plots can be made much better with just a little bit of setup and by using the proper APIs.
That's why LISA aims at providing a predefined set of plot functions for commonly used analyses.
In [306]:
# Consider the data frame for the frequency switching events
logging.info("Table for 'cpu_frequency' events:")
ftrace.cpu_frequency.data_frame.head(1)
Out[306]:
In [309]:
# It is possible to mix multiple events in the same plot.
# The LinePlot module requires a list of signals to plot.
# Each signal is defined as:
#   <event>:<column>
# where:
#   <event>  is one of the events collected from the trace by the FTrace object
#   <column> is one of the columns of the previously defined event
my_signals = [
'cpu_frequency:frequency',
]
# These parameters are passed to the LinePlot call along with the
# TRAPpy FTrace object
trappy.LinePlot(
# FTrace object
ftrace,
# Signals to be plotted
signals=my_signals,
# Generate one plot for each value of the specified column
pivot='cpu',
# Generate only plots which satisfy these filters
filters = {
# Column
# | Values
# | |
'cpu' : [0, 1]
},
# Formatting style
drawstyle='steps-post',
marker = '+'
).view()
In [298]:
# We can also produce interactive plots
trappy.ILinePlot(
ftrace, # TRAPpy FTrace object
signals=[
'cpu_frequency:frequency'
],
drawstyle='steps-post',
per_line=1,
# sync_zoom=True,
# group='UtilVsFrequency',
marker = '+').view()
trappy.ILinePlot(
ftrace, # TRAPpy FTrace object
signals=[
'sched_load_avg_task:util_avg'
],
filters={
'pid': [most_switching_pid]
},
drawstyle='steps-post',
per_line=1,
# sync_zoom=True,
# group='UtilVsFrequency',
marker = '+').view()
You can find many more examples of plotting with TRAPpy in the example
notebook available on GitHub:
https://github.com/ARM-software/trappy/blob/master/doc/Plotter.ipynb
In [274]:
df = ftrace.cpu_frequency.data_frame
df.head()
Out[274]:
In [275]:
# Let's compute the residency on each OPP
df.loc[:,'start'] = df.index
df.loc[:,'delta'] = (df['start'] - df['start'].shift()).fillna(0).shift(-1)
df.head()
Out[275]:
In [276]:
import pandas as pd
# Now we can group by frequency and sum the partial residency times
freq_residencies = df.groupby('frequency')['delta'].sum()
logging.info("Residency time per OPP:")
df = pd.DataFrame(freq_residencies)
df
Out[276]:
In [283]:
# Compute the relative residency time
tot = sum(freq_residencies)
#df = df.apply(lambda delta : 100*delta/tot)
for f in freq_residencies.index:
    logging.info("Freq %10dHz : %5.1f%%", f, 100*freq_residencies[f]/tot)
In [281]:
# Plot residency time
fig, axes = plt.subplots(1, 1, figsize=(16, 5));
df.plot(kind='bar', ax=axes)
Out[281]:
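A pie chart is another common way to look at relative residencies. A quick sketch:
# Relative OPP residencies as a pie chart
fig, axes = plt.subplots(1, 1, figsize=(6, 6));
df['delta'].plot(kind='pie', ax=axes, autopct='%.1f%%')
axes.set_ylabel('');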