In [1]:
import pandas as pd
# Load the blame log CSV (the path is relative to the notebook's working
# directory) and preview the first rows.
blame_log = pd.read_csv(
    "../demos/dataset/linux_blame_log.csv"
)
blame_log.head()
Out[1]:
In [2]:
# Print a summary of the frame (columns, dtypes, non-null counts, memory use).
blame_log.info()
In [5]:
# Count the rows per author and keep the ten most frequent ones
# (value_counts already sorts in descending order).
top10 = (
    blame_log["author"]
    .value_counts()
    .head(10)
)
top10
Out[5]:
In [14]:
%matplotlib inline
top10_authors.plot.pie();
In [7]:
# Replace the raw `timestamp` column with parsed pandas datetimes.
blame_log["timestamp"] = pd.to_datetime(blame_log["timestamp"])
blame_log.head()
Out[7]:
In [8]:
# Age of every row = time elapsed between its timestamp and "now".
# NOTE: the result depends on when the notebook is executed.
blame_log["age"] = pd.Timestamp('today') - blame_log["timestamp"]
blame_log.head()
Out[8]:
In [9]:
# Derive a coarse "component" label from the first two path segments,
# e.g. "a/b/c.c" -> "a:b".
blame_log["component"] = (
    blame_log["path"]
    .str.split("/")
    .str[:2]
    .str.join(":")
)
blame_log.head()
Out[9]:
In [10]:
# For each component take the smallest age (i.e. its most recent row),
# then order the components from youngest to oldest.
age_per_component = (
    blame_log
    .groupby("component")["age"]
    .min()
    .sort_values()
)
age_per_component.head()
Out[10]:
These are the 10 oldest components, i.e. the components whose most recent change lies furthest in the past:
In [11]:
# The series is sorted in ascending order, so the last ten entries are the
# components with the largest minimum age.
age_per_component.tail(10)
Out[11]:
For all components, we create an overview with a bar chart.
In [13]:
# Bar chart of the minimum age for every component.
# The series holds timedeltas, so convert to whole days first: this gives the
# value axis a readable unit instead of raw timedelta values.
ax = age_per_component.dt.days.plot.bar(
    figsize=[15, 5],
    title="Age of the most recent change per component",
)
ax.set_xlabel("component")
# The trailing semicolon suppresses the repr of the returned Text object.
ax.set_ylabel("age in days");
Out[13]: