In [ ]:
import pandas as pd

# Load the blame log export into a DataFrame and preview the first rows.
blame_log_csv = "../dataset/linux_blame_log.csv"
log = pd.read_csv(blame_log_csv)
log.head()

In [ ]:
# Baseline report of column dtypes and memory footprint;
# memory_usage='deep' also measures the contents of object columns.
log.info(memory_usage='deep')

In [ ]:
# Store the author and path columns as categoricals, then preview the result.
log['author'] = log['author'].astype('category')
log['path'] = log['path'].astype('category')
log.head()

In [ ]:
# Same report again after the categorical conversion, for comparison with
# the earlier log.info call.
log.info(memory_usage='deep')

In [ ]:
%matplotlib inline
log.author.value_counts().head(10).plot.pie();

In [ ]:
# Parse the timestamp column into pandas datetimes so it can be compared
# with and subtracted from other timestamps.
log['timestamp'] = pd.to_datetime(log['timestamp'])
log.head()

In [ ]:
# Boolean mask: True for rows whose timestamp is later than six months
# before the moment this cell runs.
six_months_ago = pd.Timestamp('today') - pd.DateOffset(months=6)
knowledge = log.timestamp > six_months_ago
knowledge.value_counts()

In [ ]:
# Fraction of rows inside the six-month window (the mean of a boolean
# Series is the share of True values).
knowledge.mean()

In [ ]:
# Pie chart of the True/False counts of the mask; the trailing semicolon
# suppresses the Axes repr.
knowledge.value_counts().plot.pie();

In [ ]:
# Derive a component label from the first two "/"-separated segments of
# each path, joined with ":".
path_segments = log.path.str.split("/")
log['component'] = path_segments.str[:2].str.join(":")
log.head()

In [ ]:
# Row counts of the most frequent components (value_counts sorts in
# descending order; head shows the first five).
log.component.value_counts().head()

In [ ]:
# Age of every row: time elapsed between its timestamp and the moment this
# cell runs.
reference_time = pd.Timestamp('today')
log['age'] = reference_time - log.timestamp
log.head()

In [ ]:
# Average age across all rows.
log.age.mean()

In [ ]:
# Summary statistics of the age column.
log.age.describe()

In [ ]:
# Smallest age per component, i.e. the age of each component's most recent
# timestamp, sorted ascending. The native GroupBy.min() is used instead of
# passing the Python builtin `min` to apply(): it is vectorised and avoids
# the FutureWarning newer pandas raises for builtin callables.
age_per_component = log.groupby('component')['age'].min().sort_values()
age_per_component.head()

In [ ]:
# The ten components with the largest minimum age, i.e. those whose most
# recent timestamp is the oldest (the Series is sorted ascending).
age_per_component.tail(10)

In [ ]:
# Bar chart of the minimum age per component, in the ascending order set by
# sort_values; the trailing semicolon suppresses the Axes repr.
age_per_component.plot.bar(figsize=[15,5]);

In [ ]:
# Re-index the log by its timestamp column and preview the result.
# NOTE(review): the following cell rebinds `log_timed` to a grouped Series
# built from `log` (not from this frame), so this result only feeds the
# preview below - confirm whether this cell is still needed.
log_timed = log.set_index('timestamp')
log_timed.head()

In [ ]:
# Count non-null `line` values per (month-end period, component) pair.
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME';
# kept as-is here so older pandas versions keep working.
by_month = pd.Grouper(key='timestamp', freq='M')
log_timed = log.groupby([by_month, 'component'])['line'].count()
log_timed.head()

In [ ]:
# Pivot the innermost index level (component) into columns, giving one row
# per period; period/component pairs with no rows become 0.
counts_wide = log_timed.unstack()
component_history = counts_wide.fillna(0)
component_history.head()

In [ ]:
# Convert the per-period counts into shares of each period's total.
# The row totals are computed once and divided in with axis=0, which aligns
# them on the row index, instead of recomputing the sum for every column.
period_totals = component_history.sum(axis=1)
relative_history = component_history.div(period_totals, axis=0)
relative_history.head()

In [ ]:
# Stacked area chart of each component's share over time. The legend is
# hidden; the trailing semicolon suppresses the Axes repr, as in the other
# plotting cells.
relative_history.plot.area(legend=False, figsize=[15,8]);