In [1]:
import pandas as pd
# Load the git-blame export of the Linux kernel sources into a DataFrame.
# One row per surviving source line (author, path, timestamp, line, ...).
log = pd.read_csv("../dataset/linux_blame_log.csv")
log.head()
Out[1]:
In [2]:
log.info(memory_usage='deep')
In [3]:
# Convert the low-cardinality string columns to the 'category' dtype to cut
# memory usage (compare the info(memory_usage='deep') cells before/after).
# Bracket assignment + astype is the documented idiom; attribute-style
# assignment (log.author = ...) only works here because the columns already
# exist and can silently create a plain attribute otherwise.
log['author'] = log['author'].astype('category')
log['path'] = log['path'].astype('category')
log.head()
Out[3]:
In [4]:
log.info(memory_usage='deep')
In [5]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Top 10 authors by number of surviving lines; trailing ';' suppresses the Axes repr.
log.author.value_counts().head(10).plot.pie();
In [6]:
# Parse the raw timestamp strings into datetime64 values so the date
# arithmetic and time-based grouping below work. Bracket assignment is the
# documented way to set a column; attribute-style assignment can silently
# create an instance attribute instead of a column.
log['timestamp'] = pd.to_datetime(log['timestamp'])
log.head()
Out[6]:
In [7]:
# Boolean mask: True for lines last touched within the last six months —
# a rough proxy for "fresh knowledge" about the code base.
six_months_ago = pd.Timestamp('today') - pd.DateOffset(months=6)
knowledge = log.timestamp > six_months_ago
knowledge.value_counts()
Out[7]:
In [8]:
knowledge.mean()
Out[8]:
In [9]:
knowledge.value_counts().plot.pie();
In [10]:
# Derive a coarse "component" label from the first two path segments,
# e.g. "drivers/net/..." -> "drivers:net".
first_two_dirs = log.path.str.split("/").str[:2]
log['component'] = first_two_dirs.str.join(":")
log.head()
Out[10]:
In [11]:
log.component.value_counts().head()
Out[11]:
In [12]:
# Age of each line: elapsed time between now and its last modification.
now = pd.Timestamp('today')
log['age'] = now - log.timestamp
log.head()
Out[12]:
In [13]:
log.age.mean()
Out[13]:
In [14]:
log.age.describe()
Out[14]:
In [15]:
# Youngest line per component = time since that component was last touched.
# Use the built-in groupby .min() instead of apply(min): identical result,
# but vectorized rather than calling Python's min once per group.
age_per_component = log.groupby('component').age.min().sort_values()
age_per_component.head()
Out[15]:
In [16]:
age_per_component.tail(10)
Out[16]:
In [17]:
age_per_component.plot.bar(figsize=[15,5]);
In [18]:
# Index the log by commit timestamp for time-based access.
# NOTE(review): log_timed is rebuilt from scratch by the next cell's groupby
# on `log`, so this indexed frame is display-only here — consider removing
# this cell or actually reusing the indexed frame.
log_timed = log.set_index('timestamp')
log_timed.head()
Out[18]:
In [19]:
# Count changed lines per calendar month and component.
# NOTE(review): freq='M' (month end) is deprecated in pandas >= 2.2 in
# favour of 'ME' — confirm the pandas version before changing it.
monthly = pd.Grouper(key='timestamp', freq='M')
log_timed = log.groupby([monthly, 'component']).line.count()
log_timed.head()
Out[19]:
In [20]:
# Pivot to a wide frame: one row per month, one column per component,
# with 0 for months where a component saw no changes.
component_history = log_timed.unstack().fillna(0)
component_history.head()
Out[20]:
In [21]:
# Normalize each month's counts to fractions of that month's total activity.
# DataFrame.div with axis=0 divides every row by its own sum — the same
# result the original column-wise apply produced via index alignment, but
# vectorized and easier to read.
relative_history = component_history.div(component_history.sum(axis=1), axis=0)
relative_history.head()
Out[21]:
In [22]:
relative_history.plot.area(legend=False, figsize=[15,8])
Out[22]: