In [5]:
import pandas as pd
# Load the blame log from the CSV (path is relative to the notebook's working
# directory) into a DataFrame and preview the first rows.
git_blame = pd.read_csv("../dataset/linux_blame_log.csv")
git_blame.head()
Out[5]:
In [7]:
# Column dtypes and non-null counts; 'deep' makes pandas measure the real
# size of object (string) columns instead of reporting a lower-bound estimate.
git_blame.info(memory_usage='deep')
In [9]:
# Store the repetitive string columns as categoricals: each distinct value is
# kept once and rows hold only small integer codes.
# Bracket assignment is used because attribute-style assignment only works
# while the column already exists and does not clash with a DataFrame attribute.
git_blame['path'] = git_blame['path'].astype('category')
git_blame['author'] = git_blame['author'].astype('category')

# Report memory with 'deep' so any remaining object columns are measured at
# their real size; this keeps the figure comparable with the earlier
# info(memory_usage='deep') call made before the conversion.
git_blame.info(memory_usage='deep')
In [16]:
# Parse the timestamp column into pandas datetimes so it can be compared
# against a cutoff date later on.
# NOTE(review): the raw encoding of `timestamp` is not visible here; confirm
# that pd.to_datetime's default interpretation matches the CSV.
git_blame['timestamp'] = pd.to_datetime(git_blame['timestamp'])
git_blame.head()
Out[16]:
In [20]:
# Number of rows per author; the trailing semicolon suppresses the cell output.
git_blame['author'].value_counts();
In [22]:
# Cutoff date: six calendar months before the moment this cell is executed.
# NOTE(review): the cutoff follows wall-clock time, so results change with the
# run date even though the CSV is static; consider anchoring it to the newest
# timestamp in the data instead.
six_months_ago = pd.Timestamp.now() - pd.DateOffset(months=6)
six_months_ago
Out[22]:
In [25]:
# Flag every row whose timestamp is at or after the six-month cutoff.
git_blame['knowing'] = git_blame['timestamp'].ge(six_months_ago)
git_blame.head()
Out[25]:
In [29]:
# Rows per author, restricted to rows flagged as `knowing`; the trailing
# semicolon suppresses the cell output.
git_blame.loc[git_blame['knowing'], 'author'].value_counts();
In [31]:
# value_counts() sorts by frequency (descending), so the first entry is the
# path with the most rows.
git_blame['path'].value_counts().iloc[:1]
Out[31]:
In [32]:
# Derive a coarse "component" label from each path: keep at most the first two
# "/"-separated segments and join them with ":" (e.g. "a/b/c.c" -> "a:b").
git_blame['component'] = (
    git_blame['path']
    .str.split("/")
    .str.slice(stop=2)
    .str.join(":")
)
git_blame.head()
Out[32]:
In [35]:
# Per component, the mean of the boolean `knowing` flag, i.e. the fraction of
# rows at or after the cutoff, sorted ascending so the lowest shares come first.
components = (
    git_blame
    .groupby('component')['knowing']
    .mean()
    .sort_values()
)
components.head()
Out[35]:
In [37]:
%matplotlib inline
components.plot.barh(figsize=[5,20])
Out[37]: