In [1]:
import pandas as pd
# Read the blame log CSV into a DataFrame and preview its first rows.
# Later cells rely on it having `path`, `author` and `timestamp` columns.
blame_log_csv = "../dataset/linux_blame_log.csv"
git_blame = pd.read_csv(blame_log_csv)
git_blame.head()
Out[1]:
In [2]:
# Print the frame summary (columns, dtypes, non-null counts).
# memory_usage="deep" asks pandas to inspect object columns for their real
# memory consumption instead of reporting a shallow estimate.
git_blame.info(memory_usage="deep")
In [3]:
# Re-encode the `path` and `author` columns as categoricals, then print the
# frame summary again with deep memory accounting so the footprint can be
# compared with the earlier info() report.
for column in ("path", "author"):
    git_blame[column] = pd.Categorical(git_blame[column])
git_blame.info(memory_usage="deep")
In [4]:
%matplotlib inline
git_blame.author.value_counts().head(10).plot.pie()
Out[4]:
In [5]:
# Parse the `timestamp` column into pandas datetimes so it can be compared
# against a pd.Timestamp cutoff.
git_blame["timestamp"] = pd.to_datetime(git_blame["timestamp"])
In [6]:
# Cutoff for "recent" rows: one calendar year before the moment this cell runs.
# NOTE: the value comes from the wall clock, so it (and everything derived
# from it) changes with the date on which the notebook is executed.
one_year = pd.DateOffset(years=1)
a_year_ago = pd.Timestamp("today") - one_year
a_year_ago
Out[6]:
In [7]:
# Flag every row whose timestamp is at or after the one-year cutoff.
# NOTE(review): this assumes the parsed timestamps and `a_year_ago` agree on
# timezone-awareness; pandas refuses to compare tz-aware with tz-naive
# values - confirm against the data.
git_blame['knowing'] = git_blame['timestamp'].ge(a_year_ago)
git_blame.head()
Out[7]:
In [8]:
# Keep only the rows flagged as `knowing` (boolean-mask selection) and
# preview the result.
knowledge_carrier = git_blame.loc[git_blame['knowing']]
knowledge_carrier.head()
Out[8]:
In [9]:
# Share of the `knowing` rows attributed to each author: per-author row
# counts divided by the total number of `knowing` rows; show the ten largest.
rows_per_author = knowledge_carrier.author.value_counts()
share_per_author = rows_per_author / len(knowledge_carrier)
share_per_author.head(10)
Out[9]:
In [10]:
# Number of rows per path, most frequent first; show the top five.
git_blame['path'].value_counts().head(5)
Out[10]:
In [11]:
# Derive a `component` label from each path: split on "/", keep at most the
# first two segments and glue them together with ":" (a path with a single
# segment keeps just that segment).
path_segments = git_blame.path.str.split("/")
git_blame['component'] = path_segments.str[:2].str.join(":")
git_blame.head()
Out[11]:
In [12]:
# Show how many rows fall into each component, largest first.
# No trailing ";" here: it would suppress the cell output, and since the
# result is not assigned anywhere the cell would then have no effect at all.
# head(10) keeps the displayed table short.
git_blame.component.value_counts().head(10)
In [13]:
# Average the boolean `knowing` flag within each component; the mean of a
# boolean column is the fraction of rows that are True. The double brackets
# keep the result a one-column DataFrame rather than a Series.
rows_by_component = git_blame.groupby('component')
knowledge_per_component = rows_by_component[['knowing']].mean()
knowledge_per_component.head()
Out[13]:
In [14]:
# Horizontal bar chart of the `knowing` fraction per component, sorted
# ascending; the tall figure size leaves room for one bar per component.
knowing_sorted = knowledge_per_component['knowing'].sort_values()
knowing_sorted.plot.barh(figsize=[5, 20])
Out[14]: