In [1]:
import pandas as pd
# Location of the exported git log, relative to this notebook.
# NOTE(review): the file name suggests `git log --numstat` output — confirm.
GIT_LOG_CSV = "../datasets/git_log_numstat_dropover.csv"

# Read the whole log into a DataFrame; later cells rely on its 'file' and
# 'author' columns.
log = pd.read_csv(GIT_LOG_CSV)
log.head()
Out[1]:
In [2]:
# Keep only production Java sources of the backend and drop the
# package-info.java files.
#
# Both patterns are plain substrings, so match them literally (regex=False);
# with the default regex matching the "." in "package-info.java" would match
# any character. na=False treats rows with a missing file name as
# non-matching instead of producing a mask that cannot be used for indexing.
is_backend_java = log['file'].str.contains(
    "backend/src/main/java/", regex=False, na=False)
is_package_info = log['file'].str.contains(
    "package-info.java", regex=False, na=False)

# Take the copy AFTER all filtering so that java_prod is an independent frame;
# later cells add columns to it, which would otherwise be flagged with a
# SettingWithCopyWarning.
java_prod = log[is_backend_java & ~is_package_info].copy()
java_prod.head()
Out[2]:
In [3]:
# Share of all production-Java change entries contributed by each author.
# normalize=True divides each author's count by the total number of non-null
# authors, which is what the manual value_counts() / count() computes.
# Note: on pandas >= 2.0 the resulting Series is named "proportion" rather
# than "count"; values and index are the same.
author_knowledge = java_prod['author'].value_counts(normalize=True)
author_knowledge
Out[3]:
Which author "knows" which module?
In [4]:
# Position of the module directory within the "/"-separated file path.
# NOTE(review): assumes every path has the module name as its 7th segment
# (e.g. backend/src/main/java/<a>/<b>/<module>/...) — confirm against the data.
MODULE_PATH_INDEX = 6

path_parts = java_prod['file'].str.split("/")
java_prod['module'] = path_parts.str.get(MODULE_PATH_INDEX)

# Constant marker column so that each row can be counted in a groupby.
java_prod['commit'] = 1
java_prod.head()
Out[4]:
In [5]:
# Number of rows per (module, author) pair.
commit_counts = java_prod.groupby(['module', 'author'])[['commit']].count()

# Total number of rows per module, broadcast back onto every
# (module, author) row so it can be used as the denominator.
module_totals = commit_counts.groupby(level='module')['commit'].transform('sum')

# 'all'   -> rows in the whole module
# 'ratio' -> the author's share of the module's rows
knowledge_per_module = commit_counts.assign(
    all=module_totals,
    ratio=commit_counts['commit'] / module_totals,
)
knowledge_per_module.head()
Out[5]:
In [6]:
%matplotlib inline
knowledge_per_module.unstack()['ratio'].plot.bar(stacked=True);