In [172]:
%matplotlib inline
In [173]:
from git import Repo
from functools import partial
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import saapy.util
from saapy.vcs import check_file_move
In [186]:
# Notebook-wide plot styling: seaborn's "notebook" context with fonts scaled
# by 1.5 and a default line width of 2.5.
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
# Default matplotlib figure size (width, height) for every figure below.
plt.rcParams['figure.figsize'] = (16, 6)
In [2]:
def get_commits(repo, revs, paths='', **kwargs):
    """Collect the commits reachable from several revisions, without repeats.

    Every revision in ``revs`` is walked with ``repo.iter_commits``. A commit
    is kept the first time its ``hexsha`` is seen and ignored afterwards.

    Args:
        repo: object exposing ``iter_commits(rev=..., paths=..., **kwargs)``.
        revs: iterable of revision specifiers, walked in the given order.
        paths: forwarded unchanged to ``iter_commits``.
        **kwargs: extra keyword arguments forwarded to ``iter_commits``.

    Returns:
        list of commit objects in first-seen order.
    """
    seen_shas = set()
    unique_commits = []
    for revision in revs:
        history = repo.iter_commits(rev=revision, paths=paths, **kwargs)
        for candidate in history:
            sha = candidate.hexsha
            if sha not in seen_shas:
                seen_shas.add(sha)
                unique_commits.append(candidate)
    return unique_commits
In [93]:
def extract_actors(commits, actor_type, attrs):
    """Count commits per distinct actor of one role.

    Args:
        commits: iterable of commit objects.
        actor_type: attribute prefix on each commit, e.g. ``'author'`` or
            ``'committer'``.
        attrs: attribute names read from that actor, e.g. ``('name', 'email')``.

    Returns:
        DataFrame with one row per distinct combination of ``attrs`` (columns
        named after ``attrs``) plus a ``'<actor_type>_commits'`` column holding
        the group size, sorted by the attribute columns.
    """
    # Dotted attribute paths such as 'author.name'; presumably resolved on each
    # commit by saapy.util.objs_to_dicts -- confirm against that helper.
    qualified = ['{}.{}'.format(actor_type, attr) for attr in attrs]
    records = list(saapy.util.objs_to_dicts(commits, qualified))
    frame = saapy.util.dicts_to_dataframe(records)

    counts = frame.groupby(by=qualified).size()
    counts = counts.reset_index().sort_values(qualified)

    # reset_index() names the size column 0; give it and the dotted columns
    # their final names.
    renames = dict(zip(qualified, attrs))
    renames[0] = '{}_commits'.format(actor_type)
    counts.rename(columns=renames, inplace=True)
    return counts
def commits_to_actor_frame(commits):
    """Build one table of everybody who authored or committed ``commits``.

    Author and committer counts are computed separately with
    ``extract_actors`` and outer-joined on (name, email), so an actor who
    appears in only one role gets 0 for the other.

    Returns:
        DataFrame with categorical ``name`` / ``email`` columns and integer
        ``author_commits`` / ``committer_commits`` columns.
    """
    identity = ('name', 'email')
    authors, committers = [
        extract_actors(commits, role, identity)
        for role in ('author', 'committer')
    ]

    merged = pd.merge(authors, committers, on=identity, how='outer')
    deduped = merged.drop_duplicates()
    # Missing counts from the outer join become 0.
    actors = deduped.reset_index(drop=True).fillna(0)

    for column in identity:
        actors[column] = actors[column].astype('category')
    for column in ('author_commits', 'committer_commits'):
        actors[column] = actors[column].astype('int')
    return actors
def connect_actors(actor_frame, connectivity_sets, connectivity_column):
    """Label rows of ``actor_frame`` with the group they belong to.

    Args:
        actor_frame: DataFrame to label; it is modified in place.
        connectivity_sets: mapping of group id -> iterable of ``actor_frame``
            index labels that belong to that group. If a label appears in
            several groups, the last group wins.
        connectivity_column: name of the column to write.

    Returns:
        The same ``actor_frame`` object, with the new categorical column.
        Rows whose index label is in no group receive a missing value.
    """
    member_to_group = {
        member: group_id
        for group_id, members in connectivity_sets.items()
        for member in members
    }
    labels = pd.Series(member_to_group)
    # Assignment aligns on the frame's index, not on position.
    actor_frame[connectivity_column] = labels.astype('category')
    return actor_frame
def combine_actors(actor_frame, connectivity_column):
    """Collapse actor rows that share a value in ``connectivity_column``.

    Each group keeps its first ``name`` and ``email`` and the sum of its
    ``author_commits`` and ``committer_commits``.

    Returns:
        DataFrame indexed by the distinct values of ``connectivity_column``.
    """
    how_to_combine = dict(
        name='first',
        email='first',
        author_commits='sum',
        committer_commits='sum',
    )
    grouped = actor_frame.groupby(connectivity_column)
    return grouped.agg(how_to_combine)
In [4]:
def refs_to_ref_frame(git_refs):
    """Tabulate git refs, one row per ref.

    Reads ``__class__.__name__``, ``name``, ``path`` and ``commit.hexsha`` from
    each ref (via saapy.util.objs_to_dicts) and exposes them as the columns
    ``ref_type``, ``name``, ``path`` and ``commit``.
    """
    source_to_column = {
        '__class__.__name__': 'ref_type',
        'name': 'name',
        'path': 'path',
        'commit.hexsha': 'commit',
    }
    records = list(
        saapy.util.objs_to_dicts(git_refs, source_to_column.keys()))
    ref_frame = saapy.util.dicts_to_dataframe(records)
    ref_frame.rename(columns=source_to_column, inplace=True)
    return ref_frame
In [42]:
def commits_to_frame(commits):
    """Tabulate commits, one row per commit, newest commit first.

    The attributes listed in ``commit_attrs`` are read from every commit (via
    saapy.util.objs_to_dicts); dots in attribute paths become underscores in
    the column names (``author.name`` -> ``author_name``).

    Post-processing applied to the frame:
      * ``name_rev`` keeps only the text after its first space;
      * name/email/encoding columns become categoricals;
      * the two datetime columns are cast to ``datetime64[ns]``;
      * newlines in ``message`` are replaced by the two characters ``\\n``;
      * rows are sorted by ``committed_datetime`` descending and re-indexed.

    Returns:
        DataFrame of commits. The ``stats_files`` column is passed through
        untouched from the commit objects.
    """
    commit_attrs = (
        'hexsha', 'name_rev', 'size',
        'author.name', 'author.email',
        'authored_datetime', 'author_tz_offset',
        'committer.name', 'committer.email',
        'committed_datetime', 'committer_tz_offset',
        'encoding', 'message',
        'stats.total.files', 'stats.total.lines',
        'stats.total.insertions', 'stats.total.deletions',
        'stats.files')
    column_names = {attr: attr.replace('.', '_') for attr in commit_attrs}
    commit_frame = saapy.util.dicts_to_dataframe(list(
        saapy.util.objs_to_dicts(commits, commit_attrs)))
    commit_frame.rename(columns=column_names, inplace=True)

    # Split once on the first space and keep the tail. `n` is passed by
    # keyword because current pandas no longer accepts it positionally, and
    # `.str[-1]` (unlike apply/lambda) tolerates missing values.
    commit_frame['name_rev'] = (
        commit_frame['name_rev'].str.split(' ', n=1).str[-1])

    categorical_cols = (
        'name_rev', 'author_name', 'author_email',
        'committer_name', 'committer_email', 'encoding')
    for c in categorical_cols:
        commit_frame[c] = commit_frame[c].astype('category')
    for c in ('authored_datetime', 'committed_datetime'):
        commit_frame[c] = commit_frame[c].astype('datetime64[ns]')

    # Literal (non-regex) replacement so the result does not depend on the
    # pandas version's default for `regex`.
    commit_frame['message'] = commit_frame['message'].str.replace(
        '\n', '\\n', regex=False)

    commit_frame = commit_frame.sort_values(
        'committed_datetime', ascending=False).reset_index(drop=True)
    return commit_frame
In [57]:
def commit_parents_to_frame(commits):
    """List child -> parent links between commits.

    Returns:
        DataFrame with columns ``hexsha`` and ``parent_hexsha``: one row per
        parent of each commit, or a single row with ``parent_hexsha=None`` for
        a commit that has no parents.
    """
    edges = []
    for commit in commits:
        child_sha = commit.hexsha
        # A parentless commit still contributes one row, with a None parent.
        parent_shas = [parent.hexsha for parent in commit.parents] or [None]
        for parent_sha in parent_shas:
            edges.append({'hexsha': child_sha, 'parent_hexsha': parent_sha})
    return pd.DataFrame(edges, columns=['hexsha', 'parent_hexsha'])
In [125]:
def insert_actor_ids(commit_frame, actor_frame, drop_name_email=True):
    """Add ``author`` and ``committer`` actor-id columns to ``commit_frame``.

    Each commit's (author_name, author_email) and (committer_name,
    committer_email) pair is looked up in ``actor_frame`` on (name, email) and
    the matching ``actor_id`` is inserted at column positions 3 and 4.
    Pairs with no match get a missing value.

    Args:
        commit_frame: frame with the four ``*_name`` / ``*_email`` columns and
            at least three columns ahead of the insertion point. Modified in
            place.
        actor_frame: frame with ``name``, ``email`` and ``actor_id`` columns.
        drop_name_email: when true, remove the four name/email columns after
            the ids have been inserted.

    Returns:
        The same ``commit_frame`` object.

    Raises:
        ValueError: from ``DataFrame.insert`` if ``author`` / ``committer``
            already exist (e.g. the function is applied twice).
    """
    actor_columns = ['author_name', 'author_email',
                     'committer_name', 'committer_email']
    # One row per (name, email) so the left merge below cannot multiply
    # commit rows when the same pair is listed more than once.
    lookup = actor_frame[['name', 'email', 'actor_id']].drop_duplicates(
        ['name', 'email'])

    def _lookup_ids(name_col, email_col):
        """Return actor ids for one role, in commit_frame row order."""
        keys = commit_frame[[name_col, email_col]]
        merged = pd.merge(
            keys, lookup,
            left_on=[name_col, email_col],
            right_on=['name', 'email'],
            how='left')
        # The merge result carries a fresh 0..n-1 index. Hand back bare values
        # so insertion is positional and does not depend on commit_frame's
        # own index labels.
        return merged['actor_id'].values

    # Resolve both roles before touching commit_frame, so a failed lookup
    # does not leave it half-modified.
    author = _lookup_ids('author_name', 'author_email')
    committer = _lookup_ids('committer_name', 'committer_email')

    commit_frame.insert(3, 'author', author)
    commit_frame.insert(4, 'committer', committer)
    if drop_name_email:
        commit_frame.drop(actor_columns, axis=1, inplace=True)
    return commit_frame
In [156]:
def commit_trees_to_frame(commits):
    """Stack the tree listings of several commits into one frame.

    Each commit is expanded with ``commit_tree_to_frame`` and the results are
    concatenated. The ``hexsha``, ``tree``, ``child`` and ``child_type``
    columns are converted to categoricals and the index is reset.

    Args:
        commits: iterable of commit objects; may be empty.

    Returns:
        DataFrame with columns ``tree``, ``child``, ``child_type`` and
        ``hexsha``. An empty ``commits`` yields an empty frame with those
        columns instead of the ValueError ``pd.concat`` raises when it is
        given nothing to concatenate.
    """
    per_commit = [commit_tree_to_frame(c) for c in commits]
    if per_commit:
        frame = pd.concat(per_commit)
    else:
        frame = pd.DataFrame(
            columns=['tree', 'child', 'child_type', 'hexsha'])
    cat_columns = ('hexsha', 'tree', 'child', 'child_type')
    for col in cat_columns:
        frame[col] = frame[col].astype('category')
    frame.reset_index(inplace=True, drop=True)
    return frame
def commit_tree_to_frame(commit):
    """List every entry of one commit's tree.

    The tree is walked recursively with ``_add_subtree``, starting from
    ``commit.tree`` under the path label ``'.'``.

    Returns:
        DataFrame with columns ``tree`` (containing directory), ``child``
        (entry path), ``child_type`` (``'tree'`` or ``'blob'``, categorical)
        and ``hexsha`` (the commit's hash on every row).
    """
    tree_dicts = []
    _add_subtree(tree_dicts, commit.tree, '.')
    # Name the columns explicitly so a commit whose tree has no entries still
    # produces a frame with a 'child_type' column instead of a KeyError below.
    tree_frame = pd.DataFrame(
        tree_dicts, columns=['tree', 'child', 'child_type'])
    tree_frame['hexsha'] = commit.hexsha
    tree_frame['child_type'] = tree_frame['child_type'].astype('category')
    return tree_frame
def _add_subtree(tree_dicts, tree, tree_path):
tree_dicts.extend((
dict(tree=tree_path, child=subtree.path, child_type='tree')
for subtree in tree.trees))
tree_dicts.extend((
dict(tree=tree_path, child=blob.path, child_type='blob')
for blob in tree.blobs))
for subtree in tree.trees:
_add_subtree(tree_dicts, subtree, subtree.path)
In [6]:
# Open the local clone analysed by the rest of the notebook.
# NOTE(review): the path is specific to one machine; adjust before re-running.
repo = Repo('~/Projects/3party/povray')
In [7]:
# Walk history from the commit each ref points at; get_commits keeps a commit
# only the first time it is reached.
commits = get_commits(repo, (ref.commit.hexsha for ref in repo.refs))
In [8]:
# One row per ref (ref_type, name, path, commit).
ref_frame = refs_to_ref_frame(repo.refs)
# Persist for later sessions; assumes the povray-data/ directory already exists.
ref_frame.to_feather('povray-data/ref_frame.feather')
# Display ten random rows.
ref_frame.sample(n=10)
Out[8]:
In [97]:
# One row per distinct (name, email) with author and committer commit counts.
actor_frame = commits_to_actor_frame(commits)
# Persist for later sessions; assumes the povray-data/ directory already exists.
actor_frame.to_feather('povray-data/actor_frame.feather')
actor_frame
Out[97]:
In [98]:
# Hand-curated identity groups: each key is the actor id to assign, each list
# holds actor_frame index labels (row numbers from the table displayed above)
# that belong to that person.
# NOTE(review): the numbers depend on the row order of actor_frame; re-check
# them whenever the set of commits changes.
same_actors = {
'ccason': [3, 14, 15], 'clipka': [4, 5, 13],
'wfpokorny': [11, 17], 'anshuarya': [0],
'bentsm': [1], 'cbarton': [2], 'dbodor': [6],
'jlecher': [7], 'jgrimbert': [8], 'nalvarez': [9],
'selvik': [10], 'wverhelst': [12], 'gryken': [16],
'github': [18]}
# connect_actors writes the 'actor_id' column into actor_frame in place and
# returns the same object.
actor_frame = connect_actors(actor_frame, same_actors, 'actor_id')
actor_frame
Out[98]:
In [99]:
# Display one row per actor_id: first name/email of the group, summed counts.
combine_actors(actor_frame, 'actor_id')
Out[99]:
In [45]:
commit_frame = commits_to_frame(commits)
# Keep the per-commit file statistics aside, keyed by commit hash; they are
# expanded into one row per file in a later cell.
stats_files_frame = commit_frame[['hexsha', 'stats_files']]
# Remove the column before saving.
# NOTE(review): presumably because its values cannot be written to feather -- confirm.
commit_frame.drop('stats_files', axis=1, inplace=True)
commit_frame.to_feather('povray-data/commit_frame.feather')
commit_frame.head()
Out[45]:
In [13]:
# Display summary statistics of the commit table.
commit_frame.describe()
Out[13]:
In [47]:
# Inspect the raw per-file stats of the row labelled 0 (the most recently
# committed commit, given the descending sort in commits_to_frame).
stats_files_frame.loc[0, 'stats_files']
Out[47]:
In [87]:
# Flatten the per-commit stats mappings into one record per (commit, path).
file_changes = []
for commit_row in stats_files_frame.itertuples():
    commit_sha = commit_row.hexsha
    for raw_path, change_stats in commit_row.stats_files.items():
        # check_file_move returns a pair of paths; the entry is flagged as a
        # move when the two differ.
        path_before, path_after = check_file_move(raw_path)
        record = {
            'hexsha': commit_sha,
            'file_path1': path_before,
            'file_path2': path_after,
            'move': path_before != path_after,
        }
        # Merge in the stats stored for this path (the columns selected below
        # expect 'lines', 'insertions' and 'deletions').
        record.update(change_stats)
        file_changes.append(record)
file_frame = pd.DataFrame(file_changes, columns=(
    'hexsha', 'file_path1', 'file_path2', 'move',
    'lines', 'insertions', 'deletions'))
In [88]:
# Persist the per-file change table; assumes povray-data/ already exists.
file_frame.to_feather('povray-data/file_frame.feather')
In [89]:
# Display ten random file-change rows.
file_frame.sample(n=10)
Out[89]:
In [90]:
# Keep only the rows flagged as moves (file_path1 != file_path2).
move_frame = file_frame[file_frame.move]
In [92]:
# Display ten random move rows.
move_frame.sample(n=10)
Out[92]:
In [16]:
# One row per (commit, parent) link; parentless commits get a None parent.
parent_frame = commit_parents_to_frame(commits)
In [30]:
# Number of non-null parents per commit; after this, the 'parent_hexsha'
# column holds a count rather than a hash.
p1 = parent_frame.groupby('hexsha', as_index=False).agg({'parent_hexsha': 'count'})
In [31]:
# Commits with more than one parent, i.e. merges. The count lives in the
# 'parent_hexsha' column produced by the aggregation above; p1 has no
# 'parent' column.
mp = p1[p1.parent_hexsha > 1]
In [35]:
# Attach full commit details to the multi-parent commits selected in mp.
merge_frame = pd.merge(left=commit_frame, right=mp, left_on='hexsha', right_on='hexsha', how='inner')
In [41]:
# Display multi-parent commits whose message does not contain 'Merge'.
merge_frame[~merge_frame.message.str.contains('Merge')]
Out[41]:
In [128]:
# Rows of file_frame flagged as moves; these are the edges of the graph
# built in the next cell.
move_frame = file_frame[file_frame.move]
In [131]:
# Directed graph of file moves: an edge old path -> new path, carrying the
# hash of the commit that performed the move.
# Newer networkx releases provide from_pandas_edgelist in place of
# from_pandas_dataframe, so use whichever this installation offers.
_edges_to_graph = (getattr(nx, 'from_pandas_edgelist', None)
                   or nx.from_pandas_dataframe)
move_graph = _edges_to_graph(
    move_frame,
    source='file_path1', target='file_path2',
    edge_attr='hexsha', create_using=nx.DiGraph())
In [135]:
# Preview the first five edges. Materialise to a list first: on networkx 2.x
# edges() returns a view that cannot be sliced.
list(move_graph.edges(data=True))[:5]
Out[135]:
In [136]:
# Check whether the move graph is free of cycles (no path is ever moved back
# to a name it held before).
nx.is_directed_acyclic_graph(move_graph)
Out[136]:
In [137]:
pd.concat?
In [143]:
# Tree listing of the commit currently checked out (HEAD).
df = commit_tree_to_frame(repo.head.commit)
In [145]:
# Display ten random tree entries of HEAD.
df.sample(n=10)
Out[145]:
In [146]:
# Tree listings of every collected commit, stacked into one frame.
# NOTE(review): walks the full tree of each commit; likely slow on a large
# history.
df1 = commit_trees_to_frame(commits)
In [160]:
# Persist the stacked tree listings; assumes povray-data/ already exists.
df1.to_feather('povray-data/commit_trees.feather')
In [162]:
# Descriptive alias for the same object (no copy is made).
tree_frame = df1
In [163]:
# Display ten random rows of the stacked tree listings.
tree_frame.sample(n=10)
Out[163]:
In [164]:
# For every blob path, count the rows in which it appears; after the
# aggregation each remaining column holds that count.
blob_counts = tree_frame[tree_frame.child_type == 'blob'].groupby('child').agg('count')
In [166]:
# Most frequently listed blob paths first ('child_type' holds the count here).
blob_counts = blob_counts.sort_values('child_type', ascending=False)
In [168]:
# Display the distribution of per-path counts.
blob_counts.describe()
Out[168]:
In [171]:
# Display blob paths listed in at most 100 rows ('child_type' holds the count).
blob_counts[blob_counts.child_type <= 100]
Out[171]:
In [188]:
# The 'author' column (merged actor id) only exists once insert_actor_ids has
# been applied. Derive it here when missing, so the cell works on a fresh
# kernel; a copy is used so commit_frame itself is left untouched.
author_frame = commit_frame
if 'author' not in author_frame.columns:
    author_frame = insert_actor_ids(
        commit_frame.copy(), actor_frame, drop_name_email=False)

# Only commits below 600 changed lines are drawn; larger ones are left out of
# the plot.
_ = sns.boxplot(x='author', y='stats_total_lines',
                data=author_frame[author_frame.stats_total_lines < 600],
                orient='v')
_ = plt.title('Code Contributions by Authors')
_ = plt.xlabel('Author')
_ = plt.ylabel('Total Lines Committed')
plt.xticks(rotation=70)
plt.show()
In [189]:
sns.boxplot?
In [ ]: