In [14]:
from ozapfdis import git
# Read the numstat log of the repository checkout, then keep only the two
# columns the analysis needs: the commit id ('sha') and the touched path ('file').
# NOTE(review): presumably log_numstat yields one row per changed file per
# commit — confirm against the ozapfdis helper.
git_log = git.log_numstat("../../../dropover/")
git_log = git_log[['sha', 'file']]
git_log.head()
Out[14]:
In [16]:
# Restrict the log to production Java sources: keep paths that contain
# "src/main/java" and drop the package-info.java descriptor files.
# Work on a copy so the original git_log frame stays untouched.
prod_code = (
    git_log.copy()
    .loc[lambda log: log.file.str.contains("src/main/java")]
    .loc[lambda log: ~log.file.str.endswith("package-info.java")]
)
prod_code.head()
Out[16]:
In [5]:
# Mark every (commit, file) row with a constant 1 so the rows can be pivoted
# into a file-by-commit occurrence matrix afterwards.
# `assign` returns a new frame instead of writing into the boolean-filtered
# slice, which avoids pandas' SettingWithCopyWarning and does not mutate a
# frame that was already displayed by an earlier cell.
prod_code = prod_code.assign(hit=1)
prod_code.head()
Out[5]:
In [6]:
# Pivot the long (sha, file, hit) table into a wide matrix: one row per file,
# one column per commit sha, holding the aggregated 'hit' value and 0 where a
# file/commit combination does not occur.
commit_matrix = (
    prod_code
    .reset_index()
    .pivot_table(values='hit', index='file', columns='sha', fill_value=0)
)
# Show only a small window of the (wide) matrix.
commit_matrix.iloc[:5, 50:55]
Out[6]:
In [7]:
from sklearn.metrics.pairwise import cosine_distances
# Pairwise cosine distance between the rows (files) of the commit matrix:
# files that change in the same commits end up close to each other.
dissimilarity_matrix = cosine_distances(X=commit_matrix)
dissimilarity_matrix[0:5, 0:5]
Out[7]:
In [8]:
import pandas as pd
# Wrap the raw distance array in a DataFrame labelled with the file names on
# both axes, so each cell reads as "distance between file A and file B".
dissimilarity_df = pd.DataFrame(
    data=dissimilarity_matrix,
    columns=commit_matrix.index,
    index=commit_matrix.index,
)
dissimilarity_df.iloc[0:5, 0:2]
Out[8]:
In [9]:
from sklearn.manifold import MDS
# Multidimensional scaling over the precomputed distance matrix.
# random_state is pinned to a fixed seed so repeated runs give the same layout;
# the number of output dimensions is left at the estimator's default.
model = MDS(random_state=0, dissimilarity='precomputed')
dissimilarity_2d = model.fit_transform(dissimilarity_df)
dissimilarity_2d[0:5]
Out[9]:
In [10]:
# Attach the file names and axis labels to the projected coordinates so every
# point of the scatter plot can be traced back to its source file.
dissimilarity_2d_df = pd.DataFrame(
    data=dissimilarity_2d,
    columns=["x", "y"],
    index=commit_matrix.index,
)
dissimilarity_2d_df.head()
Out[10]:
In [11]:
# Label each file with one segment of its path as its "module".
# NOTE(review): index 6 is a magic number tied to this repository's directory
# depth — presumably the package level below src/main/java; confirm before
# reusing the notebook on a repository with a different layout.
dissimilarity_2d_df['module'] = (
    dissimilarity_2d_df.index.str.split("/").str.get(6).values
)
dissimilarity_2d_df.head()
Out[11]:
In [12]:
from ausi import pygal
# Build an XY (scatter) chart from the projected coordinates, passing the
# "module" column name to the helper.
# NOTE(review): presumably the second argument groups/colors points by module —
# confirm against ausi.pygal.create_xy_chart.
xy = pygal.create_xy_chart(dissimilarity_2d_df,"module")
# Side effect: renders the chart and opens it in the system's web browser
# rather than displaying it inline in the notebook.
xy.render_in_browser()