In [1]:
import pandas as pd
# Load the git numstat export, then keep only the commit id and file path columns.
git_log = pd.read_csv("../dataset/git_log_numstat_dropover.csv")
git_log = git_log[['sha', 'file']]
git_log.head()
Out[1]:
In [2]:
# Keep only rows whose path contains "src/main/java" and does not end with
# "package-info.java". Both conditions are applied as one boolean mask and
# the selection is copied so prod_code is independent of git_log.
prod_code = git_log[
    git_log.file.str.contains("src/main/java")
    & ~git_log.file.str.endswith("package-info.java")
].copy()
prod_code.head()
Out[2]:
In [3]:
# Add a constant marker column: every remaining (sha, file) row counts as one
# commit touching that file. assign() returns a new frame instead of mutating
# the filtered frame in place, so this cell can be re-run safely and cannot
# raise a chained-assignment warning.
prod_code = prod_code.assign(commit=1)
prod_code.head()
Out[3]:
In [4]:
# Pivot into a file-by-commit matrix: one row per file, one column per sha,
# filled from the 'commit' marker column, with 0 for pairs that have no entry.
commit_matrix = pd.pivot_table(
    prod_code.reset_index(),
    values='commit',
    index='file',
    columns='sha',
    fill_value=0,
)
# Show a small window of the (mostly sparse) matrix.
commit_matrix.iloc[:5, 50:55]
Out[4]:
In [5]:
from sklearn.metrics.pairwise import cosine_distances
# Pairwise cosine distance between the rows (files) of the commit matrix.
dissimilarity_matrix = cosine_distances(X=commit_matrix)
# Peek at the top-left corner of the resulting array.
dissimilarity_matrix[0:5, 0:5]
Out[5]:
In [6]:
import pandas as pd
# Wrap the raw distance array in a DataFrame labelled by file on both axes.
dissimilarity_df = pd.DataFrame(
    data=dissimilarity_matrix,
    columns=commit_matrix.index,
    index=commit_matrix.index,
)
dissimilarity_df.iloc[0:5, 0:2]
Out[6]:
In [7]:
from sklearn.manifold import MDS
# Embed the precomputed distances with multidimensional scaling.
# random_state is pinned to a constant so repeated runs give the same layout.
# NOTE(review): the next cell labels the result with two columns ("x", "y"),
# so this relies on MDS producing two components by default.
model = MDS(random_state=0, dissimilarity='precomputed')
dissimilarity_2d = model.fit_transform(dissimilarity_df)
dissimilarity_2d[0:5]
Out[7]:
In [8]:
# Label the 2D coordinates: rows are indexed by file, columns are named x and y.
dissimilarity_2d_df = pd.DataFrame(
    data=dissimilarity_2d,
    columns=["x", "y"],
    index=commit_matrix.index,
)
dissimilarity_2d_df.head()
Out[8]:
In [9]:
# Derive a 'module' label from each file path: split on "/" and take the
# segment at position 6.
# NOTE(review): position 6 presumably matches this repository's directory
# depth — confirm before reusing on another code base.
dissimilarity_2d_df['module'] = (
    dissimilarity_2d_df.index.str.split("/").str.get(6).values
)
dissimilarity_2d_df.head()
Out[9]:
In [10]:
from ausi import pygal
# Build an XY chart from the 2D coordinates using the project helper; the
# "module" argument names the column added above (presumably used to group
# or colour the points — confirm against ausi.pygal).
xy = pygal.create_xy_chart(dissimilarity_2d_df,"module")
# Open the rendered chart in a browser rather than displaying it inline.
xy.render_in_browser()
In [11]:
# Re-display the file-by-commit matrix that is the input to the clustering below.
commit_matrix.head()
Out[11]:
In [12]:
from sklearn.cluster import AgglomerativeClustering
# Hierarchical (agglomerative) clustering of files with the estimator's
# default settings, fitted directly on the raw commit matrix rather than on
# the cosine distances computed earlier.
clustering = AgglomerativeClustering()
# NOTE: this rebinds `model`, which previously referred to the MDS estimator.
model = clustering.fit(commit_matrix)
model
Out[12]:
In [13]:
from ausi.scipy import plot_dendrogram
# Draw the cluster hierarchy with the project helper, passing the file paths
# from the commit matrix index as labels.
plot_dendrogram(model, labels=commit_matrix.index)