In my previous blog post, we've seen how we can identify files that change together in one commit.
In this blog post, we take the analysis to an advanced level:
We're using Python and pandas as well as some algorithms from scikit-learn for these purposes.
For this analysis, we use a former project of mine and some buddies. It's called "DropOver", a web application that can manage events with features like event sites, scheduling, comments, file uploads and so on. Although I can't share the repository yet, you should know that we developed the software almost strictly feature-based, with feature teams (OK, one developer was one team). The history of this repository should be a perfect fit for validating our use case of grouping co-changing source code, because an identified group should represent a feature.
We use a little helper library for the data import of the project.
In [70]:
# Helper library from the project itself for importing git log data
from lib.ozapfdis.git_tc import log_numstat
# Authors
GIT_REPO_DIR = "../../dropover_git/"
# Domains
#GIT_REPO_DIR = "../../buschmais-spring-petclinic/"
# Languages
#GIT_REPO_DIR = "../../fossology_fork/"
# general
#GIT_REPO_DIR = "../../synthetic_repo/"
# Read the numstat git log and keep only the columns we need for the analysis
git_log = log_numstat(GIT_REPO_DIR)[['sha', 'timestamp', 'author', 'file']]
git_log.head()
Out[70]:
In [71]:
# Keep only production Java code: include *.java files, exclude *Test.java.
is_java_file = git_log.file.str.endswith(".java")
is_test_file = git_log.file.str.endswith("Test.java")
git_log = git_log[is_java_file & ~is_test_file]
git_log.head()
Out[71]:
In [72]:
git_log.author.value_counts()
Out[72]:
If we look at the ratio of distinct commits to changed files per author, we get a feeling for how the individual developers worked on the code base.
In [73]:
# Ratio of distinct commits to changed-file entries for each author.
by_author = git_log.groupby('author')
commits_per_author = by_author.sha.nunique()
files_changed_per_authors = by_author.file.count()
commits_per_author / files_changed_per_authors
Out[73]:
In [74]:
import pandas as pd
# Alternative: bucket changes per calendar day instead of exact timestamp
#commits_per_day = git_log.groupby([pd.Grouper(key='timestamp', freq='1D'), 'file']).first()
# Collapse the log to one row per (timestamp, file) pair
commits_per_day = git_log.groupby(['timestamp', 'file']).first()
commits_per_day.head()
Out[74]:
In [75]:
# Marker column: 1 for every (timestamp, file) change entry; used as the
# value to aggregate when pivoting into a matrix
commits_per_day['hit'] = 1
commits_per_day.head()
Out[75]:
In [76]:
# Build a file x timestamp matrix: 1 where the file was changed at that
# timestamp, 0 otherwise (fill_value)
commit_matrix = commits_per_day.reset_index().pivot_table(
index='file',
columns='timestamp',
values='hit',
fill_value=0)
commit_matrix.head()
Out[76]:
In [77]:
from sklearn.metrics.pairwise import cosine_distances
# Pairwise cosine distance between the files' change vectors:
# 0 = identical co-change pattern, 1 = never changed together
dissimilarity_matrix = cosine_distances(commit_matrix)
dissimilarity_matrix
Out[77]:
In [78]:
# Wrap the raw distance matrix in a DataFrame labeled by file name on
# both axes for readable lookups
dissimilarity_df = pd.DataFrame(
dissimilarity_matrix,
index=commit_matrix.index,
columns=commit_matrix.index)
# show some interesting parts of results
dissimilarity_df.head()
Out[78]:
In [79]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=[4,3.5])
# Plot similarity (1 - distance) so that bright cells mean co-change
sns.heatmap(
1-dissimilarity_df
)
Out[79]:
In [80]:
domain_df = dissimilarity_df.copy()
# Relabel rows by the 7th path component (index 6) — presumably the
# feature/domain package name in this repo's layout; TODO confirm
domain_df.index = domain_df.index.str.split("/").str[6]
domain_df.columns = domain_df.index
# Sort both axes so files of the same domain end up next to each other
domain_df = domain_df.sort_index()
domain_df = domain_df.sort_index(axis=1)
domain_df.head()
Out[80]:
In [81]:
# Heatmap of the distance matrix grouped by domain labels
plt.figure(figsize=[5,4])
sns.heatmap(
domain_df,
#xticklabels=False,
#yticklabels=False
cmap="YlGnBu"
)
Out[81]:
In [82]:
#similarity_2d_df = pd.DataFrame(similarity_2d, index=similarity_df.index)
#
#similarity_2d_df['domain'] = "Other"
#
#domains = ["Owner", "Pet", "Visit", "Vet", "Specialty", "Clinic"]
#for domain in domains:
# similarity_2d_df.loc[similarity_2d_df.index.str.contains(domain), 'domain'] = domain
#
#similarity_2d_df.head()
In [83]:
from sklearn.manifold import MDS
# Project the precomputed distance matrix into 2D for plotting;
# uses a fixed seed for random_state for reproducibility
model = MDS(dissimilarity='precomputed', random_state=0)
# this could take some seconds
dissimilarity_2d = model.fit_transform(dissimilarity_df)
dissimilarity_2d[:5]
Out[83]:
In [84]:
%matplotlib inline
from matplotlib import cm
import matplotlib.pyplot as plt
# brew some colors
#relative_index = distance_df.index.labels[0].values() / distance_df.index.labels[0].max()
#colors = [x for x in cm.hsv(relative_index)]
# Scatter plot of the 2D MDS embedding (uncolored for now)
plt.figure(figsize=(8,8))
x = dissimilarity_2d[:,0]
y = dissimilarity_2d[:,1]
#plt.scatter(x, y, c=colors)
plt.scatter(x, y)
Out[84]:
In [85]:
# Label the 2D coordinates with the file names again
dissimilarity_2d_df = pd.DataFrame(dissimilarity_2d, index=commit_matrix.index)
dissimilarity_2d_df.head()
Out[85]:
In [86]:
#dissimilarity_2d_df['grouping'] = dissimilarity_2d_df.index.str.rsplit(".").str[-1]
# Group files by the 7th path component (index 6) — presumably the
# feature/domain package; TODO confirm against the repo layout
dissimilarity_2d_df['grouping'] = dissimilarity_2d_df.index.str.split("/").str[6]
#dissimilarity_2d_df.loc[dissimilarity_2d_df['language'].str.strip().str.len() == 0, "language"] = "n/a"
dissimilarity_2d_df.head()
Out[86]:
In [87]:
# BUG FIX: the column created in the previous cell is 'grouping', not
# 'language' — the old accessor raised AttributeError.
counts = dissimilarity_2d_df.grouping.value_counts()
# Keep only groups with more than 10 files
counts = counts[counts > 10]
counts.head()
In [ ]:
# BUG FIX: use the 'grouping' column created earlier; no 'language'
# column exists at this point. Small groups are merged into "Other".
dissimilarity_2d_df.loc[
    ~dissimilarity_2d_df.grouping.isin(counts.index), 'grouping'] = "Other"
dissimilarity_2d_df['grouping'].value_counts()
In [ ]:
# BUG FIX: index the plot data by the existing 'grouping' column instead
# of the non-existent 'language' column.
plot_data = pd.DataFrame(index=dissimilarity_2d_df['grouping'])
# (x, y) coordinate tuple per file
plot_data['value'] = tuple(zip(dissimilarity_2d_df[0], dissimilarity_2d_df[1]))
plot_data['label'] = dissimilarity_2d_df.index
# pygal wants one dict per point: {'label': ..., 'value': (x, y)}
plot_data['data'] = plot_data[['label', 'value']].to_dict('records')
# One list of point dicts per group
plot_dict = plot_data.groupby(plot_data.index).data.apply(list)
plot_dict
In [ ]:
import pygal
# Interactive XY chart: one series per group, points not connected
xy_chart = pygal.XY(stroke=False)
# items() replaces Series.iteritems(), which was removed in pandas 2.0
for group_name, points in plot_dict.items():
    xy_chart.add(group_name, points)
xy_chart.render_in_browser()
In [ ]:
In [ ]:
from sklearn.manifold import TSNE
# keep this at 5 here
# NOTE(review): no random_state is set, so the t-SNE embedding differs
# between runs — confirm whether reproducibility matters here
model = TSNE()
# this could take some seconds
dissimilarity_2d = model.fit_transform(dissimilarity_df)
dissimilarity_2d[:5]
%matplotlib inline
from matplotlib import cm
import matplotlib.pyplot as plt
# brew some colors
#relative_index = distance_df.index.labels[0].values() / distance_df.index.labels[0].max()
#colors = [x for x in cm.hsv(relative_index)]
# Scatter plot of the t-SNE embedding
plt.figure(figsize=(8,8))
x = dissimilarity_2d[:,0]
y = dissimilarity_2d[:,1]
#plt.scatter(x, y, c=colors)
plt.scatter(x, y)
In [ ]:
# Label the t-SNE coordinates with file names and a 'language' label
# derived from the file extension
dissimilarity_2d_df = pd.DataFrame(dissimilarity_2d, index=commit_matrix.index)
dissimilarity_2d_df['language'] = dissimilarity_2d_df.index.str.rsplit(".").str[-1]
# Merge rarely occurring languages (<= 10 files) into "Other"
counts = dissimilarity_2d_df.language.value_counts()
counts = counts[counts > 10]
dissimilarity_2d_df.loc[~dissimilarity_2d_df.language.isin(counts.index), 'language'] = "Other"
# Build per-language lists of {'label': file, 'value': (x, y)} dicts for pygal
plot_data = pd.DataFrame(index=dissimilarity_2d_df['language'])
plot_data['value'] = tuple(zip(dissimilarity_2d_df[0], dissimilarity_2d_df[1]))
plot_data['label'] = dissimilarity_2d_df.index
plot_data['data'] = plot_data[['label', 'value']].to_dict('records')
plot_dict = plot_data.groupby(plot_data.index).data.apply(list)
xy_chart = pygal.XY(stroke=False)
# BUG FIX: items() replaces Series.iteritems(), removed in pandas 2.0
for language, points in plot_dict.items():
    xy_chart.add(language, points)
xy_chart.render_in_browser()
In [ ]:
# Encode each domain label as an integer code (in order of appearance);
# factorize returns (codes, uniques), we only need the codes
domains_encoded = pd.factorize(domain_df.index)[0]
domains_encoded[:10]
In [ ]:
# Map each domain's integer code into [0, 1] and onto the HSV colormap
# so every domain gets its own color.
relative_index = domains_encoded / domains_encoded.max()
colors = list(cm.hsv(relative_index))
colors[:3]
In [ ]:
# Scatter plot of the 2D embedding, one color per domain
plt.figure(figsize=(8,8))
x = dissimilarity_2d[:,0]
y = dissimilarity_2d[:,1]
plt.scatter(x, y, c=colors)
In [ ]:
# Placeholder domain label for every file; the commented line below would
# derive it from the path instead
dissimilarity_2d_df['domain'] = "other"
#dissimilarity_2d_df['domain'] = dissimilarity_2d_df.index.str.split("/").str[6]
dissimilarity_2d_df.head()
In [ ]:
# BUG FIX: the old first line recreated dissimilarity_2d_df from the raw
# 2D array, destroying the 'domain' column that the next line reads
# (KeyError). Keep the existing DataFrame with its 'domain' column.
plot_data = pd.DataFrame(index=dissimilarity_2d_df['domain'])
# (x, y) coordinate tuple per file
plot_data['value'] = tuple(zip(dissimilarity_2d_df[0], dissimilarity_2d_df[1]))
plot_data['label'] = dissimilarity_2d_df.index
# pygal wants one dict per point: {'label': ..., 'value': (x, y)}
plot_data['data'] = plot_data[['label', 'value']].to_dict('records')
plot_dict = plot_data.groupby(plot_data.index).data.apply(list)
plot_dict
import pygal
xy_chart = pygal.XY(stroke=False)
# items() replaces Series.iteritems(), removed in pandas 2.0
for domain, points in plot_dict.items():
    xy_chart.add(domain, points)
xy_chart.render_in_browser()
In [ ]:
import pygal
xy_chart = pygal.XY(stroke=False)
# BUG FIX: items() replaces Series.iteritems(), removed in pandas 2.0
for group_name, points in plot_dict.items():
    xy_chart.add(group_name, points)
xy_chart.render_in_browser()
In [ ]:
import pygal
xy_chart = pygal.XY(stroke=False)
# NOTE(review): add() is called without arguments here — pygal's XY.add
# expects a series title and values; this cell looks unfinished
xy_chart.add()
In [ ]:
# Minimal pygal example: one series containing a labeled point and a
# bare coordinate tuple.
import pygal
demo_points = [{'value': (2,2), 'label': 'two'}, (4,3)]
xy_chart = pygal.XY(stroke=False)
xy_chart.title = 'Grouped co-changing code'
xy_chart.add('A', demo_points)
xy_chart.render_in_browser()