In my previous blog post, we've seen how we can identify files that change together in one commit.
In this blog post, we take the analysis to an advanced level:
We're using Python and pandas as well as some algorithms from scikit-learn for these purposes.
For this analysis, we use a former project of mine and some buddies. It's called "DropOver", a web application that manages events with features like event sites, scheduling, comments, file uploads and so on. Although I can't share the repository yet, you should know that we developed the software almost strictly feature-based, with one feature team per feature (OK, one developer was one team). The history of this repository should be a perfect fit for validating our use case of grouping co-changing source code, because each identified group should represent one feature.
We use a little helper library for the data import of the project.
In [166]:
from lib.ozapfdis.git_tc import log_numstat

# Repository under analysis; swap in one of the commented paths to run
# the same notebook against another project.
GIT_REPO_DIR = "../../dropover_git/"
#GIT_REPO_DIR = "../../buschmais-spring-petclinic/"
#GIT_REPO_DIR = "../../fossology_fork/"

# Read the numstat Git log and keep only the columns needed downstream.
relevant_columns = ['sha', 'timestamp', 'author', 'file']
git_log = log_numstat(GIT_REPO_DIR)[relevant_columns]
git_log.head()
Out[166]:
In [167]:
# Restrict the analysis to Java production code: keep *.java files but
# drop test classes (files ending in "Test.java").
is_java = git_log.file.str.endswith(".java")
is_test = git_log.file.str.endswith("Test.java")
git_log = git_log[is_java & ~is_test]
git_log.head()
Out[167]:
In [168]:
# Number of file-change entries per author — a rough activity measure.
git_log.author.value_counts()
Out[168]:
If we look at the ratio between the number of commits and the number of changed files per author, we get a first feeling for how coarse- or fine-grained each author's commits were.
In [169]:
# Ratio of distinct commits to file changes per author: the inverse of
# the average number of files an author touched per commit (lower
# values mean bigger commits).
commits_per_author = git_log.groupby('author').sha.nunique()
files_per_author = git_log.groupby('author').file.count()
commits_per_author / files_per_author
Out[169]:
In [170]:
import pandas as pd

# Deduplicate to one entry per (timestamp, file) combination.
# NOTE(review): despite the name, this groups by the *exact* commit
# timestamp, not by calendar day — the commented pd.Grouper variant
# would do a true per-day grouping; confirm which one is intended.
#commits_per_day = git_log.groupby([pd.Grouper(key='timestamp', freq='1D'), 'file']).first()
commits_per_day = git_log.groupby(['timestamp', 'file']).first()
commits_per_day.head()
Out[170]:
In [171]:
# Flag every (timestamp, file) combination with a dummy value of 1 so
# that the upcoming pivot yields a binary "was changed" matrix.
commits_per_day = commits_per_day.assign(hit=1)
commits_per_day.head()
Out[171]:
In [172]:
# Build the file × timestamp matrix: 1 = file was changed in that
# commit, 0 = untouched. Each row is a file's "change vector".
commit_matrix = (
    commits_per_day
    .reset_index()
    .pivot_table(index='file', columns='timestamp', values='hit', fill_value=0)
)
commit_matrix.head()
Out[172]:
In [173]:
from sklearn.metrics.pairwise import cosine_distances

# Pairwise cosine distance between the files' change vectors:
# 0 = files always changed together, 1 = never changed together.
dissimilarity_matrix = cosine_distances(commit_matrix)
dissimilarity_matrix
Out[173]:
In [174]:
# Wrap the raw distance matrix in a DataFrame, labeling both rows and
# columns with the file names for readability.
file_names = commit_matrix.index
dissimilarity_df = pd.DataFrame(
    dissimilarity_matrix, index=file_names, columns=file_names)
# show some interesting parts of results
dissimilarity_df.head()
Out[174]:
In [175]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=[30,25])
sns.heatmap(
dissimilarity_df,
#xticklabels=False,
#yticklabels=False
)
Out[175]:
In [176]:
domain_df = dissimilarity_df.copy()
# The 7th path segment holds the feature/domain name — this index is
# specific to this repository's directory layout (TODO confirm when
# reusing the notebook on another project).
domain_df.index = domain_df.index.str.split("/").str[6]
domain_df.columns = domain_df.index
# Sort rows and columns so files of the same domain sit next to each
# other in the heatmap.
domain_df = domain_df.sort_index().sort_index(axis=1)
domain_df.head()
Out[176]:
In [177]:
# Smaller heatmap of the domain-labeled matrix; same data as above but
# grouped by feature name.
plt.figure(figsize=[5, 4])
sns.heatmap(domain_df, cmap="YlGnBu")
Out[177]:
In [178]:
#similarity_2d_df = pd.DataFrame(similarity_2d, index=similarity_df.index)
#
#similarity_2d_df['domain'] = "Other"
#
#domains = ["Owner", "Pet", "Visit", "Vet", "Specialty", "Clinic"]
#for domain in domains:
# similarity_2d_df.loc[similarity_2d_df.index.str.contains(domain), 'domain'] = domain
#
#similarity_2d_df.head()
In [179]:
from sklearn.manifold import MDS

# Multidimensional scaling: embed the files into 2D so that the
# precomputed pairwise distances are preserved as well as possible.
# A fixed random_state keeps the embedding reproducible.
model = MDS(dissimilarity='precomputed', random_state=0)
# this could take some seconds
dissimilarity_2d = model.fit_transform(dissimilarity_df)
dissimilarity_2d[:5]
Out[179]:
In [180]:
%matplotlib inline
from matplotlib import cm
import matplotlib.pyplot as plt
# brew some colors
#relative_index = distance_df.index.labels[0].values() / distance_df.index.labels[0].max()
#colors = [x for x in cm.hsv(relative_index)]
# plot the 2D matrix with colors
plt.figure(figsize=(8,8))
x = dissimilarity_2d[:,0]
y = dissimilarity_2d[:,1]
#plt.scatter(x, y, c=colors)
plt.scatter(x, y)
Out[180]:
In [181]:
from sklearn.manifold import Isomap

# Alternative 2D embedding via Isomap. NOTE(review): unlike MDS above,
# Isomap takes no random_state (it is deterministic), and here it
# treats the distance matrix's rows as plain feature vectors rather
# than as precomputed distances — confirm this is intended.
model = Isomap()
# this could take some seconds
dissimilarity_2d = model.fit_transform(dissimilarity_df)
dissimilarity_2d[:5]
Out[181]:
In [182]:
%matplotlib inline
from matplotlib import cm
import matplotlib.pyplot as plt
# brew some colors
#relative_index = distance_df.index.labels[0].values() / distance_df.index.labels[0].max()
#colors = [x for x in cm.hsv(relative_index)]
# plot the 2D matrix with colors
plt.figure(figsize=(8,8))
x = dissimilarity_2d[:,0]
y = dissimilarity_2d[:,1]
#plt.scatter(x, y, c=colors)
plt.scatter(x, y)
Out[182]:
In [183]:
# Encode each domain name as an integer (0, 1, 2, ...) so we can map
# domains to colors below.
domains_encoded = pd.factorize(domain_df.index)[0]
domains_encoded[:10]
Out[183]:
In [184]:
# Spread the encoded domains over [0, 1] and map each to a distinct
# hue from the HSV colormap.
relative_index = domains_encoded / domains_encoded.max()
colors = list(cm.hsv(relative_index))
colors[:3]
Out[184]:
In [185]:
# Same embedding as above, now colored by domain membership.
plt.figure(figsize=(8, 8))
plt.scatter(dissimilarity_2d[:, 0], dissimilarity_2d[:, 1], c=colors)
Out[185]:
In [186]:
# Re-attach the full file paths as index to the 2D coordinates.
dissimilarity_2d_df = pd.DataFrame(dissimilarity_2d, index=commit_matrix.index)
dissimilarity_2d_df.head()
Out[186]:
In [187]:
# Derive the domain name from the 7th path segment — specific to this
# repository's layout (TODO confirm for other projects).
dissimilarity_2d_df['domain'] = dissimilarity_2d_df.index.str.split("/").str[6]
dissimilarity_2d_df.head()
Out[187]:
In [188]:
# Reshape the embedding into the structure pygal expects: one list of
# {'label': <file>, 'value': (x, y)} dicts per domain.
plot_data = pd.DataFrame(index=dissimilarity_2d_df['domain'])
# (x, y) coordinate pairs from the two embedding columns.
plot_data['value'] = tuple(zip(dissimilarity_2d_df[0], dissimilarity_2d_df[1]))
plot_data['label'] = dissimilarity_2d_df.index
# One record dict per row, combining label and coordinates.
plot_data['data'] = plot_data[['label', 'value']].to_dict('records')
# Collect all records of a domain into a single list per domain.
plot_dict = plot_data.groupby(plot_data.index).data.apply(list)
plot_dict
Out[188]:
In [189]:
import pygal

# Interactive scatter plot: one pygal series per domain so each
# feature's files share a color and can be toggled via the legend.
xy_chart = pygal.XY(stroke=False)
# Series.iteritems() was removed in pandas 2.0 — use items() and
# unpack the (domain, points) pairs directly.
for domain, points in plot_dict.items():
    xy_chart.add(domain, points)
xy_chart.render_in_browser()
In [190]:
import pygal

# NOTE(review): pygal's XY.add(title, values) requires both a series
# title and its values — the original bare add() call raised a
# TypeError. Use the minimal valid call with an empty series instead.
xy_chart = pygal.XY(stroke=False)
xy_chart.add('empty series', [])
In [ ]:
import pygal

# Minimal pygal example: one series mixing a labeled point (dict with
# 'value'/'label') and a plain (x, y) tuple, rendered in the browser.
xy_chart = pygal.XY(stroke=False)
xy_chart.title = 'Grouped co-changing code'
xy_chart.add('A', [{'value': (2,2), 'label': 'two'}, (4,3)])
xy_chart.render_in_browser()