We sort the changes by timestamp in ascending order because the later steps need monotonically increasing time data.
In [20]:
import py2neo
import pandas as pd
# Connect to Neo4j using py2neo's default connection settings.
graph = py2neo.Graph()
# For every change, fetch the affected type and the timestamp of the commit
# that contains it, ordered by that timestamp.
query = """
MATCH
(t:Type)-->(ch:Change)<--(co:Commit)
RETURN t.fqn as type, co.date + " " + co.time as timestamp
ORDER BY timestamp
"""
# Graph.data() no longer exists in py2neo v4+; running the query and calling
# .data() on the returned cursor yields the same list of dicts on old and
# new py2neo versions.
changes = pd.DataFrame(graph.run(query).data())
changes.head()
Out[20]:
In [21]:
# Turn the timestamp strings into real datetimes, then use them as the index
# so the data can be resampled by time.
changes['timestamp'] = pd.to_datetime(changes['timestamp'])
changes_over_time = changes.set_index(keys='timestamp')
changes_over_time.head()
Out[21]:
In [45]:
# Count, per type, how many change entries fall into each one-hour bucket.
dependend_types_within_1hour = (
    changes_over_time
    .groupby('type')
    .resample("1h")
    .count()
)
dependend_types_within_1hour.head()
Out[45]:
In [50]:
# Move the innermost index level into the columns, then replace the gaps
# that the reshaping leaves behind with 0.
unstacked_counts = dependend_types_within_1hour.unstack(level=-1)
dependency_matrix = unstacked_counts.fillna(value=0)
dependency_matrix.iloc[:5, :3]
Out[50]:
In [52]:
from sklearn.metrics.pairwise import cosine_distances
# Pairwise cosine distance between the rows of the dependency matrix.
distance_matrix = cosine_distances(X=dependency_matrix)
distance_matrix[:5, :5]
Out[52]:
In [53]:
# Wrap the raw distance array in a DataFrame that carries the index of the
# dependency matrix as both row and column labels.
distance_df = pd.DataFrame(
    data=distance_matrix,
    columns=dependency_matrix.index,
    index=dependency_matrix.index,
)
distance_df.iloc[:3, :3]
Out[53]:
In [54]:
from sklearn.manifold import MDS
# Embed the precomputed distances with multidimensional scaling.
# n_components=2 is the MDS default; it is spelled out because the plots
# below read exactly two coordinates per row.
model = MDS(n_components=2, dissimilarity='precomputed')
T = model.fit_transform(X=distance_df)
T[:5]
Out[54]:
In [61]:
# Quick static look at the 2D embedding.
# NOTE(review): `plt` is not imported anywhere above this cell in the file;
# it presumably comes from a cell that was executed earlier -- confirm.
plt.figure(figsize=(5, 5))
plt.scatter(x=T[:, 0], y=T[:, 1])
Out[61]:
In [84]:
# Put the two embedding coordinates into a DataFrame and attach the labels
# from the distance matrix index as a 'class' column.
T_df = pd.DataFrame(data=T, columns=['x', 'y']).assign(
    **{'class': distance_df.index}
)
T_df.head()
Out[84]:
In [85]:
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool
def scatter_with_hover(df, x, y, fig_width=500, fig_height=500):
    """Build a Bokeh scatter plot of ``df`` with a hover tooltip per point.

    Args:
        df: Data backing the plot; it is wrapped in a ``ColumnDataSource``
            and every entry of ``df.columns`` becomes one tooltip row.
        x: Name of the column used for the horizontal position.
        y: Name of the column used for the vertical position.
        fig_width: Width passed to ``figure``.
        fig_height: Height passed to ``figure``.

    Returns:
        The Bokeh figure with the scatter glyph and the hover tool attached.
    """
    fig = figure(width=fig_width, height=fig_height)
    source = ColumnDataSource(data=df)
    name = 'main'
    renderer = fig.scatter(x, y, source=source, name=name)

    # One tooltip row per column. The braces keep the field reference valid
    # for column names that contain spaces or other special characters.
    tooltips = [(c, '@{' + c + '}') for c in df.columns]
    tooltips.append(('index', '$index'))

    # Attach the tool to the renderer object itself: the name-based
    # ``names=`` filter of HoverTool is gone in Bokeh 3, whereas
    # ``renderers=`` is accepted by old and new versions alike.
    hover = HoverTool(renderers=[renderer], tooltips=tooltips)
    fig.add_tools(hover)
    return fig
In [88]:
from bokeh.plotting import output_notebook, show
# Route Bokeh output into the notebook, then build and display the
# interactive scatter plot of the embedding.
output_notebook()
fig = scatter_with_hover(T_df, "x", "y")
show(fig)
In [58]:
%matplotlib inline
from matplotlib import cm
import matplotlib.pyplot as plt
# Colour every point by its type. The index of distance_df carries the type
# labels (taken from `t.fqn` in the query), which cannot be divided directly,
# so they are first mapped to integer codes 0..n-1.
type_codes = pd.factorize(distance_df.index)[0]
# Scale into [0, 1): hsv is a cyclic colormap, so letting the last type
# reach 1.0 would give it the same colour as the first one.
types = type_codes / float(max(len(type_codes), 1))
colors = list(cm.hsv(types))
plt.figure(figsize=(5, 5))
plt.scatter(T[:, 0], T[:, 1], c=colors)