In [29]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
In [30]:
# Gather every '.ipynb' file that sits directly inside a sub-directory of
# '../testbed/Final', wrap each path in a NotebookMiner, and build the
# ASTFeatures object `a` that the following cells operate on.
#
# os.listdir returns entries in arbitrary order, so both listings are sorted
# to make the notebook order (and anything derived from it) reproducible.
people = sorted(os.listdir('../testbed/Final'))
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    # Plain files at the top level are skipped; only directories are searched.
    if os.path.isdir(person):
        direc = sorted(os.listdir(person))
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
# `path` rather than `file`: avoids shadowing the Python 2 builtin.
notebook_objs = [NotebookMiner(path) for path in notebooks]
a = ASTFeatures(notebook_objs)
In [31]:
# Overwrite every slot of a.nb_features, in place, with the object returned
# by that entry's get_new_notebook() method.
# NOTE(review): re-running this cell calls get_new_notebook() on the already
# replaced entries — confirm that is harmless before re-executing.
for idx in range(len(a.nb_features)):
    a.nb_features[idx] = a.nb_features[idx].get_new_notebook()
In [ ]:
In [32]:
# Build an ASTGraphReducer over the features object `a` (threshold 5,
# split_call enabled), generate its graphs, and print three labelled
# statistics followed by the bare node count.
agr = ASTGraphReducer(a, threshold=5, split_call=True)
agr.set_graphs()
num_nodes_orig = []
# Each pair is (caption, bound method); the method is called right before
# its line is printed, so the call/print order matches a straight-line version.
for caption, stat in (
    ('Total number of graphs:', agr.number_graphs),
    ('Total number of graphs with one node:', agr.number_single),
    ('Total number of nodes:', agr.count_nodes),
):
    print (caption, stat())
print (agr.count_nodes())
In [33]:
# Print the reducer's node count once more, immediately before encoding.
print (agr.count_nodes())
# Run the reducer's encode step. As the last expression of the cell, any
# non-None return value is what the notebook displays as this cell's output.
agr.encode()
In [34]:
# Run the reducer's decode step on the same `agr` object that was encoded.
# NOTE(review): presumably this reverses agr.encode() — confirm against nbminer.
agr.decode()
In [36]:
# Print whatever agr.get_samples_str() returns for the encoded/decoded reducer.
print(agr.get_samples_str())
In [8]:
from nbminer.reconstruction_error.astor_error import AstorError
In [9]:
# Build an AstorError over the features object `a` and print its summary.
# NOTE(review): this cell relies on `a` (and on the encode/decode work done
# through `agr`) from earlier cells, yet its execution count (In [9]) is lower
# than theirs (In [30]..In [36]) — the notebook was run out of order; verify
# with Restart Kernel -> Run All.
ae = AstorError(a)
print (ae.get_summary())
In [ ]:
In [10]:
# Threshold sweep with split_call=True.
# For every threshold the notebooks are reloaded, reduced, encoded and decoded,
# and four AstorError metrics are appended to the lists below. The plotting
# cell reads the first six entries of each list.
coverage = []
number_templates = []
avg_dist = []
avg_sim = []

# The directory scan does not depend on the threshold, so it is done once
# instead of once per iteration. Listings are sorted because os.listdir
# returns entries in arbitrary order.
people = sorted(os.listdir('../testbed/Final'))
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = sorted(os.listdir(person))
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])

# Never read in this cell; kept only so the name stays defined as before.
num_nodes_orig = []

for value in [2,5,10,20,30,50,10000]:
    print ('Calculating for value: ',value)
    # The miner/feature objects are rebuilt for every threshold, as before.
    # NOTE(review): presumably because encode()/decode() modify them — confirm.
    notebook_objs = [NotebookMiner(path) for path in notebooks]
    a = ASTFeatures(notebook_objs)
    for i, nb in enumerate(a.nb_features):
        a.nb_features[i] = nb.get_new_notebook()
    agr = ASTGraphReducer(a, threshold=value, split_call=True)
    agr.set_graphs()
    agr.encode()
    agr.decode()
    ae = AstorError(a)
    avg_dist.append(ae.average_distance())
    avg_sim.append(ae.average_similarity())
    coverage.append(ae.get_percent_coverage())
    number_templates.append(ae.get_unique_templates())
In [ ]:
In [11]:
# Threshold sweep with split_call=False.
# For every threshold the notebooks are reloaded, reduced, encoded and decoded,
# and four AstorError metrics are appended to the *_general lists below. The
# plotting cell reads the first six entries of each list as a curve and the
# seventh (threshold 10000) as the flat 'Bound' line.
coverage_general = []
number_templates_general = []
avg_dist_general = []
avg_sim_general = []

# The directory scan does not depend on the threshold, so it is done once
# instead of once per iteration. Listings are sorted because os.listdir
# returns entries in arbitrary order.
people = sorted(os.listdir('../testbed/Final'))
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = sorted(os.listdir(person))
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])

# Never read in this cell; kept only so the name stays defined as before.
num_nodes_orig = []

for value in [2,5,10,20,30,50, 10000]:
    print ('Calculating for value: ',value)
    # The miner/feature objects are rebuilt for every threshold, as before.
    # NOTE(review): presumably because encode()/decode() modify them — confirm.
    notebook_objs = [NotebookMiner(path) for path in notebooks]
    a = ASTFeatures(notebook_objs)
    for i, nb in enumerate(a.nb_features):
        a.nb_features[i] = nb.get_new_notebook()
    agr = ASTGraphReducer(a, threshold=value, split_call=False)
    agr.set_graphs()
    agr.encode()
    agr.decode()
    ae = AstorError(a)
    avg_dist_general.append(ae.average_distance())
    avg_sim_general.append(ae.average_similarity())
    coverage_general.append(ae.get_percent_coverage())
    number_templates_general.append(ae.get_unique_templates())
In [39]:
%matplotlib inline
import matplotlib.pyplot as plt
# 2x2 figure: one panel per metric, each showing the split_call=True curve,
# the split_call=False curve, and a flat 'Bound' line.
plt.rcParams['figure.figsize'] = (20, 20)
# Must match the first six thresholds used by the two sweep cells.
x = [2,5,10,20,30,50]
fig, axes = plt.subplots(2,2)

# (grid position, split-call series, general series, legend prefix,
#  text used for both the panel title and the y-axis label)
panels = [
    ((0, 0), avg_dist, avg_dist_general, 'Average Distance', 'Average edit distance'),
    ((0, 1), avg_sim, avg_sim_general, 'Average Similarity', 'Average matching characters'),
    ((1, 0), coverage, coverage_general, 'Coverage', 'Coverage of templates'),
    ((1, 1), number_templates, number_templates_general, 'Number of Templates', 'Number of templates'),
]

for position, split_series, general_series, legend_prefix, caption in panels:
    ax = axes[position]
    n1, = ax.plot(x, split_series[:len(x)], label = legend_prefix + ' (Split function calls)')
    n2, = ax.plot(x, general_series[:len(x)], label = legend_prefix + ' (All calls are the same)')
    # The entry right after the plotted thresholds (the 10000 run of the
    # split_call=False sweep) is drawn as a horizontal reference line.
    # NOTE(review): only the *_general value is used as the bound; the
    # corresponding split_call=True value is computed but never plotted.
    n3, = ax.plot(x, [general_series[len(x)]] * len(x), label = 'Bound')
    ax.set_title(caption)
    ax.set_xlabel('Threshold for Template Creation')
    ax.set_ylabel(caption)
    ax.legend(handles=[n1,n2,n3])
# Ending on the loop (not a bare expression) keeps the cell from echoing a
# Legend repr as its Out[] value; the figure itself is still rendered inline.
Out[39]:
In [ ]: