In [1]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *

In [2]:
# Collect every .ipynb file that sits directly inside a sub-directory of the
# testbed root, then wrap each path in a NotebookMiner.
testbed_dir = '../testbed/Final'

# os.listdir returns entries in arbitrary order; sort both listings so the
# notebook order (and anything downstream that depends on it) is the same on
# every run and every machine.
people = sorted(os.listdir(testbed_dir))
notebooks = []
for person in people:
    person_dir = os.path.join(testbed_dir, person)
    # Skip stray files at the top level; only directories are searched.
    if not os.path.isdir(person_dir):
        continue
    notebooks.extend(
        os.path.join(person_dir, filename)
        for filename in sorted(os.listdir(person_dir))
        if filename.endswith('.ipynb')
    )
notebook_objs = [NotebookMiner(path) for path in notebooks]

In [3]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError

# Sweep the ASTGraphReducer threshold with split_call=True and record, for
# each threshold, the four metrics reported by the AstorError stage.
avg_dist, avg_sim, coverage, number_templates = [], [], [], []
for threshold in (2, 5, 10, 20, 30, 50, 10000):
    print('Calculating for value: ', threshold)
    # A fresh Features object and fresh stages are built for every run.
    features = Features(notebook_objs)
    ast_stage = GetASTFeatures()
    resample_stage = ResampleByNode()
    reducer_stage = ASTGraphReducer(features, threshold=threshold, split_call=True)
    error_stage = AstorError()
    pipeline = Pipeline([ast_stage, resample_stage, reducer_stage, error_stage])
    transformed = pipeline.transform(features)
    # Metrics are queried in a fixed order: distance, similarity, coverage,
    # number of unique templates.
    for series, metric in (
        (avg_dist, error_stage.average_distance),
        (avg_sim, error_stage.average_similarity),
        (coverage, error_stage.get_percent_coverage),
        (number_templates, error_stage.get_unique_templates),
    ):
        series.append(metric())

# Dump the collected series, one list per line.
for series in (avg_dist, avg_sim, coverage, number_templates):
    print(series)


Calculating for value:  2
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x10a261588>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x10adccb00>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x10adcc828>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x10adccdd8>
Calculating for value:  5
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x153486d2e8>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x15346963c8>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x1534696b70>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1532b70da0>
Calculating for value:  10
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x151bdf1278>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x152bd70be0>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x152cf83e48>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x152cf83198>
Calculating for value:  20
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x152de3df98>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x10c699550>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x10c63b240>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x10c63bd68>
Calculating for value:  30
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x153d3619e8>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x10d00f6a0>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x10d2029e8>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x10d2026a0>
Calculating for value:  50
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1531bce4a8>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x152a998e48>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x152a998c88>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x151fc17d30>
Calculating for value:  10000
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1512a13588>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x151b9a8828>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x151a486da0>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x152897ea20>
[35.496982194950206, 42.55120209234484, 47.99733427220602, 52.06825269087617, 53.51855950105623, 54.97117996177447, 60.32411226234785]
[31.06662564557202, 23.719788178471433, 17.54467619957786, 12.923137776717137, 11.234805453601139, 9.463230399808078, 0.0]
[0.8525299265667438, 0.7391610501961573, 0.6337893572075244, 0.5289206317271904, 0.47887536465144354, 0.424756060758475, 0.0]
[1272, 633, 309, 143, 93, 59, 1]

In [4]:
# Same threshold sweep as the split_call=True run, but with split_call=False
# (labelled 'All calls are the same' in the plots further down).
avg_dist_general, avg_sim_general = [], []
coverage_general, number_templates_general = [], []
for threshold in (2, 5, 10, 20, 30, 50, 10000):
    print('Calculating for value: ', threshold)
    # A fresh Features object and fresh stages are built for every run.
    features = Features(notebook_objs)
    ast_stage = GetASTFeatures()
    resample_stage = ResampleByNode()
    reducer_stage = ASTGraphReducer(features, threshold=threshold, split_call=False)
    error_stage = AstorError()
    pipeline = Pipeline([ast_stage, resample_stage, reducer_stage, error_stage])
    transformed = pipeline.transform(features)
    # Metrics are queried in a fixed order: distance, similarity, coverage,
    # number of unique templates.
    for series, metric in (
        (avg_dist_general, error_stage.average_distance),
        (avg_sim_general, error_stage.average_similarity),
        (coverage_general, error_stage.get_percent_coverage),
        (number_templates_general, error_stage.get_unique_templates),
    ):
        series.append(metric())


Calculating for value:  2
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x10a272eb8>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1534019630>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x153dcd9518>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1532214cf8>
Calculating for value:  5
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x152fcaea58>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1534c2db70>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x15233d6da0>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x15371def98>
Calculating for value:  10
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x10bfe1b38>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1522de5588>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x151e331a90>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1534eabe10>
Calculating for value:  20
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x153aac6390>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x151da4b828>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x151da4b940>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x15317e0550>
Calculating for value:  30
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1530b7c400>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x15202c0b38>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x15308689e8>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1534a2bd30>
Calculating for value:  50
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x151babf8d0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x152b8b51d0>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x1534a719e8>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x152dfd82e8>
Calculating for value:  10000
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1518b05d30>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1531e9af28>
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x1531e9a860>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x153dc50b00>

In [5]:
# Print every collected series, one list per line: the split_call=True
# results first, then the split_call=False ("general") results.
for series in (
    avg_dist, avg_sim, coverage, number_templates,
    avg_dist_general, avg_sim_general, coverage_general, number_templates_general,
):
    print(series)


[35.496982194950206, 42.55120209234484, 47.99733427220602, 52.06825269087617, 53.51855950105623, 54.97117996177447, 60.32411226234785]
[31.06662564557202, 23.719788178471433, 17.54467619957786, 12.923137776717137, 11.234805453601139, 9.463230399808078, 0.0]
[0.8525299265667438, 0.7391610501961573, 0.6337893572075244, 0.5289206317271904, 0.47887536465144354, 0.424756060758475, 0.0]
[1272, 633, 309, 143, 93, 59, 1]
[39.31153807464038, 44.6477215571874, 49.00804748013278, 52.401418368373406, 53.27688361331858, 54.9838547429836, 60.32411226234785]
[28.68371213081397, 23.01173296620936, 18.042136985194723, 14.317219310187276, 12.884671387691785, 11.32392442722537, 0.0]
[0.8997585755960165, 0.8272306608993059, 0.7526405794185695, 0.6774469369278745, 0.6370083492606378, 0.5903329644904939, 0.0]
[899, 485, 255, 132, 93, 63, 1]

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20, 20)
x = [2,5,10,20,30,50]

fig, axes = plt.subplots(2,2)
n1, = axes[0,0].plot(x, avg_dist[:6], label = 'Average Distance (Split function calls)')
n2, = axes[0,0].plot(x, avg_dist_general[:6], label = 'Average Distance (All calls are the same)')
n3, = axes[0,0].plot(x, [avg_dist_general[6] for i in range(6)], label = 'Bound')
axes[0,0].set_title("Average edit distance")
axes[0,0].set_xlabel('Threshold for Template Creation')
axes[0,0].set_ylabel('Average edit distance')
axes[0,0].legend(handles=[n1,n2,n3])
n1, = axes[0,1].plot(x, avg_sim[:6], label = 'Average Similarity (Split function calls)')
n2, = axes[0,1].plot(x, avg_sim_general[:6], label = 'Average Similarity (All calls are the same)')
n3, = axes[0,1].plot(x, [avg_sim_general[6] for i in range(6)], label = 'Bound')
axes[0,1].set_title("Average matching characters")
axes[0,1].set_xlabel('Threshold for Template Creation')
axes[0,1].set_ylabel('Average matching characters')
axes[0,1].legend(handles=[n1,n2,n3])
n1, = axes[1,0].plot(x, coverage[:6], label = 'Coverage (Split function calls)')
n2, = axes[1,0].plot(x, coverage_general[:6], label = 'Coverage (All calls are the same)')
n3, = axes[1,0].plot(x, [coverage_general[6] for i in range(6)], label = 'Bound')
axes[1,0].set_title("Coverage of templates")
axes[1,0].set_xlabel('Threshold for Template Creation')
axes[1,0].set_ylabel('Coverage of templates')
axes[1,0].legend(handles=[n1,n2,n3])
n1, = axes[1,1].plot(x, number_templates[:6], label = 'Number of Templates (Split function calls)')
n2, = axes[1,1].plot(x, number_templates_general[:6], label = 'Number of Templates (All calls are the same)')
n3, = axes[1,1].plot(x, [number_templates_general[6] for i in range(6)], label = 'Bound')
axes[1,1].set_title("Number of templates")
axes[1,1].set_xlabel('Threshold for Template Creation')
axes[1,1].set_ylabel('Number of templates')
axes[1,1].legend(handles=[n1,n2,n3])


Out[6]:
<matplotlib.legend.Legend at 0x1511c4c6a0>

In [ ]:


In [7]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder

# Sweep the number of KMeans clusters and record the same four AstorError
# metrics that were collected for the graph-reducer sweeps.
# NOTE(review): no seed is passed to KmeansEncoder; if it is randomly
# initialised, these numbers may differ between runs -- confirm.
avg_dist_kmeans, avg_sim_kmeans, coverage_kmeans, number_kmeans = [], [], [], []
kmeans_series = (avg_dist_kmeans, avg_sim_kmeans, coverage_kmeans, number_kmeans)
for n_clusters in (1000, 700, 500, 200, 100, 10, 1):
    print('Calculating for value: ', n_clusters)
    # A fresh Features object and fresh stages are built for every run.
    features = Features(notebook_objs)
    ast_stage = GetASTFeatures()
    resample_stage = ResampleByNode()
    imports_stage = GetImports()
    encoding_stage = FeatureEncoding()
    kmeans_stage = KmeansEncoder(n_clusters = n_clusters)
    error_stage = AstorError()
    pipeline = Pipeline([ast_stage, resample_stage, imports_stage,
                         encoding_stage, kmeans_stage, error_stage])
    transformed = pipeline.transform(features)
    # Metrics are queried in a fixed order: distance, similarity, coverage,
    # number of unique templates.
    metrics = (
        error_stage.average_distance,
        error_stage.average_similarity,
        error_stage.get_percent_coverage,
        error_stage.get_unique_templates,
    )
    for series, metric in zip(kmeans_series, metrics):
        series.append(metric())
    # Show the running results after every cluster count.
    for series in kmeans_series:
        print(series)


Calculating for value:  1000
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1520246a58>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1531083f98>
<nbminer.preprocess.get_imports.GetImports object at 0x1531083f60>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1521d35320>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x151dde1390>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1538341400>
[41.274720853032896]
[32.040070227847224]
[1.0]
[1000]
Calculating for value:  700
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1512a8d940>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x151e906198>
<nbminer.preprocess.get_imports.GetImports object at 0x15358e0cf8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x153cde9630>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1523f28f28>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x153cde4828>
[41.274720853032896, 44.70365154411025]
[32.040070227847224, 30.512287039379256]
[1.0, 1.0]
[1000, 700]
Calculating for value:  500
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x151136e9b0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x15381db630>
<nbminer.preprocess.get_imports.GetImports object at 0x15381db470>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1536a426a0>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1536a42908>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1536a42e48>
[41.274720853032896, 44.70365154411025, 45.89950709184187]
[32.040070227847224, 30.512287039379256, 29.104695582394044]
[1.0, 1.0, 1.0]
[1000, 700, 500]
Calculating for value:  200
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x152a02dc88>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1529eb1198>
<nbminer.preprocess.get_imports.GetImports object at 0x1524b08dd8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1527378208>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1527378390>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1527378160>
[41.274720853032896, 44.70365154411025, 45.89950709184187, 53.49708278845186]
[32.040070227847224, 30.512287039379256, 29.104695582394044, 25.129859103392896]
[1.0, 1.0, 1.0, 1.0]
[1000, 700, 500, 200]
Calculating for value:  100
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x152fb02ac8>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1531b43710>
<nbminer.preprocess.get_imports.GetImports object at 0x1537985f98>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1524725588>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1537c0d748>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x153c9b3240>
[41.274720853032896, 44.70365154411025, 45.89950709184187, 53.49708278845186, 57.275123227039536]
[32.040070227847224, 30.512287039379256, 29.104695582394044, 25.129859103392896, 22.94207881262498]
[1.0, 1.0, 1.0, 1.0, 1.0]
[1000, 700, 500, 200, 100]
Calculating for value:  10
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x15396ef4e0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x153da81908>
<nbminer.preprocess.get_imports.GetImports object at 0x153d147320>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x153d147358>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x151d13dba8>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x151d13d630>
[41.274720853032896, 44.70365154411025, 45.89950709184187, 53.49708278845186, 57.275123227039536, 65.94054924051906]
[32.040070227847224, 30.512287039379256, 29.104695582394044, 25.129859103392896, 22.94207881262498, 17.460419982770787]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1000, 700, 500, 200, 100, 10]
Calculating for value:  1
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1525f10198>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x152b0b1e48>
<nbminer.preprocess.get_imports.GetImports object at 0x151cc0acf8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x151cc0a940>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1531ae5ef0>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1531ae5b00>
[41.274720853032896, 44.70365154411025, 45.89950709184187, 53.49708278845186, 57.275123227039536, 65.94054924051906, 70.79916507393622]
[32.040070227847224, 30.512287039379256, 29.104695582394044, 25.129859103392896, 22.94207881262498, 17.460419982770787, 14.834598151160746]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1000, 700, 500, 200, 100, 10, 1]

In [ ]:
# Group the three result sets per metric, always in the order:
# split_call=True sweep, split_call=False ("general") sweep, KMeans sweep.
dists, sims, covers, numbers = (
    [avg_dist, avg_dist_general, avg_dist_kmeans],
    [avg_sim, avg_sim_general, avg_sim_kmeans],
    [coverage, coverage_general, coverage_kmeans],
    [number_templates, number_templates_general, number_kmeans],
)

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 20)

fig, axes = plt.subplots(3)
n1, = axes[0].plot(number_templates[:6], avg_dist[:6], label = 'Split function calls')
n2, = axes[0].plot(number_templates_general[:6], avg_dist_general[:6], label = 'All calls are the same')
n3, = axes[0].plot(number_kmeans[:6], avg_dist_kmeans[:6], label = 'KMeans')
n4, = axes[0].plot([0,1,2,3,4,1200], [avg_dist_general[6] for i in range(6)], label = 'Bound')
axes[0].set_title("Average edit distance")
axes[0].set_xlabel('Number of templates')
axes[0].set_ylabel('Average edit distance')
axes[0].legend(handles=[n1,n2,n3,n4])

n1, = axes[1].plot(number_templates[:6], avg_sim[:6], label = 'Split function calls')
n2, = axes[1].plot(number_templates_general[:6], avg_sim_general[:6], label = 'All calls are the same')
n3, = axes[1].plot(number_kmeans[:6], avg_sim_kmeans[:6], label = 'KMeans')
n4, = axes[1].plot([0,1,2,3,4,1200], [avg_sim_general[6] for i in range(6)], label = 'Bound')
axes[1].set_title("Average matching characters")
axes[1].set_xlabel('Number of templates')
axes[1].set_ylabel('Average matching characters')
axes[1].legend(handles=[n1,n2,n3,n4])

n1, = axes[2].plot(number_templates[:6], coverage[:6], label = 'Split function calls')
n2, = axes[2].plot(number_templates_general[:6], coverage_general[:6], label = 'All calls are the same')
n3, = axes[2].plot(number_kmeans[:6], coverage_kmeans[:6], label = 'KMeans')
n4, = axes[2].plot([0,1,2,3,4,1200], [coverage_general[6] for i in range(6)], label = 'Bound')
axes[2].set_title("Coverage of templates")
axes[2].set_xlabel('Number of templates')
axes[2].set_ylabel('Coverage of templates')
axes[2].legend(handles=[n1,n2,n3,n4])


Out[14]:
<matplotlib.legend.Legend at 0x151237be80>

In [ ]: