In [3]:
import os
# Directory holding the saved reconstruction-error summaries (one .npy per run).
dir_name = '../results/reconstruction_error/results'
# os.listdir returns entries in arbitrary, platform-dependent order; sort them so
# the printed listing is reproducible from one run/machine to the next.
files = sorted(os.listdir(dir_name))
print(files)


['bottom_up_general_1.npy', 'bottom_up_general_132.npy', 'bottom_up_general_255.npy', 'bottom_up_general_485.npy', 'bottom_up_general_63.npy', 'bottom_up_general_899.npy', 'bottom_up_general_93.npy', 'bottom_up_split_call_1.npy', 'bottom_up_split_call_1272.npy', 'bottom_up_split_call_143.npy', 'bottom_up_split_call_309.npy', 'bottom_up_split_call_59.npy', 'bottom_up_split_call_633.npy', 'bottom_up_split_call_93.npy', 'hierarchical_111.npy', 'hierarchical_2.npy', 'hierarchical_326.npy', 'homework_bottom_up_general_1.npy', 'homework_bottom_up_general_1306.npy', 'homework_bottom_up_general_144.npy', 'homework_bottom_up_general_205.npy', 'homework_bottom_up_general_348.npy', 'homework_bottom_up_general_659.npy', 'homework_bottom_up_general_90.npy', 'homework_bottom_up_split_call_1.npy', 'homework_bottom_up_split_call_153.npy', 'homework_bottom_up_split_call_1790.npy', 'homework_bottom_up_split_call_207.npy', 'homework_bottom_up_split_call_403.npy', 'homework_bottom_up_split_call_80.npy', 'homework_bottom_up_split_call_857.npy', 'homework_hierarchical_174.npy', 'homework_hierarchical_2.npy', 'homework_hierarchical_507.npy', 'homework_kmeans_1.npy', 'homework_kmeans_10.npy', 'homework_kmeans_100.npy', 'homework_kmeans_1000.npy', 'homework_kmeans_200.npy', 'homework_kmeans_500.npy', 'homework_kmeans_700.npy', 'kmeans_1.npy', 'kmeans_10.npy', 'kmeans_100.npy', 'kmeans_1000.npy', 'kmeans_200.npy', 'kmeans_500.npy', 'kmeans_700.npy']

In [4]:
from nbminer.results.reconstruction_error.astor_error import AstorError
# Load every saved summary from the results directory through AstorError.
# `results_dir` is the object the plotting cell below iterates by key.
summaries_path = '../results/reconstruction_error/results'
ae = AstorError()
results_dir = ae.load_summaries(summaries_path)

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
# Plot one curve per result family: number of templates (x) against the
# selected summary column (y).
plt.rcParams['figure.figsize'] = (10, 10)

# Column of each summary record used for the y axis; `y_label` must describe it.
# NOTE(review): column 0 is labelled as similarity here -- confirm against
# AstorError's summary layout before switching to the alternative label below.
val = 0
y_label = 'Reconstruction Similarity'
#y_label = 'Reconstruction Distance'

# Only the first few (smallest) points of every series are drawn.
max_points = 5

for key in results_dir.keys():
    # Sort once and reuse the ordering for both axes (previously sorted twice).
    ordered = sorted(results_dir[key])
    x = [el[1][3] for el in ordered]    # column 3 is plotted as 'Templates'
    y = [el[1][val] for el in ordered]
    # Echo the full (untruncated) series so the raw numbers sit next to the figure.
    print(x)
    print(y)
    plt.plot(x[:max_points], y[:max_points], label=key)

plt.xlabel('Templates')
plt.ylabel(y_label)
plt.legend()
plt.show()


[1.0, 63.0, 93.0, 132.0, 255.0, 485.0, 899.0]
[0.0, 11.419386025672779, 12.918756031484381, 14.408670645973372, 18.207177526916581, 23.033699811491253, 28.673621643303324]
[1.0, 59.0, 93.0, 143.0, 309.0, 633.0, 1272.0]
[0.0, 9.5165519343260794, 11.148776577530255, 12.846796022620401, 17.515993102913072, 23.792586641283147, 30.997285358013087]
[2.0, 111.0, 326.0]
[14.950740202045028, 19.002279195539899, 24.721859731083853]
[1.0, 90.0, 144.0, 205.0, 348.0, 659.0, 1306.0]
[0.0, 10.639320597045618, 12.855680712899211, 14.203275281009498, 17.271290804694651, 21.602460298007912, 30.25748576184726]
[1.0, 80.0, 153.0, 207.0, 403.0, 857.0, 1790.0]
[0.0, 8.4808025058687218, 11.122467087021043, 12.437342573395313, 16.443636072958345, 22.51618893625605, 32.660081512598843]
[2.0, 174.0, 507.0]
[18.819948115476031, 22.905364686028012, 28.112882050377593]
[1.0, 10.0, 100.0, 200.0, 500.0, 700.0, 1000.0]
[16.920078565241141, 21.573622354220063, 28.478194569340186, 31.670310062886141, 37.156175149406657, 39.623364241870405, 41.92009995989897]
[1.0, 10.0, 100.0, 200.0, 500.0, 700.0, 1000.0]
[14.751064689460042, 17.608177899201841, 23.098107295158137, 25.510456498220258, 29.155493577203792, 30.615054460037069, 32.054761881468316]

In [ ]:


In [7]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder

# Sweep the KMeans cluster count, running the full pipeline once per setting.
# NOTE(review): `Features` and `notebook_objs` are not imported or defined in the
# visible part of this notebook -- this cell relies on state from elsewhere.
cluster_counts = [1000, 700, 500, 200, 100, 10, 1]

for value in cluster_counts:
    print('Calculating for value: ', value)

    # Fresh feature container and fresh stage objects for every run.
    a = Features(notebook_objs)
    gastf, rbn, gi, fe = GetASTFeatures(), ResampleByNode(), GetImports(), FeatureEncoding()
    ke = KmeansEncoder(n_clusters=value)
    ae = AstorError()

    # Stage order is preserved exactly: AST features -> resample -> imports ->
    # feature encoding -> KMeans encoding -> reconstruction error.
    stages = [gastf, rbn, gi, fe, ke, ae]
    pipe = Pipeline(stages)
    a = pipe.transform(a)


Calculating for value:  1000
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1520246a58>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1531083f98>
<nbminer.preprocess.get_imports.GetImports object at 0x1531083f60>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1521d35320>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x151dde1390>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1538341400>
[41.274720853032896]
[32.040070227847224]
[1.0]
[1000]
Calculating for value:  700
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1512a8d940>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x151e906198>
<nbminer.preprocess.get_imports.GetImports object at 0x15358e0cf8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x153cde9630>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1523f28f28>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x153cde4828>
[41.274720853032896, 44.70365154411025]
[32.040070227847224, 30.512287039379256]
[1.0, 1.0]
[1000, 700]
Calculating for value:  500
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x151136e9b0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x15381db630>
<nbminer.preprocess.get_imports.GetImports object at 0x15381db470>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1536a426a0>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1536a42908>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1536a42e48>
[41.274720853032896, 44.70365154411025, 45.89950709184187]
[32.040070227847224, 30.512287039379256, 29.104695582394044]
[1.0, 1.0, 1.0]
[1000, 700, 500]
Calculating for value:  200
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x152a02dc88>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1529eb1198>
<nbminer.preprocess.get_imports.GetImports object at 0x1524b08dd8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1527378208>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1527378390>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1527378160>
[41.274720853032896, 44.70365154411025, 45.89950709184187, 53.49708278845186]
[32.040070227847224, 30.512287039379256, 29.104695582394044, 25.129859103392896]
[1.0, 1.0, 1.0, 1.0]
[1000, 700, 500, 200]
Calculating for value:  100
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x152fb02ac8>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1531b43710>
<nbminer.preprocess.get_imports.GetImports object at 0x1537985f98>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1524725588>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1537c0d748>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x153c9b3240>
[41.274720853032896, 44.70365154411025, 45.89950709184187, 53.49708278845186, 57.275123227039536]
[32.040070227847224, 30.512287039379256, 29.104695582394044, 25.129859103392896, 22.94207881262498]
[1.0, 1.0, 1.0, 1.0, 1.0]
[1000, 700, 500, 200, 100]
Calculating for value:  10
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x15396ef4e0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x153da81908>
<nbminer.preprocess.get_imports.GetImports object at 0x153d147320>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x153d147358>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x151d13dba8>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x151d13d630>
[41.274720853032896, 44.70365154411025, 45.89950709184187, 53.49708278845186, 57.275123227039536, 65.94054924051906]
[32.040070227847224, 30.512287039379256, 29.104695582394044, 25.129859103392896, 22.94207881262498, 17.460419982770787]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1000, 700, 500, 200, 100, 10]
Calculating for value:  1
<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1525f10198>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x152b0b1e48>
<nbminer.preprocess.get_imports.GetImports object at 0x151cc0acf8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x151cc0a940>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1531ae5ef0>
<nbminer.results.reconstruction_error.astor_error.AstorError object at 0x1531ae5b00>
[41.274720853032896, 44.70365154411025, 45.89950709184187, 53.49708278845186, 57.275123227039536, 65.94054924051906, 70.79916507393622]
[32.040070227847224, 30.512287039379256, 29.104695582394044, 25.129859103392896, 22.94207881262498, 17.460419982770787, 14.834598151160746]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
[1000, 700, 500, 200, 100, 10, 1]

In [ ]:
# Group the per-method series by metric. Each resulting list is ordered
# [split-call, general, kmeans].
# NOTE(review): none of these input names are defined in the visible part of
# the notebook -- they must come from cells that are not shown here.
split_call_series = (avg_dist, avg_sim, coverage, number_templates)
general_series = (avg_dist_general, avg_sim_general, coverage_general, number_templates_general)
kmeans_series = (avg_dist_kmeans, avg_sim_kmeans, coverage_kmeans, number_kmeans)

# Transpose method-major tuples into metric-major lists.
dists, sims, covers, numbers = (
    list(per_metric)
    for per_metric in zip(split_call_series, general_series, kmeans_series)
)

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 20)

# Number of leading points drawn for every method's curve.
N_POINTS = 6
# Index of the entry in the "general" series that is drawn as the flat 'Bound' line.
BOUND_INDEX = 6
# x positions of the 'Bound' line; only the span 0..1200 matters for a flat line.
BOUND_XS = [0, 1, 2, 3, 4, 1200]


def plot_metric(ax, title, split_call, general, kmeans):
    """Draw one comparison panel on ``ax``.

    Plots the first ``N_POINTS`` values of the three method series against
    their respective template counts, adds a horizontal 'Bound' line at
    ``general[BOUND_INDEX]``, and labels the panel.

    Parameters:
        ax: matplotlib Axes to draw on.
        title: text used for both the panel title and the y-axis label.
        split_call: metric values for the split-function-call method.
        general: metric values for the all-calls-the-same method.
        kmeans: metric values for the KMeans method.

    NOTE(review): the x series (``number_templates``, ``number_templates_general``,
    ``number_kmeans``) are read from notebook globals that are not defined in
    the visible cells.
    """
    series = [
        (number_templates, split_call, 'Split function calls'),
        (number_templates_general, general, 'All calls are the same'),
        (number_kmeans, kmeans, 'KMeans'),
    ]
    handles = []
    for xs, ys, label in series:
        line, = ax.plot(xs[:N_POINTS], ys[:N_POINTS], label=label)
        handles.append(line)

    # Constant-height reference line across the whole x range.
    bound = general[BOUND_INDEX]
    line, = ax.plot(BOUND_XS, [bound] * len(BOUND_XS), label='Bound')
    handles.append(line)

    ax.set_title(title)
    ax.set_xlabel('Number of templates')
    ax.set_ylabel(title)
    ax.legend(handles=handles)


fig, axes = plt.subplots(3)

# One row per panel: (title, split-call series, general series, kmeans series).
panels = [
    ('Average edit distance', avg_dist, avg_dist_general, avg_dist_kmeans),
    ('Average matching characters', avg_sim, avg_sim_general, avg_sim_kmeans),
    ('Coverage of templates', coverage, coverage_general, coverage_kmeans),
]
for ax, (title, split_call, general, kmeans) in zip(axes, panels):
    plot_metric(ax, title, split_call, general, kmeans)


Out[14]:
<matplotlib.legend.Legend at 0x151237be80>

In [ ]: