In [2]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
In [3]:
# Loading in the notebooks
# Walk one level below the testbed directory and collect the path of every
# .ipynb file found inside each sub-directory.
testbed_dir = '../testbed/Final'
# os.listdir returns entries in arbitrary order; sorting keeps the
# `notebooks[:5]` sample below identical across runs and platforms.
people = sorted(os.listdir(testbed_dir))
notebooks = []
for person in people:
    person_dir = os.path.join(testbed_dir, person)
    if not os.path.isdir(person_dir):
        # Plain files sitting directly in the testbed directory are ignored.
        continue
    notebooks.extend(
        os.path.join(person_dir, filename)
        for filename in sorted(os.listdir(person_dir))
        if filename.endswith('.ipynb')
    )
# Only the first five notebooks are wrapped and handed to ASTFeatures.
notebook_objs = [NotebookMiner(path) for path in notebooks[:5]]
a = ASTFeatures(notebook_objs)
In [3]:
# Compute the Jaccard similarity between each cell and report the
# wall-clock time the computation took.
start_time = time.time()
rd, cls = a.ranked_cell_similarity()
elapsed = time.time() - start_time
print('Time elapsed: ', elapsed)
In [4]:
# Pull each similarity measure out of `rd` into its own list, with one
# entry per key of `rd` (in `rd`'s iteration order).
short_similarities = [rd[pair]['short_dict_similarity'] for pair in rd]
full_similarities = [rd[pair]['full_dict_similarity'] for pair in rd]
call_similarities = [rd[pair]['call_dict_similarity'] for pair in rd]
string_similarities = [rd[pair]['string_dict_similarity'] for pair in rd]
In [5]:
import numpy as np

# Rebind the four similarity collections to numpy arrays in one step.
short_similarities, full_similarities, call_similarities, string_similarities = (
    np.array(values)
    for values in (
        short_similarities,
        full_similarities,
        call_similarities,
        string_similarities,
    )
)
In [6]:
# Mean of each similarity measure, printed one per line in the order
# short, full, call, string.
for similarity_values in (
    short_similarities,
    full_similarities,
    call_similarities,
    string_similarities,
):
    print(np.mean(similarity_values))
In [7]:
# Report how many entries have a strictly positive string similarity, both
# as a count and as a fraction of all entries.
greater_than_0 = sum(1 for value in string_similarities if value > 0)
# Take the denominator from the same collection as the numerator; the
# count above is over string_similarities, so the total must be too.
total_length = len(string_similarities)
print('Total length: ', total_length)
print('Greater than 0: ', greater_than_0)
# With no entries at all the fraction is undefined; print NaN rather than
# raising ZeroDivisionError.
fraction_greater_than_0 = greater_than_0 / total_length if total_length else float('nan')
print('Fraction greater than 0: ', fraction_greater_than_0)
We now have a much higher fraction of cell pairs with some (nonzero) string similarity.
In [8]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
In [9]:
# Histogram of the string similarities, with exact zeros excluded.
# The figure size is set through rcParams, so it also applies to any
# figure created after this cell.
plt.rcParams['figure.figsize'] = (20, 10)
nonzero_string_similarities = [value for value in string_similarities if value != 0]
fig, ax = plt.subplots()
ax.hist(nonzero_string_similarities, bins=50)
# Label the figure so it can be read without the surrounding cells.
ax.set_title('Distribution of nonzero string similarities')
ax.set_xlabel('String similarity')
ax.set_ylabel('Count')
plt.show()
In [11]:
# Print the source code of up to `total_examples` pairs of cells whose
# string similarity is above 0.3.
# NOTE(review): this cell reads rd[...]['string_similarity'], while the
# aggregation cell earlier reads 'string_dict_similarity' — confirm which
# key is intended.
# NOTE(review): nesting was reconstructed from a flattened export; the
# counter is assumed to decrease once per printed pair.
total_examples = 10
for pair in rd:
    if not (rd[pair]['string_similarity'] > .3):
        continue
    # Skip pairs where the first cell's 'strings' feature has length below 20.
    if len(cls[int(pair[0])].get_feature('strings')) < 20:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print("CODE X:")
    for code_line in cls[int(pair[0])].get_feature('code').split('\n'):
        print(code_line)
    print("\n\nCODE Y:")
    for code_line in cls[int(pair[1])].get_feature('code').split('\n'):
        print(code_line)
    print("\n\n\n\n\n\n")
    total_examples -= 1
    if total_examples == 0:
        break
In [12]:
# Print the 'strings' feature (one item per line) of up to `total_examples`
# pairs of cells whose string similarity is above 0.5.
# NOTE(review): this cell reads rd[...]['string_similarity'], while the
# aggregation cell earlier reads 'string_dict_similarity' — confirm which
# key is intended.
# NOTE(review): nesting was reconstructed from a flattened export; the
# counter is assumed to decrease once per printed pair.
total_examples = 50
for pair in rd:
    if not (rd[pair]['string_similarity'] > .5):
        continue
    # Skip pairs where the first cell's 'strings' feature has length below 10.
    if len(cls[int(pair[0])].get_feature('strings')) < 10:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print("CODE X:")
    for string_item in cls[int(pair[0])].get_feature('strings'):
        print(string_item)
    print("\n\nCODE Y:")
    for string_item in cls[int(pair[1])].get_feature('strings'):
        print(string_item)
    print("\n\n\n\n\n\n")
    total_examples -= 1
    if total_examples == 0:
        break
In [ ]: