In [1]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
In [2]:
# Collect every .ipynb file from each student directory under testbed/Final,
# then wrap each path in a NotebookMiner for downstream analysis.
people = os.listdir('../testbed/Final')
notebooks = []
for entry in people:
    person_dir = os.path.join('../testbed/Final', entry)
    if not os.path.isdir(person_dir):
        continue  # skip stray files at the top level
    notebooks.extend(
        os.path.join(person_dir, fname)
        for fname in os.listdir(person_dir)
        if fname.endswith('.ipynb')
    )
notebook_objs = [NotebookMiner(file) for file in notebooks]
In [3]:
# Loading in the two corpuses
# Load the homework corpus: every file directly under ../hw_corpus.
hw_dir = '../hw_corpus'
notebooks = [os.path.join(hw_dir, fname) for fname in os.listdir(hw_dir)]
hw_notebook_objs = [NotebookMiner(path) for path in notebooks]
In [4]:
# Group homework notebooks by student. The student name is the text before the
# first '_' in the third path component (paths look like '../hw_corpus/<name>_...').
person_to_notebooks = {}
for nb in hw_notebook_objs:
    person = nb.filename.split('/')[2].split('_')[0]
    person_to_notebooks.setdefault(person, []).append(nb)
In [5]:
# Number of distinct students in the homework corpus.
# len(dict) counts keys directly; no need to materialize a list of them.
print(len(person_to_notebooks))
In [6]:
# Number of entries (students) in the Final directory, for comparison.
final_entry_count = len(os.listdir('../testbed/Final'))
print(final_entry_count)
Looks like there are 176 students in the Final directory but only 56 in the homework directory. Furthermore, there were actually 60 repos in hw_corpus, so 4 of them apparently contain no notebooks. A representative example: cyriaquebrousse.
In [7]:
# Keep exactly one homework notebook per student: the one with the most cells.
# Using max() with a key always returns a notebook (every list here is
# non-empty, since keys are only created on append). The previous version
# started from cur_max = 0 with a strict '>' comparison, so a student whose
# notebooks all had zero cells contributed None to the list.
max_hw_notebook_objs = []
for person_nbs in person_to_notebooks.values():
    max_hw_notebook_objs.append(
        max(person_nbs, key=lambda nb: nb.get_number_cells())
    )
In [8]:
from nbminer.stats.multiple_summary import MultipleSummary

# Side-by-side statistics for the two corpora: notebook counts, average cell
# counts, and average lines of code. Output order matches the metric-by-metric
# Final/Homework interleaving.
hw_summary = MultipleSummary(max_hw_notebook_objs)
final_summary = MultipleSummary(notebook_objs)
summaries = (('Final', final_summary), ('Homework', hw_summary))

for label, summ in summaries:
    print("Number of " + label + " notebooks: ", len(summ.summary_vec))
for label, summ in summaries:
    print("Average number of cells, " + label + ": ", summ.average_number_of_cells())
for label, summ in summaries:
    print("Average lines of code, " + label + ": ", summ.average_lines_of_code())
In [9]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity

# Tag the two corpora so the similarity step can compare within vs. across groups:
# group_1 = homework (one notebook per student), group_2 = finals.
a = Features(max_hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# AST features -> per-node resampling -> imports -> feature encoding ->
# k-means template assignment -> Jaccard similarity between notebooks.
# `njs` stays named because the next cell reads its results.
njs = NotebookJaccardSimilarity()
pipe = Pipeline([
    GetASTFeatures(),
    ResampleByNode(),
    GetImports(),
    FeatureEncoding(),
    KmeansEncoder(n_clusters=100),
    njs,
])
a = pipe.transform(a)
In [10]:
import numpy as np

# Mean/std of Jaccard similarity for notebook pairs inside group_1 versus
# pairs spanning the two groups.
intra, inter = njs.group_average_jaccard_similarity('group_1')
for label, values in (('within', intra), ('outside', inter)):
    arr = np.array(values)
    print('Mean ' + label + ' group: ', np.mean(arr))
    print('STD ' + label + ' group: ', np.std(arr))
In [11]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier

# Rebuild the feature container with the same group labels as before.
a = Features(max_hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# Same preprocessing chain, but finishing with a corpus-identification
# classifier. `ke` and `ci` stay named because later cells query them
# (ci.predict(), ke.templates).
ke = KmeansEncoder(n_clusters=100)
ci = CorpusIdentifier()
pipe = Pipeline([
    GetASTFeatures(),
    ResampleByNode(),
    GetImports(),
    FeatureEncoding(),
    ke,
    ci,
])
a = pipe.transform(a)
In [66]:
%matplotlib inline
import matplotlib.pyplot as plt
fpr, tpr, m = ci.predict()
print(m)
plt.plot(fpr, tpr)
Out[66]:
In [49]:
# Count how often each k-means template occurs in each corpus.
# NOTE(review): the group label is read from the 'import_name' feature --
# verify that this feature actually stores 'group_1'/'group_2'.
template_counter = {'group_1': {}, 'group_2': {}}
for i in range(a.get_number_notebooks()):
    group = a.get_notebook(i).get_feature('import_name')
    for seg in a.get_notebook_segments(i):
        templ = seg.get_feature('template')
        # PEP 8: compare against None with identity, not equality.
        if templ is not None:
            counts = template_counter[group]
            counts[templ] = counts.get(templ, 0) + 1
In [58]:
# Show per-template counts side by side (group_1 = homework, group_2 = finals).
# Use .get(..., 0) so a template seen only in group_1 prints 0 for group_2
# instead of raising KeyError.
for templ, g1_count in template_counter['group_1'].items():
    print(g1_count, template_counter['group_2'].get(templ, 0))
In [67]:
# For each template, record what fraction of its occurrences came from
# group_1 (homework), plus the total count so rare templates can be
# filtered out later. Also accumulate overall totals per group.
percentages = []
total_sum_1 = 0
total_sum_2 = 0
for templ, v1 in template_counter['group_1'].items():
    # A template may never occur in group_2; default to 0 instead of KeyError.
    v2 = template_counter['group_2'].get(templ, 0)
    total_sum_1 += v1
    total_sum_2 += v2
    # v1 >= 1 for every key here, so v1 + v2 can never be zero.
    percentages.append((v1 / (v1 + v2), templ, v1 + v2))
In [68]:
# Overall share of template occurrences contributed by the homework corpus.
group_1_share = total_sum_1 / (total_sum_1 + total_sum_2)
print(group_1_share)
In [47]:
# astor was previously imported only in a LATER cell (In[28]), so this cell
# failed under Restart & Run All; import it here so the cell is self-contained.
import astor

# Smaller fraction -> template occurs relatively more often in group 2
# (aka, the finals). Only show templates with more than 20 total occurrences.
for frac, templ, total in sorted(percentages):
    if total > 20:
        print((frac, templ, total))
        print(astor.to_source(ke.templates.get_random_example(templ)))
In [28]:
import astor

# Spot-check a few specific templates by decompiling a random example of each.
# A loop replaces four copy-pasted print statements.
for templ in ('template_89', 'template_64', 'template_73', 'template_5'):
    print(astor.to_source(ke.templates.get_random_example(templ)))
In [ ]:
In [ ]: