In [1]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *
# Root directory that is scanned one level deep for sub-directories containing .ipynb files.
# NOTE(review): presumably one sub-directory per person/author, going by the variable names — confirm.
NOTEBOOK_ROOT = '../testbed/Final'

# os.listdir() returns entries in arbitrary order. Sort at both levels so that the
# [:20] / [20:40] slices below select the same notebooks on every run and every machine.
people = sorted(os.listdir(NOTEBOOK_ROOT))

notebooks = []
for person in people:
    person_dir = os.path.join(NOTEBOOK_ROOT, person)
    if not os.path.isdir(person_dir):
        # Skip plain files that sit next to the sub-directories.
        continue
    direc = sorted(os.listdir(person_dir))
    notebooks.extend(
        os.path.join(person_dir, filename)
        for filename in direc
        if filename.endswith('.ipynb')
    )

# The first 20 discovered notebooks form one group and the next 20 a second group.
# If fewer than 40 notebooks exist, slicing silently yields shorter (possibly empty) groups.
notebook_objs = [NotebookMiner(path) for path in notebooks[:20]]
notebook_objs_2 = [NotebookMiner(path) for path in notebooks[20:40]]
In [2]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.hierarchical_encoder import HierarchicalEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
# One Features container holds both notebook groups, each under its own label.
a = Features(notebook_objs, 'group_1')
a.add_notebooks(notebook_objs_2, 'group_2')

# Stage objects, created in the same order in which they are handed to the Pipeline.
# NOTE(review): what each stage computes is defined in nbminer, not in this notebook.
gastf, rbn, gi = GetASTFeatures(), ResampleByNode(), GetImports()
fe, he = FeatureEncoding(), HierarchicalEncoder()
# Kept under its own name because later cells query this stage directly.
njs = NotebookJaccardSimilarity()

pipe = Pipeline([gastf, rbn, gi, fe, he, njs])
# Rebinds `a` to whatever the pipeline returns for the Features container.
a = pipe.transform(a)
In [ ]:
In [3]:
# Query the similarity stage that was passed through the pipeline in the cell above.
# As the last expression in the cell, its return value becomes the cell output.
# NOTE(review): the meaning of the argument 8 is not visible in this notebook — confirm
# against NotebookJaccardSimilarity.average_jaccard_similarity before changing it.
njs.average_jaccard_similarity(8)
Out[3]:
In [8]:
import numpy as np
# Fetch the two similarity collections for 'group_2'. The labels printed below treat the
# first as "within group" and the second as "outside group".
intra, inter = njs.group_average_jaccard_similarity('group_2')

# (mean label, std label, values) triples, listed in the order they are printed.
similarity_report = (
    ('Mean within group: ', 'STD within group: ', intra),
    ('Mean outside group: ', 'STD outside group: ', inter),
)
for mean_label, std_label, group_values in similarity_report:
    group_scores = np.array(group_values)
    print(mean_label, group_scores.mean())
    print(std_label, group_scores.std())
In [ ]: