In [1]:
import sys
home_directory = '/dfs/scratch2/fcipollone'
sys.path.append(home_directory)
import numpy as np
from nbminer.notebook_miner import NotebookMiner
hw_filenames = np.load('../homework_names_jplag_combined_per_student.npy')
hw_notebooks = [[NotebookMiner(filename) for filename in temp[:59]] for temp in hw_filenames]
In [2]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.ast_graph.postorder_hash import PostorderHash
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
a = Features(hw_notebooks[2], 'hw2')
a.add_notebooks(hw_notebooks[3], 'hw3')
a.add_notebooks(hw_notebooks[4], 'hw4')
a.add_notebooks(hw_notebooks[5], 'hw5')
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
ph = PostorderHash(None, threshold=0, split_call=True, hash_many=True)
ci = CorpusIdentifier()
pipe = Pipeline([gastf, rbn, gi, ph, ci])
a = pipe.transform(a)
In [3]:
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
X, y = ci.get_data_set()
countvec = sklearn.feature_extraction.text.CountVectorizer()
X_list = [" ".join(el) for el in X]
countvec.fit(X_list)
X = countvec.transform(X_list)
p = np.random.permutation(len(X.todense()))
X = X.todense()[p]
y = np.array(y)[p]
clfAdaboost = sklearn.ensemble.AdaBoostClassifier(n_estimators=100)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=400, max_depth=4)
scores = cross_val_score(clf, X, y, cv=8)
print(scores)
print(np.mean(scores))
In [4]:
X_list[0]
Out[4]:
In [ ]: