In [ ]:
import numpy as np
from nbminer.notebook_miner import NotebookMiner
# Load the saved homework file names; iterating the result below yields one
# collection of notebook paths per homework.
# NOTE(review): if the .npy file stores a ragged object array, recent NumPy
# needs allow_pickle=True here -- confirm against how the file was written.
hw_filenames = np.load('homework_file_names.npy')
# Wrap every path in a NotebookMiner, keeping the per-homework grouping.
hw_notebooks = [
    list(map(NotebookMiner, assignment_paths))
    for assignment_paths in hw_filenames
]
In [ ]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.encoders.cluster.hierarchical_encoder import HierarchicalEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
# Seed the feature container with the first homework, then register the
# remaining four under their own corpus labels.
a = Features(hw_notebooks[0], 'hw1')
for position, corpus_label in enumerate(('hw2', 'hw3', 'hw4', 'hw5'), start=1):
    a.add_notebooks(hw_notebooks[position], corpus_label)

# Instantiate the stages in the order they run. Every stage keeps its own
# module-level name; `ci` in particular is queried by the cells that follow.
gastf, rbn, gi = GetASTFeatures(), ResampleByNode(), GetImports()
fe, he, ci = FeatureEncoding(), HierarchicalEncoder(), CorpusIdentifier()
pipeline_stages = [gastf, rbn, gi, fe, he, ci]
pipe = Pipeline(pipeline_stages)

# Run the whole pipeline; `a` is rebound to whatever transform() returns.
a = pipe.transform(a)
In [ ]:
import sklearn
# Import PCA explicitly: a bare `import sklearn` does not guarantee that the
# `sklearn.decomposition` submodule has been loaded as an attribute.
from sklearn.decomposition import PCA

# Pull the data set out of the corpus identifier and re-weight it with the
# identifier's own tdidf helpers.
X, y = ci.get_data_set()
tdidf = ci.get_tdidf(X)
X_train = ci.transform_tdidf(X, tdidf)

# Project onto two principal components so the data can be plotted in 2-D.
pca = PCA(n_components=2)
# Densify with toarray() rather than todense(): todense() yields np.matrix,
# which recent scikit-learn versions refuse as estimator input.
# (Assumes X_train is a SciPy sparse matrix, as the original todense() call
# implies -- TODO confirm.)
X_trans = pca.fit_transform(X_train.toarray())
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
# Scatter the first two columns of X_trans against each other, coloured by y.
first_component, second_component = X_trans[:, 0], X_trans[:, 1]
plt.scatter(first_component, second_component, c=y)
In [ ]:
import sklearn
from sklearn import svm
from sklearn.metrics import accuracy_score

# Ask the corpus identifier for the train/test inputs and labels.
# NOTE(review): every classifier cell calls get_data_split() again; if the
# split is randomised the classifiers are not scored on the same data --
# confirm.
X_train_mat, X_test_mat, y_train, y_test = ci.get_data_split()

# Support-vector classifier using the one-vs-one decision function shape.
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(X_train_mat, y_train)

# Predictions stay in module-level names so the next cell can plot them.
y_train_res, y_test_res = clf.predict(X_train_mat), clf.predict(X_test_mat)
for caption, truth, predicted in (
    ('Train accuracy: ', y_train, y_train_res),
    ('Test accuracy: ', y_test, y_test_res),
):
    print(caption, accuracy_score(truth, predicted))
In [ ]:
from sklearn.metrics import confusion_matrix

# Two panels side by side: test-split confusion matrix on the left,
# train-split confusion matrix on the right.
fig, axes = plt.subplots(nrows=1, ncols=2)
test_panel, train_panel = axes
cm = confusion_matrix(y_true=y_test, y_pred=y_test_res)
test_panel.imshow(cm, interpolation='nearest')
cm = confusion_matrix(y_true=y_train, y_pred=y_train_res)
train_panel.imshow(cm, interpolation='nearest')
In [ ]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Ask the corpus identifier for the train/test inputs and labels.
X_train_mat, X_test_mat, y_train, y_test = ci.get_data_split()

# Decision tree with a fixed seed; fit() hands back the estimator itself,
# so construction and training can be chained.
clf = DecisionTreeClassifier(random_state=0).fit(X_train_mat, y_train)

# Predictions stay in module-level names so the next cell can plot them.
y_train_res = clf.predict(X_train_mat)
y_test_res = clf.predict(X_test_mat)

train_accuracy = accuracy_score(y_train, y_train_res)
print('Train accuracy: ', train_accuracy)
test_accuracy = accuracy_score(y_test, y_test_res)
print('Test accuracy: ', test_accuracy)
In [ ]:
from sklearn.metrics import confusion_matrix

# Two panels side by side: test-split confusion matrix on the left,
# train-split confusion matrix on the right.
fig, axes = plt.subplots(nrows=1, ncols=2)
test_panel, train_panel = axes
cm = confusion_matrix(y_true=y_test, y_pred=y_test_res)
test_panel.imshow(cm, interpolation='nearest')
cm = confusion_matrix(y_true=y_train, y_pred=y_train_res)
train_panel.imshow(cm, interpolation='nearest')
In [ ]:
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Ask the corpus identifier for the train/test inputs and labels.
X_train_mat, X_test_mat, y_train, y_test = ci.get_data_split()

# hidden_layer_sizes takes one entry per hidden layer. `(100)` is just the
# int 100, so the single-layer tuple is spelled out as `(100,)`.
# random_state is pinned, as in the decision-tree cell, so that the random
# weight initialisation (and therefore the reported accuracy) is reproducible.
clf = MLPClassifier(hidden_layer_sizes=(100,), random_state=0)
clf.fit(X_train_mat, y_train)

# Predictions stay in module-level names so the next cell can plot them.
y_train_res = clf.predict(X_train_mat)
y_test_res = clf.predict(X_test_mat)
print('Train accuracy: ', accuracy_score(y_train, y_train_res))
print('Test accuracy: ', accuracy_score(y_test, y_test_res))
In [ ]:
from sklearn.metrics import confusion_matrix

# Two panels side by side: test-split confusion matrix on the left,
# train-split confusion matrix on the right.
fig, axes = plt.subplots(nrows=1, ncols=2)
test_panel, train_panel = axes
cm = confusion_matrix(y_true=y_test, y_pred=y_test_res)
test_panel.imshow(cm, interpolation='nearest')
cm = confusion_matrix(y_true=y_train, y_pred=y_train_res)
train_panel.imshow(cm, interpolation='nearest')
In [ ]: