This notebook details the process of predicting which homework assignment a notebook came from, after featurizing the notebooks using the bottom-up method. This is done by gathering all templates in each notebook after running the algorithm, featurizing the notebooks with CountVectorizer, and finally using a random forest to make the prediction.
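For orientation, the final featurize-and-classify step follows the standard scikit-learn pattern. Below is a minimal sketch of that step, using hypothetical template tokens and labels rather than the homework data:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Each notebook is reduced to a space-separated string of template tokens.
docs = ['tmpl_a tmpl_b tmpl_a', 'tmpl_c tmpl_b', 'tmpl_a tmpl_c', 'tmpl_b tmpl_c']
labels = [0, 0, 1, 1]  # hypothetical homework ids

vec = CountVectorizer()
features = vec.fit_transform(docs)  # bag-of-templates count matrix
clf = RandomForestClassifier(n_estimators=400, max_depth=4)
print(cross_val_score(clf, features, labels, cv=2))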
In [1]:
import sys
home_directory = '/dfs/scratch2/fcipollone'
sys.path.append(home_directory)
import numpy as np
from nbminer.notebook_miner import NotebookMiner
hw_filenames = np.load('../homework_names_jplag_combined_per_student.npy')
hw_notebooks = [[NotebookMiner(filename) for filename in temp[:59]] for temp in hw_filenames]
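(Only the first 59 notebooks of each assignment are kept, presumably so that the six classes are the same size.)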
In [3]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
a = Features(hw_notebooks[0], 'hw0')
a.add_notebooks(hw_notebooks[1], 'hw1')
a.add_notebooks(hw_notebooks[2], 'hw2')
a.add_notebooks(hw_notebooks[3], 'hw3')
a.add_notebooks(hw_notebooks[4], 'hw4')
a.add_notebooks(hw_notebooks[5], 'hw5')
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
agr = ASTGraphReducer(a, threshold=8, split_call=True)
fi = FrequentItemsets()
pipe = Pipeline([gastf, rbn, gi, agr, fi])
a = pipe.transform(a)
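Judging by the class names, this pipeline extracts AST features from each cell, resamples them to top-level node granularity, records imports, reduces the AST graphs into templates (with the given threshold and call-splitting), and finally mines frequent itemsets of those templates; those itemsets are what the rest of the notebook consumes.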
In [4]:
notebook_patterns = {}
for bucket in fi.buckets:
    name = None
    for cell in bucket.items:
        # take the notebook name from the bucket's cells (the last one seen wins)
        name = cell.get_feature('notebook_name')
    if name not in notebook_patterns:
        notebook_patterns[name] = []
    notebook_patterns[name].append(bucket.get_patterns())
In [5]:
notebook_itemsets = {}
for key in notebook_patterns.keys():
    itemsets = []
    for cell in notebook_patterns[key]:
        itemsets.extend(cell)
    notebook_itemsets[key] = set(itemsets)
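From the way these are consumed later (each element is '_'-joined into a single token), each pattern is a tuple of template identifiers, so notebook_itemsets maps each notebook to the set of distinct itemsets that occur in it.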
In [6]:
keys = list(notebook_itemsets.keys())
print(len(notebook_itemsets[keys[4]]))
In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist([len(notebook_itemsets[key]) for key in notebook_itemsets.keys()])
Out[7]:
In [8]:
total_set = []
for key in notebook_itemsets.keys():
    total_set.extend(notebook_itemsets[key])
print(len(set(total_set)))
In [9]:
hw_filenames = np.load('../homework_names_jplag_combined_per_student.npy')
hw_notebooks = [[NotebookMiner(filename) for filename in temp[:59]] for temp in hw_filenames]
a = Features(hw_notebooks[0], 'hw0')
a.add_notebooks(hw_notebooks[1], 'hw1')
a.add_notebooks(hw_notebooks[2], 'hw2')
a.add_notebooks(hw_notebooks[3], 'hw3')
a.add_notebooks(hw_notebooks[4], 'hw4')
a.add_notebooks(hw_notebooks[5], 'hw5')
In [10]:
len(a.notebooks)
Out[10]:
In [11]:
nbcorpus = {}
for i, corp in enumerate(hw_filenames):
    for fname in corp:
        # map each notebook filename to its homework index
        nbcorpus[fname] = int(i)
print(nbcorpus.values())
In [12]:
X = []
y = []
for key in notebook_itemsets.keys():
    if key in nbcorpus:
        # one token per itemset: the template identifiers joined by underscores
        X.append(['_'.join(el) for el in notebook_itemsets[key]])
        y.append(nbcorpus[key])
In [13]:
import tqdm
similarities = np.zeros((len(X), len(X)))
item_sets = [set(x) for x in X]  # precompute so sets aren't rebuilt in the inner loop
for i in tqdm.tqdm(range(len(X))):
    for j in range(len(X)):
        union = item_sets[i] | item_sets[j]
        if len(union) == 0:
            continue
        # Jaccard similarity between the two notebooks' sets of itemset tokens
        similarities[i][j] = len(item_sets[i] & item_sets[j]) / len(union)

def get_avg_inter_intra_sims(X, y, val):
    inter_sims = []
    intra_sims = []
    for i in range(len(X)):
        for j in range(i + 1, len(X)):
            if y[i] == y[j] and y[i] == val:
                intra_sims.append(similarities[i][j])
            else:
                inter_sims.append(similarities[i][j])
    return np.array(intra_sims), np.array(inter_sims)

for i in np.unique(y):
    intra_sims, inter_sims = get_avg_inter_intra_sims(X, y, i)
    print('Mean intra similarity for hw', i, 'is', np.mean(intra_sims), 'with std', np.std(intra_sims))
    print('Mean inter similarity for hw', i, 'is', np.mean(inter_sims), 'with std', np.std(inter_sims))
    print('----')
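The similarity used above is the Jaccard index, J(A, B) = |A ∩ B| / |A ∪ B|. A toy example with made-up tokens:

a = {'tmpl_a', 'tmpl_b'}
b = {'tmpl_b', 'tmpl_c'}
print(len(a & b) / len(a | b))  # 0.33...: one shared token out of three distinct tokens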
In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 5, 10
def get_all_sims(X, y, val):
    # every pairwise similarity involving at least one notebook from homework `val`
    sims = []
    for i in range(len(X)):
        for j in range(i + 1, len(X)):
            if y[i] == val or y[j] == val:
                sims.append(similarities[i][j])
    return sims

fig, axes = plt.subplots(6)
for i in range(6):
    axes[i].hist(get_all_sims(X, y, i), bins=30)
In [15]:
tot = []
for el in X:
    tot.extend(el)
print(len(set(tot)))
In [16]:
print(X[0])
In [17]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
countvec = CountVectorizer()
X_list = [" ".join(el) for el in X]  # one space-separated document per notebook
countvec.fit(X_list)
X = countvec.transform(X_list)
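To make the tokenization concrete: each itemset (a tuple of template names) becomes one underscore-joined token, and a notebook's tokens are space-joined into a single document for CountVectorizer. A sketch with hypothetical values:

itemsets = {('tmpl_a', 'tmpl_b'), ('tmpl_c',)}
tokens = ['_'.join(el) for el in itemsets]  # e.g. ['tmpl_a_tmpl_b', 'tmpl_c']
doc = ' '.join(tokens)                      # one document per notebook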
In [18]:
X.shape
Out[18]:
In [19]:
# shuffle the rows so cross-validation folds don't follow the homework ordering
p = np.random.permutation(X.shape[0])
X = X.todense()[p]
y = np.array(y)[p]
clf = RandomForestClassifier(n_estimators=400, max_depth=4)
scores = cross_val_score(clf, X, y, cv=10)
print(scores)
print(np.mean(scores))
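An equivalent way to get shuffled folds without permuting the data by hand is to pass a shuffling splitter to cross_val_score; a sketch using scikit-learn's StratifiedKFold:

from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
scores = cross_val_score(clf, X, y, cv=cv)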
In [20]:
# note: the rows predicted below (X[300:]) overlap this training slice
clf.fit(X[:360], y[:360])
Out[20]:
In [21]:
clf.predict(X[300:])
Out[21]:
In [22]:
y[300:]
Out[22]:
In [23]:
# in-sample accuracy: the classifier was fit on these same rows above
print(np.sum(clf.predict(X[:360]) == y[:360]) / len(y[:360]))