Prediction using the Frequent Itemsets generated from Bottom-Up Features

This notebook details the process of predicting which homework assignment a notebook came from, after featurizing the notebook using the bottom-up method. This is done by gathering all templates in each notebook after running the algorithm, featurizing the notebooks with CountVectorizer, and finally using a random forest to make the prediction.
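
For orientation, the classification step at the end is standard bag-of-words learning. Below is a minimal, self-contained sketch of that step, with made-up tokens standing in for the mined templates (the real tokens are produced by the nbminer pipeline later in this notebook):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Toy stand-ins for the pattern tokens mined from real notebooks.
docs = ["template_1 template_48 template_290",
        "template_1 template_48",
        "template_7710 template_2165"]
labels = [0, 0, 1]  # which homework each toy "notebook" came from

vec = CountVectorizer()
X_toy = vec.fit_transform(docs)  # sparse token-count matrix
clf = RandomForestClassifier(n_estimators=10).fit(X_toy, labels)
print(clf.predict(vec.transform(["template_48 template_1"])))  # should predict class 0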


In [1]:
import sys
home_directory = '/dfs/scratch2/fcipollone'
sys.path.append(home_directory)
import numpy as np
from nbminer.notebook_miner import NotebookMiner

# Take the first 59 submissions from each of the six homeworks (6 * 59 = 354 notebooks).
hw_filenames = np.load('../homework_names_jplag_combined_per_student.npy')
hw_notebooks = [[NotebookMiner(filename) for filename in temp[:59]] for temp in hw_filenames]

In [3]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.freq_itemsets.frequent_itemsets import FrequentItemsets
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
a = Features(hw_notebooks[0], 'hw0')
a.add_notebooks(hw_notebooks[1], 'hw1')
a.add_notebooks(hw_notebooks[2], 'hw2')
a.add_notebooks(hw_notebooks[3], 'hw3')
a.add_notebooks(hw_notebooks[4], 'hw4')
a.add_notebooks(hw_notebooks[5], 'hw5')
gastf = GetASTFeatures()   # parse each cell into AST-based features
rbn = ResampleByNode()     # resample the corpus at the AST-node level
gi = GetImports()          # record the imports each notebook uses
agr = ASTGraphReducer(a, threshold=8, split_call=True)  # reduce ASTs to templates
fi = FrequentItemsets()    # mine frequent itemsets over the templates
pipe = Pipeline([gastf, rbn, gi, agr, fi])
a = pipe.transform(a)


<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x7f68ecab85f8>
354
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x7f6901b8f5c0>
354
<nbminer.preprocess.get_imports.GetImports object at 0x7f68ecab8940>
354
<nbminer.encoders.ast_graph.ast_graph.ASTGraphReducer object at 0x7f68ecab8be0>
354
<nbminer.freq_itemsets.frequent_itemsets.FrequentItemsets object at 0x7f68eca68fd0>
354
27003
0.5155630111694336
100%|██████████| 1849/1849 [04:20<00:00,  7.09it/s]
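
Each stage of the nbminer Pipeline receives the Features object, transforms it, and hands it to the next stage; the object reprs and the repeated notebook count of 354 above are printed as the stages run. Below is a minimal sketch of that pattern, a hypothetical reimplementation assuming only that each stage exposes a transform method (which matches how the real Pipeline is used here):

class SimplePipeline:
    """Hypothetical sketch of a transform pipeline like nbminer's."""
    def __init__(self, stages):
        self.stages = stages

    def transform(self, features):
        for stage in self.stages:
            print(stage)                    # the object reprs seen in the log above
            features = stage.transform(features)
            print(len(features.notebooks))  # the repeated "354"
        return features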


In [4]:
# Group each bucket's patterns under the notebook it came from.
notebook_patterns = {}
for bucket in fi.buckets:
    name = None
    for cell in bucket.items:
        name = cell.get_feature('notebook_name')  # take the name from the bucket's cells (last one wins)
    if name not in notebook_patterns:
        notebook_patterns[name] = []
    notebook_patterns[name].append(bucket.get_patterns())

In [5]:
# Flatten each notebook's per-bucket pattern lists into one deduplicated itemset.
notebook_itemsets = {}
for key in notebook_patterns:
    itemsets = []
    for cell in notebook_patterns[key]:
        itemsets.extend(cell)
    notebook_itemsets[key] = set(itemsets)

In [6]:
# Spot-check the itemset size of one notebook.
keys = list(notebook_itemsets.keys())
print(len(notebook_itemsets[keys[4]]))


40

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
# Distribution of itemset sizes across notebooks.
plt.hist([len(itemset) for itemset in notebook_itemsets.values()])


Out[7]:
(array([110., 101.,   7.,   4.,  47.,  32.,  42.,   3.,   5.,   3.]),
 array([  1. ,  35.3,  69.6, 103.9, 138.2, 172.5, 206.8, 241.1, 275.4,
        309.7, 344. ]),
 <a list of 10 Patch objects>)

In [8]:
# Count the distinct patterns pooled across all notebooks.
total_set = []
for itemset in notebook_itemsets.values():
    total_set.extend(itemset)
print(len(set(total_set)))


1293

In [9]:
# Reload the raw notebooks and rebuild the Features object from scratch.
hw_filenames = np.load('../homework_names_jplag_combined_per_student.npy')
hw_notebooks = [[NotebookMiner(filename) for filename in temp[:59]] for temp in hw_filenames]
a = Features(hw_notebooks[0], 'hw0')
a.add_notebooks(hw_notebooks[1], 'hw1')
a.add_notebooks(hw_notebooks[2], 'hw2')
a.add_notebooks(hw_notebooks[3], 'hw3')
a.add_notebooks(hw_notebooks[4], 'hw4')
a.add_notebooks(hw_notebooks[5], 'hw5')

In [10]:
len(a.notebooks)


Out[10]:
354

In [11]:
# Map each notebook filename to the index of the homework it belongs to.
nbcorpus = {}
for i, corp in enumerate(hw_filenames):
    for fname in corp:
        nbcorpus[fname] = i
print(nbcorpus.values())


dict_values([3, 1, 2, 3, 2, 0, 0, 0, 0, 3, 2, 0, 3, 4, 0, 1, 1, 0, 2, 0, 1, 5, 0, 0, 3, 2, 1, 3, 2, 3, 0, 3, 3, 5, 3, 1, 5, 3, 1, 2, 4, 1, 4, 4, 2, 1, 1, 0, 2, 1, 3, 3, 2, 1, 0, 1, 4, 1, 3, 2, 0, 5, 3, 5, 5, 2, 2, 4, 0, 3, 1, 5, 3, 3, 3, 3, 0, 2, 1, 3, 5, 5, 1, 2, 4, 4, 5, 0, 4, 1, 2, 3, 5, 5, 2, 5, 0, 5, 5, 2, 4, 1, 4, 1, 4, 2, 0, 5, 3, 5, 0, 0, 5, 0, 5, 2, 1, 3, 3, 1, 1, 4, 4, 0, 0, 4, 0, 5, 4, 3, 1, 4, 1, 2, 5, 4, 3, 5, 4, 3, 5, 5, 3, 0, 0, 3, 1, 4, 1, 2, 0, 4, 0, 4, 4, 2, 4, 1, 1, 4, 4, 2, 2, 2, 3, 4, 1, 5, 2, 2, 1, 3, 5, 5, 0, 2, 4, 3, 5, 0, 4, 4, 5, 1, 5, 2, 4, 1, 0, 3, 3, 0, 3, 3, 3, 3, 5, 0, 2, 5, 4, 5, 5, 4, 2, 2, 0, 3, 2, 3, 2, 0, 1, 5, 0, 4, 1, 4, 4, 2, 4, 3, 0, 0, 1, 2, 5, 0, 5, 1, 3, 1, 3, 2, 5, 4, 1, 5, 2, 4, 2, 3, 4, 4, 5, 5, 0, 3, 1, 1, 4, 1, 5, 4, 4, 3, 5, 1, 1, 1, 1, 2, 3, 3, 1, 0, 2, 0, 1, 5, 0, 5, 5, 4, 2, 4, 2, 4, 4, 5, 1, 0, 5, 5, 0, 5, 3, 5, 3, 2, 4, 2, 5, 2, 4, 5, 4, 2, 1, 3, 0, 2, 5, 5, 5, 5, 3, 4, 5, 1, 4, 2, 1, 4, 3, 4, 1, 1, 2, 3, 5, 2, 0, 2, 1, 1, 5, 3, 4, 4, 0, 4, 5, 1, 2, 0, 3, 4, 2, 3, 0, 1, 4, 3, 3, 0, 4, 0, 4, 0, 1, 0, 2, 5, 0, 3, 2, 2, 2, 2, 0, 3, 2, 5, 0, 4, 3, 1, 0])


In [12]:
# One string token per frequent itemset: the itemset's templates joined with '_'.
X = []
y = []
for key in notebook_itemsets:
    if key in nbcorpus:
        X.append(['_'.join(el) for el in notebook_itemsets[key]])
        y.append(nbcorpus[key])
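
The next cell measures how similar two notebooks are via the Jaccard similarity of their itemsets, J(A, B) = |A ∩ B| / |A ∪ B|, which is 1 for identical sets and 0 for disjoint ones. As a standalone helper, the computation is just:

def jaccard(a, b):
    """Jaccard similarity of two sets: |A ∩ B| / |A ∪ B| (0 when both are empty)."""
    a, b = set(a), set(b)
    union = a | b
    return len(a & b) / len(union) if union else 0.0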

In [13]:
import tqdm

# Pairwise Jaccard similarity between notebooks' itemsets (build the sets once).
item_sets = [set(x) for x in X]
similarities = np.zeros((len(X), len(X)))
for i in tqdm.tqdm(range(len(X))):
    for j in range(len(X)):
        union = item_sets[i] | item_sets[j]
        if len(union) == 0:
            continue
        similarities[i][j] = len(item_sets[i] & item_sets[j]) / len(union)

def get_avg_inter_intra_sims(X, y, val):
    # "Intra" pairs are both from homework val; every other pair (including pairs
    # drawn entirely from other homeworks) counts as "inter", which is why the
    # inter baseline below is nearly identical across homeworks.
    inter_sims = []
    intra_sims = []
    for i in range(len(X)):
        for j in range(i+1, len(X)):
            if y[i] == y[j] and y[i] == val:
                intra_sims.append(similarities[i][j])
            else:
                inter_sims.append(similarities[i][j])
    return np.array(intra_sims), np.array(inter_sims)

for i in np.unique(y):
    intra_sims, inter_sims = get_avg_inter_intra_sims(X, y, i)
    print('Mean intra similarity for hw',i,'is',np.mean(intra_sims),'with std',np.std(intra_sims))
    print('Mean inter similarity for hw',i,'is',np.mean(inter_sims),'with std',np.std(inter_sims))
    print('----')


100%|██████████| 353/353 [00:08<00:00, 43.68it/s]
Mean intra similarity for hw 0 is 0.8499075694885955 with std 0.14604565901974068
Mean inter similarity for hw 0 is 0.10959019269215617 with std 0.13946254474506906
----
Mean intra similarity for hw 1 is 0.7658213356389376 with std 0.2356592897692731
Mean inter similarity for hw 1 is 0.1119715016965219 with std 0.14765174126047814
----
Mean intra similarity for hw 2 is 0.1878730021144792 with std 0.06366260470895117
Mean inter similarity for hw 2 is 0.12839600580044233 with std 0.1868395883390528
----
Mean intra similarity for hw 3 is 0.18796662480229873 with std 0.06349435499187157
Mean inter similarity for hw 3 is 0.1283362568191114 with std 0.1869105244270893
----
Mean intra similarity for hw 4 is 0.17053893656304794 with std 0.11783336098712888
Mean inter similarity for hw 4 is 0.1288298062270158 with std 0.1862968599229316
----
Mean intra similarity for hw 5 is 0.16498506747397476 with std 0.11626780821958488
Mean inter similarity for hw 5 is 0.12898709093184063 with std 0.18635750941418033
----
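
hw0 and hw1 stand out here: their intra-homework means (about 0.85 and 0.77) dwarf the roughly 0.11 inter baseline, while for hw2 through hw5 the intra means (about 0.16 to 0.19) sit only modestly above it, so those four assignments are much harder to separate on itemset overlap alone.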

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 5, 10
def get_all_sims(X, y, val):
    # Similarities for every pair in which at least one notebook is from homework val.
    sims = []
    for i in range(len(X)):
        for j in range(i+1, len(X)):
            if y[i] == val or y[j] == val:
                sims.append(similarities[i][j])
    return sims
fig, axes = plt.subplots(6)
for i in range(6):
    axes[i].hist(get_all_sims(X,y,i), bins=30)



In [15]:
# Sanity check: the number of distinct tokens matches the 1293 distinct itemsets found earlier.
tot = []
for el in X:
    tot.extend(el)
print(len(set(tot)))


1293

In [16]:
print(X[0])


['template_194_template_290', 'template_4923', 'template_2715', 'template_6921', 'template_2165', 'template_194_template_287', 'template_194_template_48', 'template_1_template_48', 'template_5024', 'template_1042', 'template_194', 'template_48', 'template_287_template_48', 'template_290', 'template_7280', 'template_2169', 'template_1_template_48_template_7710', 'template_48_template_7710', 'template_7710', 'template_287', 'template_288', '', 'template_1_template_7710', 'template_2263', 'template_194_template_2169', 'template_1']

In [17]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Join each notebook's tokens into one document and featurize with bag-of-words counts.
countvec = CountVectorizer()
X_list = [" ".join(el) for el in X]
countvec.fit(X_list)
X = countvec.transform(X_list)
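
For reference, CountVectorizer's default tokenizer keeps runs of two or more word characters (underscores included), so each joined template string becomes a single vocabulary entry. A quick illustration of its behavior:

from sklearn.feature_extraction.text import CountVectorizer

demo = CountVectorizer()
counts = demo.fit_transform(["template_1 template_48 template_1", "template_48"])
print(sorted(demo.vocabulary_))  # ['template_1', 'template_48']
print(counts.toarray())          # [[2 1]
                                 #  [0 1]]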

In [18]:
X.shape


Out[18]:
(353, 1292)
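
Note the 1292 features versus the 1293 distinct itemsets counted above: one of the "itemsets" is the empty string (visible in the X[0] printout), which CountVectorizer's default token pattern discards.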


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Shuffle the rows so the homework labels are mixed across the cross-validation folds.
p = np.random.permutation(X.shape[0])
X = X.todense()[p]
y = np.array(y)[p]
clf = RandomForestClassifier(n_estimators=400, max_depth=4)
scores = cross_val_score(clf, X, y, cv=10)
print(scores)
print(np.mean(scores))


[0.83333333 0.86111111 0.94444444 0.83333333 0.88888889 0.97222222
 0.91666667 0.91666667 0.88571429 0.9       ]
0.8952380952380953

In [20]:
# There are only 353 rows, so [:360] fits the classifier on the full dataset.
clf.fit(X[:360], y[:360])


Out[20]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
clf.predict(X[300:])


Out[21]:
array([2, 1, 1, 1, 0, 4, 2, 2, 1, 5, 1, 1, 1, 2, 1, 3, 3, 2, 2, 4, 3, 0,
       3, 0, 0, 4, 4, 3, 5, 4, 2, 2, 5, 1, 0, 2, 1, 5, 3, 5, 1, 5, 4, 5,
       1, 2, 0, 1, 4, 4, 2, 3, 1])

In [22]:
y[300:]


Out[22]:
array([2, 1, 1, 1, 0, 4, 3, 5, 1, 5, 1, 1, 1, 2, 1, 3, 3, 5, 2, 4, 3, 0,
       3, 0, 0, 4, 4, 3, 5, 4, 2, 2, 5, 1, 0, 2, 1, 5, 3, 5, 1, 5, 4, 5,
       1, 2, 0, 1, 4, 4, 2, 3, 1])
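
Comparing the two arrays, 50 of these 53 rows match; the three misses (positions 6, 7, and 17) are true hw3/hw5 notebooks predicted as hw2.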

In [23]:
# Training accuracy: the classifier above was fit on all 353 rows.
print(np.sum(clf.predict(X[:360]) == y[:360]) / len(y[:360]))


0.9235127478753541
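
Since the model was fit on the full dataset, this 0.924 is training accuracy; the shallow max_depth=4 trees keep it close to the 10-fold cross-validated mean of roughly 0.90 rather than letting it climb to 1.0.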