In [1]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
In [2]:
# Load every notebook stored one directory level below the testbed folder,
# i.e. files matching <TESTBED_DIR>/<person>/<name>.ipynb
TESTBED_DIR = '../testbed/Final'

# os.listdir() returns entries in arbitrary order; sorting both listings keeps
# positional lookups such as a.nb_features[0] pointing at the same notebook on
# every run and on every machine.
people = sorted(os.listdir(TESTBED_DIR))
notebooks = []
for person in people:
    person_dir = os.path.join(TESTBED_DIR, person)
    if not os.path.isdir(person_dir):
        continue  # ignore plain files sitting next to the per-person folders
    notebooks.extend(
        os.path.join(person_dir, filename)
        for filename in sorted(os.listdir(person_dir))
        if filename.endswith('.ipynb')
    )

# One NotebookMiner per notebook path, all handed to a single ASTFeatures object.
notebook_objs = [NotebookMiner(path) for path in notebooks]
a = ASTFeatures(notebook_objs)
Now that we have everything loaded into an ASTFeatures variable, we can take a look at what happens when we apply the algorithm to the first notebook:
In [3]:
# Report the first notebook's cell count and its number of combined sets.
first_nb = a.nb_features[0]
cells_before = first_nb.get_number_cells()
print("Number of cells before combination:", cells_before)
cells_after = first_nb.get_num_combined_sets()
print("Number of cells after combination:", cells_after)
This doesn't look too bad at first glance, but when we consider the cells that don't have ANY variables, it turns out we've ended up with just 2 cells:
In [4]:
# First-notebook counts again, this time including the number of cells that
# contain no variables.
first_nb = a.nb_features[0]
report = (
    ("Number of cells before combination:", first_nb.get_number_cells),
    ("Number of cells after combination:", first_nb.get_num_combined_sets),
    ("Number of cells without variables :", first_nb.get_number_cells_without_variables),
)
# Each getter is called right before its line is printed, one after another.
for label, getter in report:
    print(label, getter())
Ok, time to compile results on every notebook:
In [5]:
# Gather the same three measurements for every notebook in a.nb_features.
# The three lists are index-aligned: position k in each list describes the
# k-th notebook.
number_before = []   # get_number_cells() per notebook
resulting_sets = []  # get_combined_indices() per notebook
no_variables = []    # get_number_cells_without_variables() per notebook
for nb in a.nb_features:  # plain iteration: the enumerate() index was never used
    number_before.append(nb.get_number_cells())
    resulting_sets.append(nb.get_combined_indices())
    no_variables.append(nb.get_number_cells_without_variables())
In [6]:
# Number of combined sets found in each notebook.
len_sets = list(map(len, resulting_sets))
In [ ]:
In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['figure.figsize'] = (20, 10)
fig, axes = plt.subplots(1,3)
axes[0].hist(np.array(number_before),bins=10)
axes[0].set_title('Number of Cells in NB before')
axes[1].hist(np.array(len_sets),bins=10)
axes[1].set_title('Number of Cells in NB after')
axes[2].hist(np.array(len_sets)-np.array(no_variables),bins=10)
axes[2].set_title('Number of Cells in NB after (exclude blank)')
plt.show()
In [8]:
# Flatten the per-notebook results into a single list holding the size of
# every set, across all notebooks.
sizes_of_all_sets = [len(s) for s_vec in resulting_sets for s in s_vec]
# The same sizes, restricted to sets with more than one member.
sizes_of_all_sets_without_singles = [size for size in sizes_of_all_sets if size > 1]

total_sets = len(sizes_of_all_sets)
total_multi_sets = len(sizes_of_all_sets_without_singles)
total_no_variables = np.sum(np.array(no_variables))

print("Number of total sets in all nbs: ", total_sets)
print("Number of total sets in all nbs (exlude sets of 1): ", total_multi_sets)
print("Single sets accounted for by cells without variables: ", total_no_variables)
print("Number of total sets in all nbs (exlude blank sets): ", total_sets-total_no_variables)
In [9]:
# Histograms of set sizes over all notebooks: every set on the left, only
# sets with more than one member on the right.
fig, axes = plt.subplots(1,2)
axes[0].hist(np.array(sizes_of_all_sets),bins=10)
axes[0].set_title('Number of cells in each set')
axes[1].hist(np.array(sizes_of_all_sets_without_singles),bins=10)
axes[1].set_title('Number of Cells in each set (exclude singles)')
# End with plt.show(), like the earlier figure cell, so the cell renders the
# figure instead of echoing the return value of the last set_title() call.
plt.show()
Out[9]:
In [19]:
# Collect the cells without variables from the first few notebooks.
# zip() stops at the shorter input, so loading fewer than NUM_SAMPLE_NOTEBOOKS
# notebooks just gives a shorter list instead of failing on a missing index.
NUM_SAMPLE_NOTEBOOKS = 5
no_vars = [
    nb.get_cells_without_variables()
    for _idx, nb in zip(range(NUM_SAMPLE_NOTEBOOKS), a.nb_features)
]
In [20]:
# Print the 'code' feature of every collected cell, each one preceded by a
# line of 100 asterisks as a visual separator.
separator = "*"*100
for cells in no_vars:
    for cell_features in cells:
        print(separator)
        print(cell_features.get_feature('code'))
In [ ]: