First pass at combining Cells

Method: Combine any two cells that have a variable in common, then recurse on the combined cells. Result: For now, the result is the number of cells before the combination and the number of cells (combined sets) after it.



In [1]:

    
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary



In [2]:

    
# Load every .ipynb file found one directory level below the testbed root.
testbed_dir = '../testbed/Final'
# os.listdir returns entries in arbitrary order; sort both listings so that
# notebook indices (e.g. a.nb_features[0]) refer to the same notebook on every run.
people = sorted(os.listdir(testbed_dir))
notebooks = []
for person in people:
    person_dir = os.path.join(testbed_dir, person)
    # Only descend into sub-directories; plain files at the top level are ignored.
    if os.path.isdir(person_dir):
        notebooks.extend(
            os.path.join(person_dir, filename)
            for filename in sorted(os.listdir(person_dir))
            if filename.endswith('.ipynb')
        )
# One NotebookMiner per notebook path, all handed to a single ASTFeatures object.
notebook_objs = [NotebookMiner(path) for path in notebooks]
a = ASTFeatures(notebook_objs)

Now that we have everything loaded into an ASTFeatures variable, we can take a look at what happens when we apply the algorithm to the first notebook.



In [3]:

    
# Cell counts of the first notebook, before and after combining cells.
first_nb = a.nb_features[0]
cells_before = first_nb.get_number_cells()
print("Number of cells before combination:", cells_before)
cells_after = first_nb.get_num_combined_sets()
print("Number of cells after  combination:", cells_after)









    



Number of cells before combination: 23
Number of cells after  combination: 7

This doesn't look too bad at first glance, but once we discount the cells that don't have ANY variables (each of which remains a set of its own), it turns out we've ended up with just 2 sets of combined cells:



In [4]:

    
# Same counts for the first notebook, plus how many of its cells have no variables.
first_nb = a.nb_features[0]
for label, getter in (
    ("Number of cells before combination:", first_nb.get_number_cells),
    ("Number of cells after  combination:", first_nb.get_num_combined_sets),
    ("Number of cells without variables :", first_nb.get_number_cells_without_variables),
):
    print(label, getter())









    



Number of cells before combination: 23
Number of cells after  combination: 7
Number of cells without variables : 5

Ok, time to compile results on every notebook:



In [5]:

    
# Gather per-notebook statistics. The three lists are index-aligned with
# a.nb_features (entry k of each list belongs to notebook k).
number_before = []    # get_number_cells(): cell count before combination
resulting_sets = []   # get_combined_indices(): the combined sets of each notebook
no_variables = []     # get_number_cells_without_variables()
# The loop index was never used, so iterate over the notebooks directly.
for nb in a.nb_features:
    number_before.append(nb.get_number_cells())
    resulting_sets.append(nb.get_combined_indices())
    no_variables.append(nb.get_number_cells_without_variables())



In [6]:

    
# Number of combined sets in each notebook (one entry per notebook).
len_sets = list(map(len, resulting_sets))



In [ ]:



In [7]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['figure.figsize'] = (20, 10)
fig, axes = plt.subplots(1,3)
axes[0].hist(np.array(number_before),bins=10)
axes[0].set_title('Number of Cells in NB before')
axes[1].hist(np.array(len_sets),bins=10)
axes[1].set_title('Number of Cells in NB after')
axes[2].hist(np.array(len_sets)-np.array(no_variables),bins=10)
axes[2].set_title('Number of Cells in NB after (exclude blank)')
plt.show()



In [8]:

    
# Flatten the per-notebook sets into one list of set sizes (notebook order, then set order).
sizes_of_all_sets = [len(s) for s_vec in resulting_sets for s in s_vec]
# Same list restricted to sets holding more than one cell.
sizes_of_all_sets_without_singles = [size for size in sizes_of_all_sets if size > 1]

total_sets = len(sizes_of_all_sets)
total_without_variables = np.sum(np.array(no_variables))

print ("Number of total sets in all nbs: ", total_sets)
print ("Number of total sets in all nbs (exlude sets of 1): ", len(sizes_of_all_sets_without_singles))
print ("Single sets accounted for by cells without variables: ", total_without_variables)
print ("Number of total sets in all nbs (exlude blank sets): ", total_sets-total_without_variables)









    



Number of total sets in all nbs:  2729
Number of total sets in all nbs (exlude sets of 1):  294
Single sets accounted for by cells without variables:  2052
Number of total sets in all nbs (exlude blank sets):  677



In [9]:

    
# Histograms of combined-set sizes across all notebooks: every set (left) and
# only the sets containing more than one cell (right).
fig, axes = plt.subplots(1,2)
axes[0].hist(np.array(sizes_of_all_sets),bins=10)
axes[0].set_title('Number of cells in each set')
axes[1].hist(np.array(sizes_of_all_sets_without_singles),bins=10)
axes[1].set_title('Number of Cells in each set (exclude singles)')
# End with plt.show(), like the earlier histogram cell, so the cell renders just
# the figure instead of echoing the repr of the last set_title() call as Out[...].
plt.show()









    Out[9]:





<matplotlib.text.Text at 0x1149a07b8>

Why do cells not have variables?

To answer why a cell might not have variables, let's take a look at a sample.



In [19]:

    
# Cells without variables for the first five notebooks (fewer if fewer were loaded).
# Slicing instead of indexing with range(5) avoids an IndexError on small testbeds;
# this relies on a.nb_features supporting slicing like a list.
no_vars = [nb_feature.get_cells_without_variables() for nb_feature in a.nb_features[:5]]



In [20]:

    
# Print the 'code' feature of every sampled no-variable cell, each preceded by a
# 100-character row of asterisks as a separator.
separator = "*"*100
for cell_feature in (cell for nb_cells in no_vars for cell in nb_cells):
    print (separator)
    print (cell_feature.get_feature('code'))









    



****************************************************************************************************

****************************************************************************************************
# Warm up:
****************************************************************************************************
# Data Wrangling:
****************************************************************************************************

# coding: utf-8

# In[ ]:




****************************************************************************************************
# Machine Learning :
****************************************************************************************************

****************************************************************************************************

# coding: utf-8

# In[ ]:

# FOR EPFL FIRST select only the values that we need


****************************************************************************************************

# coding: utf-8

# In[ ]:

# ETH Whole Dataframe


****************************************************************************************************

# coding: utf-8

# In[ ]:

# I have already done the first part of data wrangling by reducing the dataframe to only the columns i possibly need


****************************************************************************************************

# coding: utf-8

# In[ ]:

# i drop userId cause its the same for all records


****************************************************************************************************

# coding: utf-8

# In[ ]:

# Until now I do not have much time since parsing the json file took most of my time so I am going to ML part 


****************************************************************************************************

# coding: utf-8

# In[ ]:

# id_str is going to be categorical data for ML


****************************************************************************************************

# coding: utf-8

# In[ ]:

from sklearn.neighbors import KNeighborsRegressor


****************************************************************************************************

# coding: utf-8

# In[ ]:

# This was just some trivial example


****************************************************************************************************

# coding: utf-8

# In[ ]:

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold as cross_validation_KFold
from sklearn.model_selection import KFold as model_selection_KFold


****************************************************************************************************

# coding: utf-8

# In[ ]:

# So with KFold Cross Validation we found out that the best K is 36 and 49 for eth (both small)


****************************************************************************************************

# coding: utf-8

# In[ ]:

# Also i think it would be important to have the time of day in the X, if it is morning or noon for example
# I would do that by categorizing the values of column date


****************************************************************************************************

# coding: utf-8

# In[ ]:




****************************************************************************************************

****************************************************************************************************
4.
****************************************************************************************************

# coding: utf-8

# In[ ]:

from gensim import corpora
import gensim


****************************************************************************************************

# coding: utf-8

# In[ ]:




****************************************************************************************************

****************************************************************************************************
# TASK 1: DATA WRANGLING
****************************************************************************************************

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


****************************************************************************************************
# TASK 2: DESCRIPTIVE STATISTICS AND PLOTS
****************************************************************************************************
# TASK 3: Regression
****************************************************************************************************

# coding: utf-8

# In[ ]:




****************************************************************************************************

****************************************************************************************************

# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import string
import re
import operator
from gensim import corpora, models, similarities
from gensim.models import hdpmodel, ldamodel
import nltk
from nltk.probability import FreqDist


****************************************************************************************************

# coding: utf-8

# In[ ]:




****************************************************************************************************
## YEAR
****************************************************************************************************
### ETH
****************************************************************************************************
## MONTH
****************************************************************************************************
### EPFL
****************************************************************************************************
### ETH
****************************************************************************************************
### EPFL
****************************************************************************************************
### ETH
****************************************************************************************************

# coding: utf-8

# In[ ]:




****************************************************************************************************

# coding: utf-8

# In[ ]:




****************************************************************************************************
### EPFL
****************************************************************************************************
### ETH



In [ ]: