This notebook computes a set of summary statistics for the notebooks in the testbed.

I will start by loading the testbed notebooks.


In [1]:
import os
import numpy as np
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary

# Root of the testbed; each sub-directory is scanned for .ipynb files.
testbed_dir = '../testbed/Final'

# os.listdir returns entries in arbitrary order. Sort at both levels so that
# positional lookups further down (e.g. notebook_objs[0]) select the same
# notebook on every run and on every machine.
people = sorted(os.listdir(testbed_dir))
notebooks = []
for person in people:
    person_dir = os.path.join(testbed_dir, person)
    if not os.path.isdir(person_dir):
        continue  # ignore stray files sitting next to the per-person directories
    notebooks.extend(
        os.path.join(person_dir, filename)
        for filename in sorted(os.listdir(person_dir))
        if filename.endswith('.ipynb')
    )

# Build one NotebookMiner per discovered notebook path and keep them all.
# NOTE(review): write_to_file is handed the same path the notebook was loaded
# from -- presumably this rewrites the file in place; confirm that is intended.
notebook_objs = []
for nb_path in notebooks:
    miner = NotebookMiner(nb_path)
    notebook_objs += [miner]
    miner.write_to_file(nb_path)

In [3]:
# Pull every cell out of the first loaded notebook.
first_notebook = notebook_objs[0]
ac = first_notebook.get_all_cells()

In [6]:
# Walk the cells once: stash the source of Python cells, echo markdown cells.
list_of_source = []
for nb_cell in ac:
    if nb_cell.is_python():
        list_of_source += [nb_cell.get_source()]
    if nb_cell.is_markdown():
        print(nb_cell.get_source())


This notebook has been normalized

# Warm up:
# Data Wrangling:
By means of descriptive statistics and plots, show the different volume of engagement (e.g., number of favorites and retweets) that the accounts generate. Compute the results per year (to highlight the growth trends), per month (to figure out if the accounts follow the academic year), and per hour of the day (to find out if tweets posted at a certain hour get more attention). Similarly, break down the results per hashtag (e.g., #EPFLisAwesome) -- are there hashtags that are used more often than others, and that obtain more engagement than others?


# Machine Learning :

In [7]:
# Show the source text collected from the first Python cell.
list_of_source[0]


Out[7]:
"\n# coding: utf-8\n\n# In[ ]:\n\nimport pandas as pd\nimport numpy as np\nimport os\n\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn import metrics\n\nimport matplotlib.pyplot as plt\nget_ipython().magic('matplotlib inline')\n\n"

In [2]:
# Display the first NotebookMiner object (bare expression, so its repr is the cell output).
notebook_objs[0]


Out[2]:
<nbminer.notebook_miner.NotebookMiner at 0x1112d34e0>

Then, let's gather all the objects into a summary object and get the summary arrays for the group of notebooks.


In [2]:
# Wrap all NotebookMiner objects in one MultipleSummary, then unpack its full
# summaries into `header` (labels) and `val` (values); later cells index `val`
# by column and use `header` entries as plot titles.
# NOTE(review): the saved output of this cell is an AttributeError
# ('NotebookMiner' object has no attribute 'get_filename'), so the later cells
# that use `header`/`val` rely on an earlier, successful execution.
multiple = MultipleSummary(notebook_objs)
summaries = multiple.all_full_summaries()
header, val = summaries


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-2-50aa48ac75eb> in <module>()
----> 1 multiple = MultipleSummary(notebook_objs)
      2 header, val = multiple.all_full_summaries()

~/stanford/fall2017/snap/notebooks/nbminer/stats/multiple_summary.py in __init__(self, notebook_vec)
      8             notebook_vec = [notebook_vec]
      9         self.summary_vec = [Summary(nb) for nb in notebook_vec]
---> 10         self.summary_string_vec = [sm.full_statistics() for sm in self.summary_vec]
     11 
     12     def overall_summary(self):

~/stanford/fall2017/snap/notebooks/nbminer/stats/multiple_summary.py in <listcomp>(.0)
      8             notebook_vec = [notebook_vec]
      9         self.summary_vec = [Summary(nb) for nb in notebook_vec]
---> 10         self.summary_string_vec = [sm.full_statistics() for sm in self.summary_vec]
     11 
     12     def overall_summary(self):

~/stanford/fall2017/snap/notebooks/nbminer/stats/summary.py in full_statistics(self)
     18 
     19     def full_statistics(self):
---> 20         return_labels, return_arr = self.basic_statistics()
     21         return_labels += ['lines_of_code', 'words_of_code', 'chars_of_code', 'largest_code_cell', 'smallest_code_cell']
     22         return_arr_new = []

~/stanford/fall2017/snap/notebooks/nbminer/stats/summary.py in basic_statistics(self)
     10         return_labels = ['filename','num_cells','num_markdown','num_python','num_executed']
     11         return_arr = []
---> 12         return_arr.append(self.notebook.get_filename())
     13         return_arr.append(self.notebook.get_number_cells())
     14         return_arr.append(self.get_number_markdown_cells())

AttributeError: 'NotebookMiner' object has no attribute 'get_filename'

In [12]:
# Shape of the summary values returned by all_full_summaries().
val.shape


Out[12]:
(177, 10)

In [4]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [37]:
# One histogram per summary column, laid out on a 3x3 grid.
plt.rcParams['figure.figsize'] = (20, 20)
fig, axes = plt.subplots(3, 3)
# Column 0 is skipped (presumably the 'filename' label -- confirm); columns 1..9
# each get one panel. axes.flat walks the grid row by row, which matches the
# row = index // 3, col = index % 3 placement.
for column, ax in enumerate(axes.flat, start=1):
    ax.hist(val[:, column].astype(float), bins=20)
    ax.set_title(header[column])