notebook.community

Edit and run



In [114]:

    
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary



In [115]:

    
#Loading in the notebooks
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]
a = ASTFeatures(notebook_objs)



In [116]:

    
cells = a.get_list_segments()
cells[0]









    Out[116]:





<nbminer.features.featurize.cell_features.SegmentFeatures at 0x12454e5f8>



In [123]:

    
itemsets = [cell.get_feature('nodes') for cell in cells]

print (itemsets[1])
print (len(itemsets))









    



[(<class '_ast.Import'>, 4), (<class '_ast.ImportFrom'>, 2), (<class '_ast.Expr'>, 1)]
9366



In [118]:

    
possible_keys = set()
for nodes in itemsets:
    for node in nodes:
        if node[0] not in possible_keys:
            possible_keys.add(node[0])
classes = {}
class_lookup = {}
for i, key in enumerate(possible_keys):
    classes[key] = i
    class_lookup[i] = key



In [119]:

    
class_itemsets = []
for itemset in itemsets:
    temp = []
    for el in itemset:
        temp.append(classes[el[0]])
    class_itemsets.append(temp)



In [120]:

    
print (class_itemsets[1])









    



[10, 8, 4]



In [194]:

    
import pyfpgrowth
patterns = pyfpgrowth.find_frequent_patterns(class_itemsets, 10)



In [195]:

    
for pattern in patterns:
    if len(pattern) > 2:
        print ([class_lookup[pattern[i]] for i in range(len(pattern))])









    



[<class '_ast.FunctionDef'>, <class '_ast.ImportFrom'>, <class '_ast.Import'>]
[<class '_ast.Assign'>, <class '_ast.ImportFrom'>, <class '_ast.Import'>]
[<class '_ast.Assign'>, <class '_ast.Expr'>, <class '_ast.ImportFrom'>, <class '_ast.Import'>]
[<class '_ast.Assign'>, <class '_ast.Expr'>, <class '_ast.Import'>]
[<class '_ast.Expr'>, <class '_ast.ImportFrom'>, <class '_ast.Import'>]
[<class '_ast.Assign'>, <class '_ast.For'>, <class '_ast.ImportFrom'>]
[<class '_ast.FunctionDef'>, <class '_ast.Assign'>, <class '_ast.ImportFrom'>]
[<class '_ast.Assign'>, <class '_ast.Expr'>, <class '_ast.ImportFrom'>]
[<class '_ast.Assign'>, <class '_ast.Expr'>, <class '_ast.For'>]
[<class '_ast.FunctionDef'>, <class '_ast.Assign'>, <class '_ast.Expr'>]



In [127]:

    
second_itemsets = [np.unique(np.array(cell.get_feature('short_name_string'))) for cell in cells]



In [212]:

    
patterns = pyfpgrowth.find_frequent_patterns(second_itemsets, 20)



In [213]:

    
for pattern in patterns:
    if len(pattern) > 2:
        print (pattern)









    



('get_ipython', 'magic', 'set_context')
('fit', 'mean_squared_error', 'predict')
('arange', 'bar', 'show')
('axis', 'figure', 'show')
('axis', 'figure', 'imshow')
('axis', 'figure', 'imshow', 'show')
('axis', 'generate', 'join')
('WordCloud', 'axis', 'join')
('axis', 'imshow', 'join')
('WordCloud', 'axis', 'generate', 'join')
('axis', 'generate', 'imshow', 'join')
('WordCloud', 'axis', 'imshow', 'join')
('WordCloud', 'axis', 'generate', 'imshow', 'join')
('axis', 'generate', 'show')
('WordCloud', 'axis', 'show')
('axis', 'imshow', 'show')
('WordCloud', 'axis', 'generate', 'show')
('axis', 'generate', 'imshow', 'show')
('WordCloud', 'axis', 'imshow', 'show')
('WordCloud', 'axis', 'generate', 'imshow', 'show')
('WordCloud', 'axis', 'generate')
('axis', 'generate', 'imshow')
('WordCloud', 'axis', 'generate', 'imshow')
('WordCloud', 'axis', 'imshow')
('barplot', 'figure', 'groupby')
('RandomForestRegressor', 'fit', 'predict')
('figure', 'generate', 'imshow')
('WordCloud', 'figure', 'generate')
('WordCloud', 'figure', 'generate', 'imshow')
('generate', 'imshow', 'show')
('WordCloud', 'generate', 'show')
('WordCloud', 'generate', 'imshow', 'show')
('generate', 'imshow', 'join')
('WordCloud', 'generate', 'join')
('WordCloud', 'generate', 'imshow', 'join')
('WordCloud', 'generate', 'imshow')
('LinearRegression', 'fit', 'predict')
('cross_val_score', 'mean', 'print')
('WordCloud', 'imshow', 'join')
('figure', 'imshow', 'show')
('WordCloud', 'figure', 'imshow')
('WordCloud', 'imshow', 'show')
('legend', 'title', 'xticks')
('title', 'xticks', 'ylabel')
('range', 'show', 'xticks')
('bar', 'range', 'xticks')
('bar', 'title', 'xticks')
('bar', 'show', 'title', 'xticks')
('bar', 'show', 'xticks')
('show', 'title', 'xticks')
('fit', 'print', 'train_test_split')
('fit', 'predict', 'train_test_split')
('plot', 'set_title', 'sum')
('set_title', 'set_ylabel', 'sum')
('groupby', 'set_title', 'sum')
('groupby', 'set_title', 'set_xlabel')
('groupby', 'set_title', 'set_xlabel', 'set_ylabel')
('groupby', 'plot', 'set_title', 'set_xlabel')
('groupby', 'plot', 'set_title', 'set_xlabel', 'set_ylabel')
('groupby', 'plot', 'set_title')
('groupby', 'set_title', 'set_ylabel')
('groupby', 'plot', 'set_title', 'set_ylabel')
('set_title', 'set_ylabel', 'subplots')
('plot', 'set_title', 'set_xlabel')
('plot', 'set_title', 'set_ylabel')
('plot', 'set_title', 'set_xlabel', 'set_ylabel')
('set_title', 'set_xlabel', 'set_ylabel')
('LdaModel', 'doc2bow', 'print_topics')
('Dictionary', 'doc2bow', 'print_topics')
('Dictionary', 'LdaModel', 'print_topics')
('Dictionary', 'LdaModel', 'doc2bow', 'print_topics')
('agg', 'groupby', 'plot')
('groupby', 'reset_index', 'sum')
('groupby', 'plot', 'subplots')
('set_xlabel', 'set_ylabel', 'subplots')
('set_xlabel', 'set_ylabel', 'show')
('groupby', 'plot', 'set_ylabel')
('groupby', 'set_xlabel', 'set_ylabel')
('groupby', 'plot', 'set_xlabel', 'set_ylabel')
('plot', 'set_xlabel', 'set_ylabel')
('plot', 'set_xlabel', 'sum')
('groupby', 'set_xlabel', 'sum')
('groupby', 'plot', 'set_xlabel', 'sum')
('groupby', 'plot', 'set_xlabel')
('Dictionary', 'LdaModel', 'lower')
('LdaModel', 'doc2bow', 'lower')
('Dictionary', 'LdaModel', 'doc2bow', 'lower')
('Dictionary', 'doc2bow', 'lower')
('count', 'groupby', 'plot')
('Dictionary', 'LdaModel', 'len')
('LdaModel', 'doc2bow', 'len')
('Dictionary', 'doc2bow', 'len')
('Dictionary', 'LdaModel', 'doc2bow', 'len')
('Dictionary', 'LdaModel', 'doc2bow')
('fit', 'mean', 'predict')
('fit', 'mean', 'predict', 'print')
('mean', 'predict', 'print')
('fit', 'predict', 'print')
('groupby', 'sum', 'xlabel')
('groupby', 'show', 'xlabel')
('groupby', 'plot', 'xlabel')
('groupby', 'plot', 'show', 'xlabel')
('sum', 'title', 'xlabel')
('sum', 'title', 'xlabel', 'ylabel')
('show', 'sum', 'xlabel', 'ylabel')
('plot', 'show', 'sum', 'xlabel')
('plot', 'sum', 'xlabel')
('plot', 'sum', 'xlabel', 'ylabel')
('sum', 'xlabel', 'ylabel')
('figure', 'legend', 'xlabel')
('figure', 'legend', 'xlabel', 'ylabel')
('figure', 'title', 'xlabel')
('figure', 'title', 'xlabel', 'ylabel')
('figure', 'plot', 'xlabel')
('figure', 'plot', 'xlabel', 'ylabel')
('figure', 'xlabel', 'ylabel')
('legend', 'title', 'xlabel')
('legend', 'plot', 'title', 'xlabel')
('legend', 'title', 'xlabel', 'ylabel')
('legend', 'plot', 'title', 'xlabel', 'ylabel')
('legend', 'show', 'xlabel')
('legend', 'plot', 'show', 'xlabel')
('legend', 'plot', 'xlabel')
('legend', 'plot', 'xlabel', 'ylabel')
('legend', 'xlabel', 'ylabel')
('show', 'title', 'xlabel')
('show', 'title', 'xlabel', 'ylabel')
('plot', 'show', 'title', 'xlabel')
('plot', 'show', 'title', 'xlabel', 'ylabel')
('show', 'xlabel', 'ylabel')
('plot', 'show', 'xlabel', 'ylabel')
('plot', 'show', 'xlabel')
('plot', 'title', 'xlabel')
('title', 'xlabel', 'ylabel')
('plot', 'title', 'xlabel', 'ylabel')
('plot', 'xlabel', 'ylabel')
('groupby', 'legend', 'plot')
('figure', 'legend', 'ylabel')
('legend', 'show', 'ylabel')
('legend', 'show', 'title')
('legend', 'plot', 'show')
('legend', 'title', 'ylabel')
('legend', 'plot', 'title', 'ylabel')
('legend', 'plot', 'ylabel')
('legend', 'plot', 'title')
('figure', 'groupby', 'title')
('figure', 'groupby', 'sum')
('figure', 'title', 'ylabel')
('figure', 'plot', 'ylabel')
('figure', 'sum', 'title')
('figure', 'show', 'title')
('groupby', 'plot', 'ylabel')
('groupby', 'title', 'ylabel')
('show', 'sum', 'ylabel')
('show', 'sum', 'title', 'ylabel')
('plot', 'sum', 'ylabel')
('plot', 'show', 'sum', 'ylabel')
('sum', 'title', 'ylabel')
('show', 'title', 'ylabel')
('plot', 'show', 'title', 'ylabel')
('plot', 'show', 'ylabel')
('plot', 'title', 'ylabel')
('fit', 'mean', 'print')
('bar', 'range', 'show')
('bar', 'show', 'title')
('isnull', 'print', 'sum')
('groupby', 'plot', 'show', 'title')
('groupby', 'show', 'sum', 'title')
('groupby', 'plot', 'show', 'sum', 'title')
('groupby', 'plot', 'title')
('groupby', 'plot', 'sum', 'title')
('groupby', 'sum', 'title')
('plot', 'sum', 'title')
('plot', 'show', 'sum', 'title')
('show', 'sum', 'title')
('plot', 'show', 'title')
('append', 'len', 'range')
('groupby', 'show', 'sum')
('groupby', 'plot', 'show', 'sum')
('groupby', 'plot', 'show')
('plot', 'show', 'sum')
('apply', 'groupby', 'sum')
('groupby', 'head', 'sum')
('groupby', 'plot', 'sum')



In [139]:

    
def cell_to_vec(cell, classes):
    cell_vec = [0 for i in range(len(classes.keys()))]
    for node in cell:
        cell_vec[classes[node[0]]] = node[1]
    return cell_vec



In [140]:

    
import numpy as np
cell_vecs = []
for cell in itemsets:
    cv = np.array(cell_to_vec(cell, classes))
    cell_vecs.append(cv)
cell_vecs = np.array(cell_vecs)
print (cell_vecs.shape)



In [180]:

    
#CLUSTER
from sklearn.cluster import KMeans
nclust = 6
kmeans = KMeans(n_clusters=nclust, random_state=0).fit(cell_vecs)
np.set_printoptions(precision=4)
#kmeans.cluster_centers_
for cluster in kmeans.cluster_centers_:
    print ("\n\nCLUSTER")
    for i in range(len(cluster)):
        print (class_lookup[i],': %.2f' % cluster[i])









    




CLUSTER
<class '_ast.If'> : 0.00
<class '_ast.ClassDef'> : -0.00
<class '_ast.FunctionDef'> : 0.08
<class '_ast.Assign'> : 0.22
<class '_ast.Expr'> : 0.58
<class '_ast.Try'> : 0.00
<class '_ast.Delete'> : 0.03
<class '_ast.For'> : 0.04
<class '_ast.ImportFrom'> : 0.06
<class '_ast.With'> : 0.00
<class '_ast.Import'> : 0.05
<class '_ast.AugAssign'> : 0.00


CLUSTER
<class '_ast.If'> : -0.00
<class '_ast.ClassDef'> : -0.00
<class '_ast.FunctionDef'> : 0.05
<class '_ast.Assign'> : 2.92
<class '_ast.Expr'> : 0.45
<class '_ast.Try'> : -0.00
<class '_ast.Delete'> : 0.02
<class '_ast.For'> : 0.11
<class '_ast.ImportFrom'> : 0.05
<class '_ast.With'> : 0.00
<class '_ast.Import'> : 0.02
<class '_ast.AugAssign'> : 0.00


CLUSTER
<class '_ast.If'> : -0.00
<class '_ast.ClassDef'> : -0.00
<class '_ast.FunctionDef'> : 0.26
<class '_ast.Assign'> : 3.30
<class '_ast.Expr'> : 13.80
<class '_ast.Try'> : -0.00
<class '_ast.Delete'> : 0.00
<class '_ast.For'> : 0.20
<class '_ast.ImportFrom'> : 0.02
<class '_ast.With'> : -0.00
<class '_ast.Import'> : 0.11
<class '_ast.AugAssign'> : 0.13


CLUSTER
<class '_ast.If'> : 0.00
<class '_ast.ClassDef'> : 0.00
<class '_ast.FunctionDef'> : 0.06
<class '_ast.Assign'> : 0.90
<class '_ast.Expr'> : 4.52
<class '_ast.Try'> : 0.00
<class '_ast.Delete'> : 0.00
<class '_ast.For'> : 0.03
<class '_ast.ImportFrom'> : 0.02
<class '_ast.With'> : 0.00
<class '_ast.Import'> : 0.04
<class '_ast.AugAssign'> : 0.00


CLUSTER
<class '_ast.If'> : -0.00
<class '_ast.ClassDef'> : -0.00
<class '_ast.FunctionDef'> : 0.04
<class '_ast.Assign'> : 9.66
<class '_ast.Expr'> : 1.94
<class '_ast.Try'> : -0.00
<class '_ast.Delete'> : 0.17
<class '_ast.For'> : 0.24
<class '_ast.ImportFrom'> : 0.14
<class '_ast.With'> : -0.00
<class '_ast.Import'> : 0.01
<class '_ast.AugAssign'> : 0.00


CLUSTER
<class '_ast.If'> : -0.00
<class '_ast.ClassDef'> : 0.01
<class '_ast.FunctionDef'> : 0.05
<class '_ast.Assign'> : 0.14
<class '_ast.Expr'> : 1.35
<class '_ast.Try'> : -0.00
<class '_ast.Delete'> : -0.00
<class '_ast.For'> : -0.00
<class '_ast.ImportFrom'> : 3.92
<class '_ast.With'> : 0.02
<class '_ast.Import'> : 5.84
<class '_ast.AugAssign'> : 0.00



In [200]:

    
from helper_classes.cond_computer import CondComputer
class_to_cells = {}
node_list = []
for i, nb in enumerate(a.nb_features):
    node_list.append('start')
    for cell in (nb.get_all_cells()):
        t = cell_to_vec(cell.get_feature('nodes'), classes)
        t = kmeans.predict([t])[0]
        node_list.append(t)
        if t not in class_to_cells:
            class_to_cells[t] = []
        class_to_cells[t].append(cell.get_feature('code'))
    node_list.append('end')
cc = CondComputer(node_list)
keys = ['start']
for i in range(nclust):
    keys.append(i)
keys.append('end')



In [182]:

    
arr, arr_names = cc.compute_probabilities(cc.count_totals,.01,keys=keys)



In [183]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['figure.figsize'] = (20, 10)

cc.plot_bar(arr, arr_names, 'Probability per Node type')



In [184]:

    
cc.plot_conditional_bar(arr, arr_names, 0, 'Probability per Node type', x_labels=keys)









    



start



In [185]:

    
cc.plot_conditional_bar(arr, arr_names, 1, 'Probability per Node type')



In [186]:

    
cc.plot_conditional_bar(arr, arr_names, 2, 'Probability per Node type')



In [187]:

    
cc.plot_conditional_bar(arr, arr_names, 3, 'Probability per Node type')



In [188]:

    
cc.plot_conditional_bar(arr, arr_names, 4, 'Probability per Node type')



In [189]:

    
cc.plot_conditional_bar(arr, arr_names, 5, 'Probability per Node type')



In [205]:

    
# Comment cells or short one liners
for i, cell in enumerate(class_to_cells[0]):
    if i > 20:
        break
    print (cell)









    



# Warm up:

# coding: utf-8

# In[ ]:

df_epfl.head(2)



# coding: utf-8

# In[ ]:

df_epfl.size



# coding: utf-8

# In[ ]:

df_epfl_dw.size


# Data Wrangling:

# coding: utf-8

# In[ ]:

# First check for NaN values
df_epfl.isnull().sum()



# coding: utf-8

# In[ ]:




# Machine Learning :


# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
import warnings
warnings.filterwarnings('ignore')



# coding: utf-8

# In[ ]:

type(t1)



# coding: utf-8

# In[ ]:

t1[0]



# coding: utf-8

# In[ ]:

# FOR EPFL FIRST select only the values that we need



# coding: utf-8

# In[ ]:

epfl_df



# coding: utf-8

# In[ ]:

# ETH Whole Dataframe



# coding: utf-8

# In[ ]:

# I have already done the first part of data wrangling by reducing the dataframe to only the columns i possibly need



# coding: utf-8

# In[ ]:

epfl_df.head()



# coding: utf-8

# In[ ]:

small_epfl = epfl_df[epfl_df['tweetId'] % 10 == 3]



# coding: utf-8

# In[ ]:

small_epfl.shape



# coding: utf-8

# In[ ]:

small_epfl.columns



In [207]:

    
# Short lists of (mainly) Assignments
for i, cell in enumerate(class_to_cells[1]):
    if i > 20:
        break
    print (cell)









    



# coding: utf-8

# In[ ]:

#Load the data in pandas Dataframe

df_epfl = pd.read_json("epfl_en.json")
df_eth = pd.read_json("eth_en.json")



# coding: utf-8

# In[ ]:

#Build the two down sampled Dataframe
df_epfl_dw = df_epfl[df_epfl['id'] % 10 == 2]
df_eth_dw = df_eth[df_eth['id'] % 10 == 2]



# coding: utf-8

# In[ ]:

#It's hard to tell which null values will affect future work at this point, but it seems like the id field is
# never null which is good.

#The other thing that we shall do is to use a proper indexing
df_epfl.index = df_epfl['id']
df_epfl_dw.index = df_epfl_dw['id']

df_eth.index = df_eth['id']
df_eth_dw.index = df_eth_dw['id']

#Also some fields are always null so we can drop them but it doesn't affect future works at this moment.




# coding: utf-8

# In[ ]:

df_epfl['user'] #is a dictionnary we want the user's id

#let's build a df with only the tweeter user's id

#df_epfl['user'].iloc[0]['id']
epfl_user = pd.DataFrame()

for i in range(0, df_epfl.shape[0]):
    epfl_user = epfl_user.append([df_epfl['user'].iloc[i]['id']])
    
#I got tricked and there is only one user as said on exam

unique = epfl_user.drop_duplicates()



# coding: utf-8

# In[ ]:

#lets check the id in eth:

eth_user = pd.DataFrame()

for i in range(0, df_eth.shape[0]):
    eth_user = eth_user.append([df_eth['user'].iloc[i]['id']])
    
#I got tricked and there is only one user tweeting as said on exam

unique2 = eth_user.drop_duplicates()
unique2



# coding: utf-8

# In[ ]:

# First let's count number of overall favorite and retweets
favorites_epfl = df_epfl['retweet_count'].sum()
retweet_epfl = df_epfl['favorite_count'].sum()

favorites_eth = df_eth['retweet_count'].sum()
retweet_eth = df_eth['favorite_count'].sum()

#Let's plot this
plt.figure();
df_show = pd.DataFrame(data=[['EPFL', favorites_epfl+retweet_epfl], ['ETH', favorites_eth + retweet_eth]], columns=['Uni', 'Number of Tweet + Likes'], index=['EPFL', 'ETH'])
df_show.plot(kind='bar')

print(favorites_epfl + retweet_epfl)

#It appears EPFL is more present in the twitter game

#We could also have used a pie chart to show this data effectively



# coding: utf-8

# In[ ]:

# Let's groupby hashtag now 

#let's add a column with only the hashtag from the entities column

df_epfl_hashtag = df_epfl

df_hashtags = pd.DataFrame()

for i in range(0, df_epfl_hashtag.shape[0]):
    
    hashtag_i = df_epfl_hashtag['entities'].iloc[i]['hashtags']
    
    if (hashtag_i != ''):
        hashtags = hashtag_i[0]['text']
    
        df_hashtags['hashtags'] = df_hashtags.append([hashtags])
    else:
        df_hashtags['hashtags'] = df_hashtags.append(['No_tag'])

#df_epfl_hashtag
#df_epfl_hashtag = df_epfl.groupby(df_epfl['hashtag']).sum()

# Will fix later if TIME



# coding: utf-8

# In[ ]:

t1 = pd.read_json('epfl_en.json', typ='dataframe')
t2 = pd.read_json('eth_en.json', typ='dataframe')



# coding: utf-8

# In[ ]:

columns_with_nan = pd.isnull(small_epfl).sum() > 0
columns_with_nan = list(columns_with_nan[columns_with_nan].index)
print(columns_with_nan)



# coding: utf-8

# In[ ]:

small_epfl['id_str'] = small_epfl['id_str'].astype('category')
cat_cols = small_epfl.select_dtypes(['category']).columns
small_epfl[cat_cols] = small_epfl[cat_cols].apply(lambda col : col.cat.codes)



# coding: utf-8

# In[ ]:

small_eth['id_str'] = small_eth['id_str'].astype('category')
cat_cols = small_eth.select_dtypes(['category']).columns
small_eth[cat_cols] = small_eth[cat_cols].apply(lambda col : col.cat.codes)



# coding: utf-8

# In[ ]:

columns_with_nan = pd.isnull(small_eth).sum() > 0
columns_with_nan = list(columns_with_nan[columns_with_nan].index)
print(columns_with_nan)



# coding: utf-8

# In[ ]:

list_of_results = []
neighbors = range(2,50)
for n in neighbors:
    # define cv object
    cv = cross_validation_KFold(X.shape[0], shuffle = True, n_folds=10, random_state=4)
    # initialize classifier
    neigh = KNeighborsRegressor(n_neighbors=n)
    # estimate classifier using CV
    avg_test_accuracy = np.mean(cross_val_score(neigh, X, y, cv=cv))
    list_of_results.append(avg_test_accuracy)
    print(avg_test_accuracy)



# coding: utf-8

# In[ ]:

max_test_accuracy = np.max(list_of_results)
max_pos = list_of_results.index(max_test_accuracy)
opt_n = neighbors[max_pos]
opt_n



# coding: utf-8

# In[ ]:

X = small_eth[['favorite','place','id_str']]
y = small_eth['retweet_count']

list_of_results = []
neighbors = range(2,50)
for n in neighbors:
    # define cv object
    cv = cross_validation_KFold(X.shape[0], shuffle = True, n_folds=10, random_state=4)
    # initialize classifier
    neigh = KNeighborsRegressor(n_neighbors=n)
    # estimate classifier using CV
    avg_test_accuracy = np.mean(cross_val_score(neigh, X, y, cv=cv))
    list_of_results.append(avg_test_accuracy)
    print(avg_test_accuracy)



# coding: utf-8

# In[ ]:

max_test_accuracy = np.max(list_of_results)
max_pos = list_of_results.index(max_test_accuracy)
opt_n = neighbors[max_pos]
opt_n



# coding: utf-8

# In[ ]:

epfl_en['year'] = [data.year for data in epfl_en.created_at]
epfl_en['month'] = [data.month for data in epfl_en.created_at]
epfl_en['hour'] = [data.hour for data in epfl_en.created_at]



# coding: utf-8

# In[ ]:

eth_en['year'] = [data.year for data in eth_en.created_at]
eth_en['month'] = [data.month for data in eth_en.created_at]
eth_en['hour'] = [data.hour for data in eth_en.created_at]



# coding: utf-8

# In[ ]:

num_topics = 5
num_top_words = 7



# coding: utf-8

# In[ ]:

mod_tokens = mod_text.apply(lambda tweet: tokenizer(tweet))
m_lower_case = mod_tokens.apply(lambda row : to_lower_case(row))
m_filtered_tokens = lower_case.apply(lambda row: filter_tokens(row))
m_lemmas = m_filtered_tokens.apply(lambda row: lemmantizer(row))



# coding: utf-8

# In[ ]:

dictionary = corpora.Dictionary(m_lemmas)
m_corpus = get_corpus(m_lemmas)



In [208]:

    
# Long lists of (mainly) Function calls. Alot of plotting happening here
for i, cell in enumerate(class_to_cells[2]):
    if i > 20:
        break
    print (cell)









    



# coding: utf-8

# In[ ]:

print('Favorites per year:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Favorites')
plt.legend()
plt.show()
print('Retweets per year:')
plt.plot(group_by_and_aggregate(epfl[['retweet_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['retweet_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Retweets')
plt.legend()
plt.show()



# coding: utf-8

# In[ ]:

epflRetCountYear={}
epflRetCountMonth={}
epflRetCountHour={}
epflRetMeanYear=0
epflRetMeanMonth=0
epflRetMeanHour=0

for year in sorted(epfl_en.year.unique()):
    epflRetCountYear[year]=epfl_en[epfl_en['year']==year]['retweet_count'].sum()
    epflRetMeanYear+=epflRetCountYear[year]
for month in sorted(epfl_en.month.unique()):
    epflRetCountMonth[month]=epfl_en[epfl_en['month']==month]['retweet_count'].sum()
    epflRetMeanMonth+=epflRetCountMonth[month]    
for hour in sorted(epfl_en.hour.unique()):
    epflRetCountHour[hour]=epfl_en[epfl_en['hour']==hour]['retweet_count'].sum()
    epflRetMeanHour+=epflRetCountHour[hour]    

epflRetMeanHour/=len(epfl_en.hour.unique())
epflRetMeanYear/=len(epfl_en.year.unique())
epflRetMeanMonth/=len(epfl_en.month.unique())

# Year Plot Count
ind = np.arange(len(epfl_en.year.unique()))
# # width = 0.65
fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(ind,  epflRetCountYear.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.year.unique()),rotation='vertical')
ax.axhline(epflRetMeanYear,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','year'],loc='best')
plt.title('retweets per year')
plt.tight_layout()
plt.show()

# Year Month Count
ind = np.arange(len(epfl_en.month.unique()))
# # width = 0.65
fig = plt.figure()
ax = fig.add_subplot(122)
ax.bar(ind, epflRetCountMonth.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.month.unique()),rotation='vertical')
ax.axhline(epflRetMeanMonth,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','month'],loc='best')
plt.title('retweets per month')
plt.tight_layout()
plt.show()

# # Year Hour Count
ind = np.arange(len(epfl_en.hour.unique()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(133)
ax.bar(ind, epflRetCountHour.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.hour.unique()),rotation='vertical')
ax.axhline(epflRetMeanHour,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','hour'],loc='best')
plt.title('retweets per hour')
plt.tight_layout()
plt.show()




# coding: utf-8

# In[ ]:

epflFavCountYear={}
epflFavCountMonth={}
epflFavCountHour={}
epflFavMeanYear=0
epflFavMeanMonth=0
epflFavMeanHour=0

for year in sorted(epfl_en.year.unique()):
    epflFavCountYear[year]=epfl_en[epfl_en['year']==year]['favorite_count'].sum()
    epflFavMeanYear+=epflFavCountYear[year]
for month in sorted(epfl_en.month.unique()):
    epflFavCountMonth[month]=epfl_en[epfl_en['month']==month]['favorite_count'].sum()
    epflFavMeanMonth+=epflFavCountMonth[month]    
for hour in sorted(epfl_en.hour.unique()):
    epflFavCountHour[hour]=epfl_en[epfl_en['hour']==hour]['favorite_count'].sum()
    epflFavMeanHour+=epflFavCountHour[hour]    

epflFavMeanHour/=len(epfl_en.hour.unique())
epflFavMeanYear/=len(epfl_en.year.unique())
epflFavMeanMonth/=len(epfl_en.month.unique())

# Year Plot Count
ind = np.arange(len(epfl_en.year.unique()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(144)
ax.bar(ind,  epflFavCountYear.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.year.unique()),rotation='vertical')
ax.axhline(epflFavMeanYear,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','year'],loc='best')
plt.title('favorites per year')
plt.tight_layout()
plt.show()

# Year Month Count
ind = np.arange(len(epfl_en.month.unique()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(155)
ax.bar(ind, epflFavCountMonth.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.month.unique()))
ax.axhline(epflFavMeanMonth,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','month'],loc='best')
plt.title('favorites per month')
plt.tight_layout()
plt.show()

# # Year Hour Count
ind = np.arange(len(epfl_en.hour.unique()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(166)
ax.bar(ind, epflFavCountHour.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.hour.unique()))
ax.axhline(epflFavMeanHour,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','hour'],loc='best')
plt.title('favorites per hour')
plt.tight_layout()
plt.show()
# epfl_en.hour.unique()



# coding: utf-8

# In[ ]:

f, axarr = plt.subplots(3,2, figsize=(10,10), sharey=True)

axarr[0,0].plot(epfl_by_year)
axarr[0,0].set_title('EPFL Favorites and retweets by year')
axarr[0,0].legend(['favorite_count','retweet_count'], loc='best')

axarr[0,1].plot(ethz_by_year)
axarr[0,1].set_title('ETHZ Favorites and retweets by year')
axarr[0,1].legend(['favorite_count','retweet_count'], loc='best')

axarr[1,0].plot(epfl_by_month)
axarr[1,0].set_title('EPFL Favorites and retweets by month')
axarr[1,0].legend(['favorite_count','retweet_count'], loc='best')

axarr[1,1].plot(ethz_by_month)
axarr[1,1].set_title('ETHZ Favorites and retweets by month')
axarr[1,1].legend(['favorite_count','retweet_count'], loc='best')

axarr[2,0].plot(epfl_by_hour)
axarr[2,0].set_title('EPFL Favorites and retweets by hour')
axarr[2,0].legend(['favorite_count','retweet_count'], loc='best')

axarr[2,1].plot(ethz_by_hour)
axarr[2,1].set_title('ETHZ Favorites and retweets by hour')
axarr[2,1].legend(['favorite_count','retweet_count'], loc='best')

f.subplots_adjust(hspace=0.5, wspace=1)



# coding: utf-8

# In[ ]:

# year
import matplotlib.pyplot as plt

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

data_retweet = []
data_favourite = []
for year in reversed(df_epfl.year.unique()):
    data_retweet.append([list_epfl_retweet['year'][year], list_eth_retweet['year'][year]])
    data_favourite.append([list_epfl_favourite['year'][year], list_eth_favourite['year'][year]])
    
pd.DataFrame(data_retweet, columns=['epfl','eth'], index=reversed(df_epfl.year.unique())).plot(kind='bar', ax = ax1)
pd.DataFrame(data_favourite, columns=['epfl','eth'], index=reversed(df_epfl.year.unique())).plot(kind='bar', ax = ax2)

ax1.set_title("Nb retweets vs year")
ax1.set_xlabel("Nb retweets")
ax1.set_ylabel("Year")

ax2.set_title("Nb favourites vs year")
ax2.set_xlabel("Nb retweets")
ax2.set_ylabel("Year")

plt.show()



# coding: utf-8

# In[ ]:

# month
import matplotlib.pyplot as plt

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

data_retweet = []
data_favourite = []
for month in reversed(df_epfl.month.unique()):
    data_retweet.append([list_epfl_retweet['month'][month], list_eth_retweet['month'][month]])
    data_favourite.append([list_epfl_favourite['month'][month], list_eth_favourite['month'][month]])
    
pd.DataFrame(data_retweet, columns=['epfl','eth'], index=reversed(df_epfl.month.unique())).plot(kind='bar', ax = ax1)
pd.DataFrame(data_favourite, columns=['epfl','eth'], index=reversed(df_epfl.month.unique())).plot(kind='bar', ax = ax2)

ax1.set_title("Nb retweets vs months")
ax1.set_xlabel("Nb retweets")
ax1.set_ylabel("Month")

ax2.set_title("Nb favourites vs months")
ax2.set_xlabel("Nb retweets")
ax2.set_ylabel("Month")

plt.show()



# coding: utf-8

# In[ ]:

# hour
import matplotlib.pyplot as plt

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

data_retweet = []
data_favourite = []
for hour in reversed(df_epfl.hour.unique()):
    data_retweet.append([list_epfl_retweet['hour'][hour], list_eth_retweet['hour'][hour]])
    data_favourite.append([list_epfl_favourite['hour'][hour], list_eth_favourite['hour'][hour]])
    
pd.DataFrame(data_retweet, columns=['epfl','eth'], index=reversed(df_epfl.hour.unique())).plot(kind='bar', ax = ax1)
pd.DataFrame(data_favourite, columns=['epfl','eth'], index=reversed(df_epfl.hour.unique())).plot(kind='bar', ax = ax2)

ax1.set_title("Nb retweets vs hours")
ax1.set_xlabel("Nb retweets")
ax1.set_ylabel("Hour")

ax2.set_title("Nb favourites vs hours")
ax2.set_xlabel("Nb retweets")
ax2.set_ylabel("Hour")

plt.show()



# coding: utf-8

# In[ ]:

stopwordlist = stopwords.words(['english', 'french'])
stopwordlist.append("—")
stopwordlist.append("•")
stopwordlist.append("--")
stopwordlist.append("-")
stopwordlist.append("A")
stopwordlist.append("The")
stopwordlist.append("&amp;")
stopwordlist.append("...")
stopwordlist.append("–")
stopwordlist.append("...\"")
stopwordlist.append("&")
stopwordlist.append("\"\"The")
stopwordlist.append("I")
stopwordlist.append("via")
stopwordlist.append("(via")
stopwordlist.append("We")
stopwordlist.append(":)")
stopwordlist.append("|")
stopwordlist.append("!")



# coding: utf-8

# In[ ]:

fig1 = plt.figure()
sub = fig1.add_subplot(111)
x = df_eth_year['year']
y1 = df_eth_year['Total Favorites']
y2 = df_eth_year['Total Retweets']
y3 = df_epfl_year['Total Favorites']
y4 = df_epfl_year['Total Retweets']

sub.plot(x,y1,c='b',label='ETH Favorites')
sub.plot(x,y2,c='r', label='ETH Retweets')
sub.plot(x,y3,c='g', label='EPFL Favorites')
sub.plot(x,y4,c='y', label='EPFL Retweets')

plt.legend(loc='upper left')
plt.ylabel('Total')
plt.xlabel('Year')
plt.title('Yearly Growth of Retweets and Favorites at EPFL and ETH')

plt.show()



# coding: utf-8

# In[ ]:

#plot monthly trends
fig2 = plt.figure()
sub2 = fig2.add_subplot(211)
x = df_eth_month['month']
y1 = df_eth_month['Monthly Favorites']
y2 = df_eth_month['Monthly Retweets']
y3 = df_epfl_month['Monthly Favorites']
y4 = df_epfl_month['Monthly Retweets']

sub2.plot(x,y1,'bo',label='ETH Favorites')
sub2.plot(x,y2,'ro', label='ETH Retweets')
sub2.plot(x,y3,'go', label='EPFL Favorites')
sub2.plot(x,y4,'yo', label='EPFL Retweets')

sub2.legend(bbox_to_anchor=(1.5, 1.5))
plt.ylabel('Total')
plt.xlabel('Month')
plt.title('Monthly Trends of Retweets and Favorites at EPFL and ETH')

plt.show()



# coding: utf-8

# In[ ]:

#plot monthly trends with adjusted axis
fig3 = plt.figure()
sub3 = fig3.add_subplot(211)
x = df_eth_month['month']
y1 = df_eth_month['Monthly Favorites']
y2 = df_eth_month['Monthly Retweets']
y3 = df_epfl_month['Monthly Favorites']
y4 = df_epfl_month['Monthly Retweets']

sub3.plot(x,y1,'bo',label='ETH Favorites')
sub3.plot(x,y2,'ro', label='ETH Retweets')
sub3.plot(x,y3,'go', label='EPFL Favorites')
sub3.plot(x,y4,'yo', label='EPFL Retweets')

plt.axis([0, 13, 0, 2500])
sub3.legend(bbox_to_anchor=(1.5, 1.5))
plt.ylabel('Total')
plt.xlabel('Month')
plt.title('Monthly Trends of Retweets and Favorites at EPFL and ETH')

plt.show()



# coding: utf-8

# In[ ]:

#plot hourly trends
fig4 = plt.figure()
sub4 = fig4.add_subplot(211)
x1 = df_eth_hour['hour']
x2 = df_epfl_hour['hour']

y1 = df_eth_hour['Hourly Favorites']
y2 = df_eth_hour['Hourly Retweets']
y3 = df_epfl_hour['Hourly Favorites']
y4 = df_epfl_hour['Hourly Retweets']

sub4.plot(x1,y1, 'ro',label='ETH Favorites')
sub4.plot(x1,y2, 'bo',label='ETH Retweets')
sub4.plot(x2,y3, 'go',label='EPFL Favorites')
sub4.plot(x2,y4, 'yo',label='EPFL Retweets')

sub4.legend(bbox_to_anchor=(1.5, 1.5))
plt.ylabel('Total')
plt.xlabel('Hour')
plt.title('Hourly Trends of Retweets and Favorites at EPFL and ETH')

plt.show()



# coding: utf-8

# In[ ]:

eth.drop(['contributors'],inplace=True,axis=1,errors='ignore')
eth.drop(['coordinates'],inplace=True,axis=1,errors='ignore')
eth.drop(['geo'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_screen_name'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_status_id'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_status_id_str'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_user_id'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_user_id_str'],inplace=True,axis=1,errors='ignore')
eth.drop(['place'],inplace=True,axis=1,errors='ignore')

eth.drop(['quoted_status'],inplace=True,axis=1,errors='ignore')
eth.drop(['quoted_status_id'],inplace=True,axis=1,errors='ignore')
eth.drop(['quoted_status_id_str'],inplace=True,axis=1,errors='ignore')




eth.count()



# coding: utf-8

# In[ ]:

epfl_df = pd.read_json('epfl_en.json')
ds_epfl = epfl_df.copy()
ds_epfl = ds_epfl[(epfl_df.id % 10) == 0]

#remove all values with no info

epfl_df.dropna(axis = 0,how = 'all',inplace = True)

epfl_df.columns
epfl_df.coordinates.replace('None', 'NaN',inplace = True)
epfl_df.describe()
epfl_df.drop('contributors',axis = 1,inplace = True)
epfl_df.describe()
epfl_df.columns
#remove those with strings therefore duplicated info and those without extra value such as language 
epfl_df = epfl_df.iloc[:,[0,1,2,4,5,6,7,9,10,12,16,17,19,20,21,22,23,25,27]]
epfl_df.columns
epfl_df.describe()
#I did not remove na's so as to retain as much data as possible for further processing. however the None 
#were replaced with NaN incase I needed to use fill Na later on
epfl_df.iloc[:,[8,9,12,13]].replace('None', 'NaN',inplace = True)
#remove location information as not doing question 2 and informations that are strings if missed previously
epfl_df.drop(['coordinates','quoted_status_id_str','geo','in_reply_to_screen_name','place','possibly_sensitive'],axis=1,inplace = True)
epfl_df.describe()



# coding: utf-8

# In[ ]:

eth_df = pd.read_json('eth_en.json')
ds_eth = eth_df.copy()
ds_eth = ds_eth[(epfl_df.id % 10) == 0]

eth_df.dropna(axis = 0,how = 'all',inplace = True)

eth_df.columns
eth_df.coordinates.replace('None', 'NaN',inplace = True)
eth_df.describe()
eth_df.drop('contributors',axis = 1,inplace = True)
eth_df.describe()
eth_df.columns
#remove those with strings therefore duplicated info and those without extra value such as language 
eth_df = eth_df.iloc[:,[0,1,2,4,5,6,7,9,10,12,16,17,19,20,21,22,23,25,27]]
eth_df.columns
eth_df.describe()
eth_df.iloc[:,[8,9,12,13]].replace('None', 'NaN',inplace = True)
#remove location information as not doing question 2
eth_df.drop(['coordinates','geo','in_reply_to_screen_name','place','possibly_sensitive'],axis=1,inplace = True)
eth_df.head()



# coding: utf-8

# In[ ]:


#remove all values with no info

ds_epfl.dropna(axis = 0,how = 'all',inplace = True)

ds_epfl.columns
ds_epfl.coordinates.replace('None', 'NaN',inplace = True)
ds_epfl.describe()
ds_epfl.drop('contributors',axis = 1,inplace = True)
ds_epfl.describe()
ds_epfl.columns
#remove those with strings therefore duplicated info and those without extra value such as language 
ds_epfl = ds_epfl.iloc[:,[0,1,2,4,5,6,7,9,10,12,16,17,19,20,21,22,23,25,27]]
ds_epfl.columns
ds_epfl.describe()
ds_epfl.iloc[:,[8,9,12,13]].replace('None', 'NaN',inplace = True)
#remove location information as not doing question 2
ds_epfl.drop(['coordinates','geo','in_reply_to_screen_name','place','possibly_sensitive'],axis=1,inplace = True)
ds_epfl.head()



# coding: utf-8

# In[ ]:



ds_eth.dropna(axis = 0,how = 'all',inplace = True)

ds_eth.columns
ds_eth.coordinates.replace('None', 'NaN',inplace = True)
ds_eth.describe()
ds_eth.drop('contributors',axis = 1,inplace = True)
ds_eth.describe()
ds_eth.columns
#remove those with strings therefore duplicated info and those without extra value such as language 
ds_eth = ds_eth.iloc[:,[0,1,2,4,5,6,7,9,10,12,16,17,19,20,21,22,23,25,27]]
ds_eth.columns
ds_eth.describe()
ds_eth.iloc[:,[8,9,12,13]].replace('None', 'NaN',inplace = True)
#remove location information as not doing question 2
ds_eth.drop(['coordinates','geo','in_reply_to_screen_name','place','possibly_sensitive'],axis=1,inplace = True)
ds_eth.head()



# coding: utf-8

# In[ ]:

print('unique values per month')
print(epfl2['month'].unique())
print(eth2['month'].unique())

print('unique values per year')
print(epfl2['year'].unique())
print(eth2['year'].unique())

print('unique values per daily hour')
print(epfl2['hour_of_day'].unique())
print(eth2['hour_of_day'].unique())

print('unique universities')
print(epfl2['uni'].unique())
print(eth2['uni'].unique())



# coding: utf-8

# In[ ]:

#Tests for the values of the columns :
print('Contributors EPFL (null entries):',Counter(df_epfl.contributors.isnull().values))
print('Contributors ETH (null entries) :',Counter(df_ethz.contributors.isnull().values))

print('\nCoordinates EPFL (null entries) :', Counter(df_epfl.coordinates.isnull().values))
print('Coordinates ETH (null entries) :', Counter(df_ethz.coordinates.isnull().values))

print('\nGeo Coordinates EPFL (null entries) :', Counter(df_epfl.geo.isnull().values))
print('Geo Coordinates ETH (null entries) :', Counter(df_ethz.geo.isnull().values))

print('\nIn Reply to Status EPFL (null entries) :', Counter(df_epfl.in_reply_to_status_id.isnull().values))
print('In Reply to Status ETH (null entries) :', Counter(df_ethz.in_reply_to_status_id.isnull().values))

print('\nIn Reply to User EPFL (null entries) :', Counter(df_epfl.in_reply_to_user_id_str.isnull().values))
print('In Reply to User ETH (null entries) :', Counter(df_ethz.in_reply_to_user_id_str.isnull().values))

print('\nIs Quote EPFL :', Counter(df_epfl.is_quote_status.values))
print('Is Quote ETH :', Counter(df_ethz.is_quote_status.values))

print('\nQuoted Status EPFL (null entries) :', Counter(df_epfl.quoted_status.isnull().values))
print('Quoted Status ETH (null entries) :', Counter(df_ethz.quoted_status.isnull().values))

print('\nQuoted Status ID EPFL (null entries) :', Counter(df_epfl.quoted_status_id.isnull().values))
print('Quoted Status ID ETH (null entries) :', Counter(df_ethz.quoted_status_id.isnull().values))

print('\nPlace EPFL (null entries) :', Counter(df_epfl.place.isnull().values))
print('Place ETH (null entries) :', Counter(df_ethz.place.isnull().values))

print('\nPossibly Sensitive EPFL :', Counter(df_epfl.possibly_sensitive.isnull().values))
print('Possibly Sensitive ETH :', Counter(df_ethz.possibly_sensitive.isnull().values))

print('\nExtended Entities EPFL (null entries) :', Counter(df_epfl.extended_entities.isnull().values))
print('Extended Entities ETH (null entries) :', Counter(df_ethz.extended_entities.isnull().values))



# coding: utf-8

# In[ ]:

#Tests for the values of the columns :
print('Contributors EPFL (null entries):',Counter(df_epfl_ds.contributors.isnull().values))
print('Contributors ETH (null entries) :',Counter(df_ethz_ds.contributors.isnull().values))

print('\nCoordinates EPFL (null entries) :', Counter(df_epfl_ds.coordinates.isnull().values))
print('Coordinates ETH (null entries) :', Counter(df_ethz_ds.coordinates.isnull().values))

print('\nGeo Coordinates EPFL (null entries) :', Counter(df_epfl_ds.geo.isnull().values))
print('Geo Coordinates ETH (null entries) :', Counter(df_ethz_ds.geo.isnull().values))

print('\nIn Reply to Status EPFL (null entries) :', Counter(df_epfl_ds.in_reply_to_status_id.isnull().values))
print('In Reply to Status ETH (null entries) :', Counter(df_ethz_ds.in_reply_to_status_id.isnull().values))

print('\nIn Reply to User EPFL (null entries) :', Counter(df_epfl_ds.in_reply_to_user_id_str.isnull().values))
print('In Reply to User ETH (null entries) :', Counter(df_ethz_ds.in_reply_to_user_id_str.isnull().values))

print('\nIs Quote EPFL :', Counter(df_epfl_ds.is_quote_status.values))
print('Is Quote ETH :', Counter(df_ethz_ds.is_quote_status.values))

print('\nQuoted Status EPFL (null entries) :', Counter(df_epfl_ds.quoted_status.isnull().values))
print('Quoted Status ETH (null entries) :', Counter(df_ethz_ds.quoted_status.isnull().values))

print('\nQuoted Status ID EPFL (null entries) :', Counter(df_epfl_ds.quoted_status_id.isnull().values))
print('Quoted Status ID ETH (null entries) :', Counter(df_ethz_ds.quoted_status_id.isnull().values))

print('\nPlace EPFL (null entries) :', Counter(df_epfl_ds.place.isnull().values))
print('Place ETH (null entries) :', Counter(df_ethz_ds.place.isnull().values))

print('\nPossibly Sensitive EPFL :', Counter(df_epfl_ds.possibly_sensitive.isnull().values))
print('Possibly Sensitive ETH :', Counter(df_ethz_ds.possibly_sensitive.isnull().values))

print('\nExtended Entities EPFL (null entries) :', Counter(df_epfl_ds.extended_entities.isnull().values))
print('Extended Entities ETH (null entries) :', Counter(df_ethz_ds.extended_entities.isnull().values))



# coding: utf-8

# In[ ]:

df_epf_year = df_epf.groupby('year')[['retweet_count','favorite_count']].sum()
df_epf_sub_year = df_epf_subsample.groupby('year')[['retweet_count','favorite_count']].sum()
df_epf_year.head(3)

df_eth_year = df_eth.groupby('year')[['retweet_count','favorite_count']].sum()
df_eth_sub_year = df_eth_subsample.groupby('year')[['retweet_count','favorite_count']].sum()
df_eth_year.head(3)

plt.plot(df_epf_year.index, df_epf_year.retweet_count, 'r')
plt.plot(df_epf_year.index, df_epf_year.favorite_count, 'r--')

plt.ticklabel_format(style='plain', axis='x')

plt.plot(df_eth_year.index, df_eth_year.retweet_count, 'b')
plt.plot(df_eth_year.index, df_eth_year.favorite_count, 'b--')

plt.ticklabel_format(style='plain', axis='x')
print('We can see here the comparison bestween retween (line) and favorits (dashed line) between ' + 
      'EPFL (red) and ETHZ (blue)')
plt.show



In [209]:

    
# Shorter lists of (mainly) function calls. Some plotting/some random stuff
for i, cell in enumerate(class_to_cells[3]):
    if i > 20:
        break
    print (cell)









    



# coding: utf-8

# In[ ]:

df_g_hour.plot.bar()
plt.title('ETHZ tweets - variation over the years')
plt.show()
df_f_hour.plot.bar()
plt.title('EPFL tweets - variation over the years')
plt.show()



# coding: utf-8

# In[ ]:

df_g_year.plot.bar()
plt.title('ETHZ tweets - variation over the years')
plt.show()
df_f_year.plot.bar()
plt.title('EPFL tweets - variation over the years')
plt.show()



# coding: utf-8

# In[ ]:

ax_year_retweet = grouped_year_retweet_count['sum'].plot(kind='bar', color = 'blue')
ax_year_retweet.set_xlabel("Year", fontsize=10)
ax_year_favorite = grouped_year_favorite_count['sum'].plot(kind='bar', color='grey', title ="# of retweets (blue) and # liked tweets (gray)", figsize=(5, 5), fontsize=10)
ax_year_favorite.set_xlabel("Year", fontsize=10)
plt.show()



# coding: utf-8

# In[ ]:

plt.plot(rmses,'b')
plt.title("RMSES / estimators")
plt.xlabel("Number of estimators")
plt.ylabel("RMSE")
plt.show()



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))

sns.countplot(x='year', hue='university', data=engagement)

plt.suptitle('Count of tweets by year', size=20)



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))

sns.barplot(x='year', y='retweet_count', hue='university', data=engagement)

plt.suptitle('Mean of retweet_count by year', size=20)



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))

sns.barplot(x='year', y='favorite_count', hue='university', data=engagement)

plt.suptitle('Mean of favorite_count by year', size=20)



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))

sns.countplot(x='month', hue='university', data=engagement)

plt.suptitle('Count of tweets by month', size=20)



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))

sns.barplot(x='month', y='retweet_count', hue='university', data=engagement)

plt.suptitle('Mean of retweet_count by month', size=20)



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))


sns.pointplot(x='month', y='retweet_count', data=summed_l, join=False)
sns.pointplot(x='month', y='retweet_count', data=summed_z, join=False, color=palette[2])



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))


sns.pointplot(x='month', y='favorite_count', data=summed_l, join=False)
sns.pointplot(x='month', y='favorite_count', data=summed_z, join=False, color=palette[2])



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))

sns.countplot(x='hour', hue='university', data=engagement)

plt.suptitle('Count of tweets by year', size=20)



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))


sns.pointplot(x='hour', y='retweet_count', data=summed_l, join=False)
sns.pointplot(x='hour', y='retweet_count', data=summed_z, join=False, color=palette[2])



# coding: utf-8

# In[ ]:

plt.subplots(figsize=(8,6))


sns.pointplot(x='hour', y='favorite_count', data=summed_l, join=False)
sns.pointplot(x='hour', y='favorite_count', data=summed_z, join=False, color=palette[2])



# coding: utf-8

# In[ ]:

wc = WordCloud().generate(raw_corpus)
plt.subplots(figsize=(10,15))
plt.imshow(wc)
plt.axis("off")
plt.show()



# coding: utf-8

# In[ ]:

def plot_stats(epfl, ethz, labels):
    ind = np.arange(N)
    fig = plt.figure()
    width = 0.2
    ax = fig.add_subplot(111)
    yvals = [epfl.favorite_count.sum(), ethz.favorite_count.sum()]
    rects1 = ax.bar(ind, yvals, width, color='r')
    zvals = [epfl.retweet_count.sum(), ethz.retweet_count.sum()]
    rects2 = ax.bar(ind+width, zvals, width, color='g')


    ax.set_ylabel('Count')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(labels)
    ax.legend( (rects1[0], rects2[0]), ('Favorites', 'Retweets'))

    plt.show()
    
print('Full dataset:')
plot_stats(epfl, ethz, labels=['EPFL', 'ETHZ'])
print('Reduced dataset:')
plot_stats(epfl_small, ethz_small, labels=['EPFL', 'ETHZ'])



# coding: utf-8

# In[ ]:

print('Favorites per month:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.month), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.month), color='b', label='ETHZ')
plt.xlabel('Month')
plt.ylabel('Favorites')
plt.legend()
plt.show()



# coding: utf-8

# In[ ]:

print('Favorites per hour:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.hour), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.hour), color='b', label='ETHZ')
plt.xlabel('Hour')
plt.ylabel('Favorites')
plt.legend()
plt.show()



# coding: utf-8

# In[ ]:

print('Retweets per hour:')
plt.plot(group_by_and_aggregate(epfl[['retweet_count', 'created_at']], lambda dt: dt.hour), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['retweet_count', 'created_at']], lambda dt: dt.hour), color='b', label='ETHZ')
plt.xlabel('Hour')
plt.ylabel('Retweets')
plt.legend()
plt.show()



# coding: utf-8

# In[ ]:

ind = np.arange(len(epfl_en.year.value_counts()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.bar(ind,  epfl_en.year.value_counts().sort_index(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.year.unique()))
ax.axhline(epfl_en.year.value_counts().mean(),color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','year'],loc='best')
plt.title('tweets per year')
plt.tight_layout()
plt.show()



# coding: utf-8

# In[ ]:

p1 = plt.scatter(years, epfl_year_retweet[years],marker = 'x',color='b',s=60)
p2 = plt.scatter(years, eth_year_retweet[years],marker = 'o',color='r',s=60)
plt.xticks(years)
plt.legend((p1, p2), ('EPFL', 'ETHZ'), loc=2)
plt.title("Number of retweets per year")



In [210]:

    
# Longer mixtures of both assignments and function calls
for i, cell in enumerate(class_to_cells[4]):
    if i > 20:
        break
    print (cell)









    



# coding: utf-8

# In[ ]:

# Now let's check per year
df_epfl_year = df_epfl.groupby(df_epfl['created_at'].map(lambda x: x.year)).sum()
df_epfl_year = df_epfl_year[["favorite_count", "retweet_count"]]
df_epfl_year['total'] = df_epfl_year['favorite_count'] + df_epfl_year['retweet_count'] 

df_eth_year = df_eth.groupby(df_eth['created_at'].map(lambda x: x.year)).sum()
df_eth_year = df_eth_year[["favorite_count", "retweet_count"]]
df_eth_year['total'] = df_eth_year['favorite_count'] + df_eth_year['retweet_count'] 

fig = plt.figure() # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes

width = 0.4

df_epfl_year.total.plot(kind='bar', color='blue', width=width, position=0)
df_eth_year.total.plot(kind='bar', color='red', width=width, position=1)

plt.show()

# We can see that the account got more active over time and that epfl is on top almost every year

#Need to fix label names , blue = EPFL, red = ETH



# coding: utf-8

# In[ ]:

# Now let's check per month
df_epfl_year = df_epfl.groupby(df_epfl['created_at'].map(lambda x: x.month)).sum()
df_epfl_year = df_epfl_year[["favorite_count", "retweet_count"]]
df_epfl_year['total'] = df_epfl_year['favorite_count'] + df_epfl_year['retweet_count'] 

df_eth_year = df_eth.groupby(df_eth['created_at'].map(lambda x: x.month)).sum()
df_eth_year = df_eth_year[["favorite_count", "retweet_count"]]
df_eth_year['total'] = df_eth_year['favorite_count'] + df_eth_year['retweet_count'] 


fig = plt.figure() # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes

width = 0.4

df_epfl_year.total.plot(kind='bar', color='blue', width=width, position=0)
df_eth_year.total.plot(kind='bar', color='red', width=width, position=1)

plt.show()


#Need to fix label names , blue = EPFL, red = ETH



# coding: utf-8

# In[ ]:

# Now let's check per month
df_epfl_year = df_epfl.groupby(df_epfl['created_at'].map(lambda x: x.hour)).sum()
df_epfl_year = df_epfl_year[["favorite_count", "retweet_count"]]
df_epfl_year['total'] = df_epfl_year['favorite_count'] + df_epfl_year['retweet_count'] 

df_eth_year = df_eth.groupby(df_eth['created_at'].map(lambda x: x.hour)).sum()
df_eth_year = df_eth_year[["favorite_count", "retweet_count"]]
df_eth_year['total'] = df_eth_year['favorite_count'] + df_eth_year['retweet_count'] 


fig = plt.figure() # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes

width = 0.4

df_epfl_year.total.plot(kind='bar', color='blue', width=width, position=0)
df_eth_year.total.plot(kind='bar', color='red', width=width, position=1)

plt.show()


#At around noon, the twitter from EPFL is quite active

#Need to fix label names , blue = EPFL, red = ETH



# coding: utf-8

# In[ ]:

# First let's a really simple regressor the class to predict is the number of retweets, and the input will be the number of favorite

validation_size = 300 #about 10%

Y = df_epfl['retweet_count'] 
Y2 = df_eth['retweet_count']

Y = Y.append(Y2)

X = df_epfl['favorite_count']
X2 = df_eth['favorite_count']

X = X.append(X2)

# let's keep some part of it for valdiation

validation_features = X[:validation_size]
validation_labels = Y[:validation_size]

train_features = X[validation_size:]
train_labels = Y[validation_size:]

model = LinearRegression()
model.fit(train_features.values.reshape(-1,1), train_labels.values)

# now let's check accuracy on the validation set

y_predict = model.predict(validation_features.values.reshape(-1,1))
#Acc_score = metrics.accuracy_score(validation_labels.values.reshape(-1), np.round(y_predict))

print('R2 score given by model is', model.score(validation_features.values.reshape(-1,1), validation_labels.values))

#let's check the rmse error
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(validation_labels.values.reshape(-1), y_predict))

print('rmse error is', rms)

# Doesn't seem to be incredible with only one feature, as expected



# coding: utf-8

# In[ ]:

#Let's quickly check the difference in the downsampled with only one feature

validation_size = 100 

Y = df_epfl_dw['retweet_count'] 
Y2 = df_eth_dw['retweet_count']

Y = Y.append(Y2)

X = df_epfl_dw['favorite_count']
X2 = df_eth_dw['favorite_count']

X = X.append(X2)

# let's keep some part of it for valdiation

validation_features = X[:validation_size]
validation_labels = Y[:validation_size]

train_features = X[validation_size:]
train_labels = Y[validation_size:]

model = LinearRegression()
model.fit(train_features.values.reshape(-1,1), train_labels.values)

# now let's check accuracy on the validation set

#y_predict = model.predict(validation_features.values.reshape(-1,1))
#Acc_score = metrics.accuracy_score(validation_labels.values.reshape(-1), np.round(y_predict))

print('R2 score given by model is', model.score(validation_features.values.reshape(-1,1), validation_labels.values))

#let's check the rmse error
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(validation_labels.values.reshape(-1), y_predict))

print('rmse error is', rms)

#The rmse is smaller but the R2 is worse, the down sampled regressor should be worse, as expected



# coding: utf-8

# In[ ]:

# Now let's also include the time 

validation_size = 300 #about 10%

Y = df_epfl['retweet_count'] 
Y2 = df_eth['retweet_count']
Y = Y.append(Y2)

X = df_epfl[['favorite_count', 'created_at']]
X2 = df_eth[['favorite_count', 'created_at']]

#Now I'll replace with hour when the tweet was created
X['created_at'] = df_epfl['created_at'].map(lambda x: x.hour)
X2['created_at'] = df_eth['created_at'].map(lambda x: x.hour)

X = X.append(X2)

# let's keep some part of it for valdiation

validation_features = X[:validation_size]
validation_labels = Y[:validation_size]

train_features = X[validation_size:]
train_labels = Y[validation_size:]


model = LinearRegression()
model.fit(train_features.values, train_labels.values)

print('R2 score given by model is', model.score(validation_features.values, validation_labels.values))

#let's check the rmse error
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(validation_labels.values, y_predict))

print('rmse error is', rms)

#R2 is closer to 0 so the model is better now



# coding: utf-8

# In[ ]:

tweetId = [item['id'] for item in t1]
user_id = [item['user']['id'] for item in t1]
favorite = [item['favorite_count'] for item in t1]
date = [item['created_at'] for item in t1]
retweet_count = [item['retweet_count'] for item in t1]
place = [item['place'] for item in t1]
id_str = [item['id_str'] for item in t1]
# user = [item['retweeted_status'] for item in t1]
epfl_df = pd.DataFrame({'tweetId': tweetId , 'id_str':id_str,'user_id': user_id,'favorite': favorite,'date': date,'retweet_count': retweet_count,'place':place})



# coding: utf-8

# In[ ]:

tweetId = [item['id'] for item in t2]
user_id = [item['user']['id'] for item in t2]
favorite = [item['favorite_count'] for item in t2]
date = [item['created_at'] for item in t2]
retweet_count = [item['retweet_count'] for item in t2]
place = [item['place'] for item in t2]
id_str = [item['id_str'] for item in t2]
# user = [item['retweeted_status'] for item in t2]
eth_df = pd.DataFrame({'tweetId': tweetId , 'id_str':id_str,'user_id': user_id,'favorite': favorite,'date': date,'retweet_count': retweet_count,'place':place})



# coding: utf-8

# In[ ]:

df_f = df_big[df_big['Univ'] == 'EPFL']
df_g = df_big[df_big['Univ'] == 'ETHZ']

df_f_year = df_f[['year_tweet','favorite_count','retweet_count']].groupby(['year_tweet']).sum()
df_g_year = df_g[['year_tweet','favorite_count','retweet_count']].groupby(['year_tweet']).sum()

df_f_month = df_f[['month_num_tweet','favorite_count','retweet_count']].groupby(['month_num_tweet']).sum()
df_g_month = df_g[['month_num_tweet','favorite_count','retweet_count']].groupby(['month_num_tweet']).sum()

df_f_hour = df_f[['hour_tweet','favorite_count','retweet_count']].groupby(['hour_tweet']).sum()
df_g_hour = df_g[['hour_tweet','favorite_count','retweet_count']].groupby(['hour_tweet']).sum()



# coding: utf-8

# In[ ]:

EPFL_TWEETS = 'epfl_en.json'
ETH_TWEETS = 'eth_en.json'

epfl_data = open(EPFL_TWEETS).read()
epfl_data = json.loads(epfl_data)

eth_data = open(ETH_TWEETS).read()
eth_data = json.loads(eth_data)

len_epfl = len(epfl_data)
len_eth = len(eth_data)



# coding: utf-8

# In[ ]:

tweets_regression = tweets.copy()
all_tags = []
followers_count = []
friends_count = []
for index, row in tweets_regression.iterrows():
    tags = []
    for hashtags in row['entities']['hashtags']:
        tags.append(hashtags['text'])
    all_tags.append(tags)
    followers_count.append(row['user']['followers_count'])
    friends_count.append(row['user']['friends_count'])

#tweets_regression['tags'] = all_tags
#tweets_regression['friends_count'] = friends_count
#tweets_regression['followers_count'] = followers_count

columns_regression_drop = ['id_str', 'entities', 'user', 'created_at', 'text']
tweets_regression = tweets_regression.drop(columns_regression_drop, axis=1)

labels = tweets_regression.pop('retweet_count')

tweets_regression.head()



# coding: utf-8

# In[ ]:

#encode string
eth_df_needed_col['in_reply_to_screen_name'] = eth_df_needed_col['in_reply_to_screen_name'].fillna('Unknown')
epfl_df_needed_col['in_reply_to_screen_name'] = epfl_df_needed_col['in_reply_to_screen_name'].fillna('Unknown')


eth_df_needed_col['month'] = eth_df_needed_col['month'].fillna('Unknown')
epfl_df_needed_col['month'] = epfl_df_needed_col['month'].fillna('Unknown')


le = preprocessing.LabelEncoder()

le.fit(eth_df_needed_col['in_reply_to_screen_name'])
eth_df_needed_col['in_reply_to_screen_name'] = le.transform(eth_df_needed_col['in_reply_to_screen_name']) 

le.fit(eth_df_needed_col['month'])
eth_df_needed_col['month'] = le.transform(eth_df_needed_col['month'])



le.fit(epfl_df_needed_col['in_reply_to_screen_name'])
epfl_df_needed_col['in_reply_to_screen_name'] = le.transform(epfl_df_needed_col['in_reply_to_screen_name']) 

le.fit(epfl_df_needed_col['month'])
epfl_df_needed_col['month'] = le.transform(epfl_df_needed_col['month'])



# coding: utf-8

# In[ ]:

data_epfl_new = data_epfl.drop('retweet_count', axis=1)
data_epfl_new['Month']=data_epfl_new.created_at.dt.month
data_epfl_new['Year'] = data_epfl_new.created_at.dt.year
data_epfl_new['Houre'] = data_epfl_new.created_at.dt.hour
data_epfl_new = data_epfl_new.drop('created_at', axis=1)
data_epfl_new = data_epfl_new.drop('entities', axis=1)
data_epfl_new = data_epfl_new.drop('source', axis=1)
data_epfl_new = data_epfl_new.drop('text', axis=1)
f_count=[]
for i, row in data_epfl.iterrows():
    f_count.append(row["user"]["followers_count"])
data_epfl_new['followers_count'] = pd.Series(f_count)
data_epfl_new = data_epfl_new.drop('user', axis=1)
data_epfl_new = data_epfl_new.drop('hashtags', axis=1)
data_epfl_new = data_epfl_new.drop('id', axis=1)
data_epfl_new = data_epfl_new.drop('id_str', axis=1)
data_epfl_new.dropna(inplace=True)
data_epfl_new



# coding: utf-8

# In[ ]:

data_eth_new = data_epfl.drop('retweet_count', axis=1)
data_eth_new['Month']=data_eth_new.created_at.dt.month
data_eth_new['Year'] = data_eth_new.created_at.dt.year
data_eth_new['Houre'] = data_eth_new.created_at.dt.hour
data_eth_new = data_eth_new.drop('created_at', axis=1)
data_eth_new = data_eth_new.drop('entities', axis=1)
data_eth_new = data_eth_new.drop('source', axis=1)
data_eth_new = data_eth_new.drop('text', axis=1)
f_count=[]
for i, row in data_eth.iterrows():
    f_count.append(row["user"]["followers_count"])
data_eth_new['followers_count'] = pd.Series(f_count)
data_eth_new = data_eth_new.drop('user', axis=1)
data_eth_new = data_eth_new.drop('hashtags', axis=1)
data_eth_new = data_eth_new.drop('id', axis=1)
data_eth_new = data_eth_new.drop('id_str', axis=1)
data_eth_new.dropna(inplace=True)



# coding: utf-8

# In[ ]:

epfl['created_at_year'] = epfl['created_at'].apply(lambda x: x.year)
epfl['created_at_month'] = epfl['created_at'].apply(lambda x: x.month)
epfl['created_at_day'] = epfl['created_at'].apply(lambda x: x.day)
epfl['created_at_hour'] = epfl['created_at'].apply(lambda x: x.hour)

ethz['created_at_year'] = ethz['created_at'].apply(lambda x: x.year)
ethz['created_at_month'] = ethz['created_at'].apply(lambda x: x.month)
ethz['created_at_day'] = ethz['created_at'].apply(lambda x: x.day)
ethz['created_at_hour'] = ethz['created_at'].apply(lambda x: x.hour)



# coding: utf-8

# In[ ]:

p = figure(title='Favorites across years for EPFL and ETHZ',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Year'
p.yaxis[0].axis_label = 'Number of favorites'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_year,epfl_favorites,color='blue',size=8)
p.line(epfl_year,epfl_favorites,color='blue',legend='epfl')
p.circle(eth_year,eth_favorites,color='red',size=8)
p.line(eth_year,eth_favorites,color='red',legend='ethz')
show(p)



# coding: utf-8

# In[ ]:

p = figure(title='Retweets across years for EPFL and ETHZ',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Year'
p.yaxis[0].axis_label = 'Number of retweets'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_year,epfl_retweets,color='blue',size=8)
p.line(epfl_year,epfl_retweets,color='blue',legend='epfl')
p.circle(eth_year,eth_retweets,color='red',size=8)
p.line(eth_year,eth_retweets,color='red',legend='ethz')
show(p)



# coding: utf-8

# In[ ]:

p = figure(title='Favorites across months for EPFL and ETHZ (2016)',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Month'
p.yaxis[0].axis_label = 'Number of favorites'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_month,epfl_favorites,color='blue',size=8)
p.line(epfl_month,epfl_favorites,color='blue',legend='epfl')
p.circle(eth_month,eth_favorites,color='red',size=8)
p.line(eth_month,eth_favorites,color='red',legend='ethz')
show(p)



# coding: utf-8

# In[ ]:

p = figure(title='Retweets across months for EPFL and ETHZ (2016)',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Month'
p.yaxis[0].axis_label = 'Number of retweets'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_month,epfl_retweets,color='blue',size=8)
p.line(epfl_month,epfl_retweets,color='blue',legend='epfl')
p.circle(eth_month,eth_retweets,color='red',size=8)
p.line(eth_month,eth_retweets,color='red',legend='ethz')
show(p)



# coding: utf-8

# In[ ]:

p = figure(title='Favorites across hours of day for EPFL and ETHZ (2016)',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Hour'
p.yaxis[0].axis_label = 'Number of favorites'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_day,epfl_favorites,color='blue',size=8)
p.line(epfl_day,epfl_favorites,color='blue',legend='epfl')
p.circle(eth_day,eth_favorites,color='red',size=8)
p.line(eth_day,eth_favorites,color='red',legend='ethz')
show(p)



# coding: utf-8

# In[ ]:

p = figure(title='Retweets across hours of day for EPFL and ETHZ (2016)',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Hour'
p.yaxis[0].axis_label = 'Number of retweets'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_day,epfl_retweets,color='blue',size=8)
p.line(epfl_day,epfl_retweets,color='blue',legend='epfl')
p.circle(eth_day,eth_retweets,color='red',size=8)
p.line(eth_day,eth_retweets,color='red',legend='ethz')
show(p)



In [211]:

    
# Import Cells
for i, cell in enumerate(class_to_cells[5]):
    if i > 20:
        break
    print (cell)









    



# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import os

from sklearn.linear_model import LinearRegression
from sklearn import metrics

import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')



# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import word_tokenize

from string import punctuation
from nltk.corpus import stopwords
import nltk

import json
import seaborn as sns
sns.set_context('notebook')



# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import string
import re
import operator
from gensim import corpora, models, similarities
from gensim.models import hdpmodel, ldamodel
import nltk
from nltk.probability import FreqDist



# coding: utf-8

# In[ ]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas.io.json import json_normalize
import warnings

warnings.filterwarnings('ignore')

get_ipython().magic('matplotlib inline')

sns.set_style('darkgrid')
palette = sns.color_palette()



# coding: utf-8

# In[ ]:

import pandas as pd 
import json 
import warnings
warnings.filterwarnings('ignore')

get_ipython().magic('matplotlib inline')

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from functools import partial
from scipy.stats import skewtest
import numpy as np



# coding: utf-8

# In[ ]:

from os import path
from collections import Counter
#import pycountry
import nltk
import re # for removing numbers
from nltk.sentiment import SentimentIntensityAnalyzer
from gensim import models,corpora
get_ipython().magic('matplotlib inline')
import re # for removing numbers



# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
import requests
from bs4 import BeautifulSoup
import pickle
import scipy.stats as stats
import math
import folium
import json
from sklearn.ensemble import RandomForestClassifier
from  numpy.core.defchararray import split
from sklearn.preprocessing import robust_scale
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from wordcloud import WordCloud
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pycountry
import os
from operator import itemgetter
from gensim import corpora
from gensim.models import ldamodel
import networkx as nx
import community
get_ipython().magic('matplotlib inline')



# coding: utf-8

# In[ ]:

# Data pocessing
import pandas as pd
import numpy as np
from dateutil.parser import parse

# Machine Learning
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error

# Interactive viz
import folium
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import nltk.data
import pycountry
import random
import re
from nltk.tokenize import TweetTokenizer

from collections import Counter
from gensim import models, corpora
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from os import path
from os.path import exists
from PIL import Image


# Utils
import json
import itertools
import collections
import requests as rq
from bs4 import BeautifulSoup as bfs
import math
import scipy.stats as stats
from geopy.geocoders import GeoNames, Nominatim, GoogleV3
get_ipython().magic('matplotlib inline')



# coding: utf-8

# In[ ]:

# This might be useful
import pandas as pd

# Question 2 - Plotly graphs vizualisation
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
# Generate API Key to use Plotly | I trust you ! You can change with another one because the key is limited to a daily amount of vizualisations.
# Also, the following graphs are provided with a link with each graph online, you should be able to view it.
plotly.tools.set_credentials_file(username='Merinorus', api_key='cfLSx6xqd6yBJawneBfu')

# Question 3 - Machine learning with random forest classifier
from sklearn.ensemble import RandomForestClassifier 



# coding: utf-8

# In[ ]:

import pandas as pd
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
import seaborn as sns
import numpy as np
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.charts import BoxPlot, output_file, show
import scipy.stats as stats
import math
from bokeh.charts import Histogram, output_file, show
import networkx as nx



output_notebook()



# coding: utf-8

# In[ ]:

get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re    
from IPython.core.display import display # display(df) plutôt que print()
import seaborn as sns
import nltk
from nltk.tokenize import TweetTokenizer
sns.set_context('notebook')



# coding: utf-8

# In[ ]:

import nltk
import string

import re

import itertools
import gensim

import pyLDAvis.gensim as gensimvis
import pyLDAvis



# coding: utf-8

# In[ ]:

import contextlib
import sys
import os
import json
from collections import OrderedDict
import itertools
import pandas as pd
import numpy as np
get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error



# coding: utf-8

# In[ ]:

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import folium
import sklearn 
import scipy
import bokeh
import nltk



# coding: utf-8

# In[ ]:

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import stop_words
import string



# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
import warnings
warnings.filterwarnings("ignore")



# coding: utf-8

# In[ ]:

import pandas as pd
import json
import numpy as np
get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches



# coding: utf-8

# In[ ]:

import folium
import numpy as np
import pandas as pd
import scipy.stats as stats
import requests
import re
import matplotlib 
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
get_ipython().magic('load_ext autoreload')
get_ipython().magic('autoreload 1')
from bs4 import BeautifulSoup



# coding: utf-8

# In[ ]:

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import silhouette_score
from sklearn.cluster import MiniBatchKMeans, SpectralClustering
from sklearn.metrics import confusion_matrix
import seaborn as sns



# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from matplotlib.pyplot import show
import itertools
get_ipython().magic('matplotlib inline')
sns.set_context('notebook')
pd.options.mode.chained_assignment = None  # default='warn'



# coding: utf-8

# In[ ]:

import pandas as pd
import numpy as np
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import nltk.sentiment.util
import gensim
import json
get_ipython().magic('matplotlib inline')
get_ipython().magic('load_ext autoreload')
get_ipython().magic('autoreload 2')



In [ ]: