In [114]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
In [115]:
# Loading in the notebooks
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]
a = ASTFeatures(notebook_objs)
In [116]:
cells = a.get_list_segments()
cells[0]
Out[116]:
<nbminer.features.featurize.cell_features.SegmentFeatures at 0x12454e5f8>
In [123]:
itemsets = [cell.get_feature('nodes') for cell in cells]
print(itemsets[1])
print(len(itemsets))
[(<class '_ast.Import'>, 4), (<class '_ast.ImportFrom'>, 2), (<class '_ast.Expr'>, 1)]
9366
In [118]:
possible_keys = set()
for nodes in itemsets:
    for node in nodes:
        possible_keys.add(node[0])
classes = {}
class_lookup = {}
for i, key in enumerate(possible_keys):
    classes[key] = i
    class_lookup[i] = key
In [119]:
class_itemsets = []
for itemset in itemsets:
    temp = []
    for el in itemset:
        temp.append(classes[el[0]])
    class_itemsets.append(temp)
In [120]:
print(class_itemsets[1])
[10, 8, 4]
In [194]:
import pyfpgrowth
patterns = pyfpgrowth.find_frequent_patterns(class_itemsets, 10)
In [195]:
for pattern in patterns:
    if len(pattern) > 2:
        print([class_lookup[i] for i in pattern])
[<class '_ast.FunctionDef'>, <class '_ast.ImportFrom'>, <class '_ast.Import'>]
[<class '_ast.Assign'>, <class '_ast.ImportFrom'>, <class '_ast.Import'>]
[<class '_ast.Assign'>, <class '_ast.Expr'>, <class '_ast.ImportFrom'>, <class '_ast.Import'>]
[<class '_ast.Assign'>, <class '_ast.Expr'>, <class '_ast.Import'>]
[<class '_ast.Expr'>, <class '_ast.ImportFrom'>, <class '_ast.Import'>]
[<class '_ast.Assign'>, <class '_ast.For'>, <class '_ast.ImportFrom'>]
[<class '_ast.FunctionDef'>, <class '_ast.Assign'>, <class '_ast.ImportFrom'>]
[<class '_ast.Assign'>, <class '_ast.Expr'>, <class '_ast.ImportFrom'>]
[<class '_ast.Assign'>, <class '_ast.Expr'>, <class '_ast.For'>]
[<class '_ast.FunctionDef'>, <class '_ast.Assign'>, <class '_ast.Expr'>]
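pyfpgrowth can also turn these frequent itemsets into association rules. A minimal sketch (the 0.7 confidence threshold is an illustrative choice, not one used above):

rules = pyfpgrowth.generate_association_rules(patterns, 0.7)
# Each rule maps an antecedent itemset to a (consequent itemset, confidence) pair.
for antecedent, (consequent, confidence) in rules.items():
    print([class_lookup[i] for i in antecedent], '->',
          [class_lookup[i] for i in consequent], 'conf: %.2f' % confidence)

This would read, for example, as "cells containing an ImportFrom and an Import also tend to contain an Expr", with the confidence giving the conditional frequency.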
In [127]:
import numpy as np  # needed here: numpy is not imported until a later cell
second_itemsets = [np.unique(np.array(cell.get_feature('short_name_string'))) for cell in cells]
In [212]:
patterns = pyfpgrowth.find_frequent_patterns(second_itemsets, 20)
In [213]:
for pattern in patterns:
    if len(pattern) > 2:
        print(pattern)
('get_ipython', 'magic', 'set_context')
('fit', 'mean_squared_error', 'predict')
('arange', 'bar', 'show')
('axis', 'figure', 'show')
('axis', 'figure', 'imshow')
('axis', 'figure', 'imshow', 'show')
('axis', 'generate', 'join')
('WordCloud', 'axis', 'join')
('axis', 'imshow', 'join')
('WordCloud', 'axis', 'generate', 'join')
('axis', 'generate', 'imshow', 'join')
('WordCloud', 'axis', 'imshow', 'join')
('WordCloud', 'axis', 'generate', 'imshow', 'join')
('axis', 'generate', 'show')
('WordCloud', 'axis', 'show')
('axis', 'imshow', 'show')
('WordCloud', 'axis', 'generate', 'show')
('axis', 'generate', 'imshow', 'show')
('WordCloud', 'axis', 'imshow', 'show')
('WordCloud', 'axis', 'generate', 'imshow', 'show')
('WordCloud', 'axis', 'generate')
('axis', 'generate', 'imshow')
('WordCloud', 'axis', 'generate', 'imshow')
('WordCloud', 'axis', 'imshow')
('barplot', 'figure', 'groupby')
('RandomForestRegressor', 'fit', 'predict')
('figure', 'generate', 'imshow')
('WordCloud', 'figure', 'generate')
('WordCloud', 'figure', 'generate', 'imshow')
('generate', 'imshow', 'show')
('WordCloud', 'generate', 'show')
('WordCloud', 'generate', 'imshow', 'show')
('generate', 'imshow', 'join')
('WordCloud', 'generate', 'join')
('WordCloud', 'generate', 'imshow', 'join')
('WordCloud', 'generate', 'imshow')
('LinearRegression', 'fit', 'predict')
('cross_val_score', 'mean', 'print')
('WordCloud', 'imshow', 'join')
('figure', 'imshow', 'show')
('WordCloud', 'figure', 'imshow')
('WordCloud', 'imshow', 'show')
('legend', 'title', 'xticks')
('title', 'xticks', 'ylabel')
('range', 'show', 'xticks')
('bar', 'range', 'xticks')
('bar', 'title', 'xticks')
('bar', 'show', 'title', 'xticks')
('bar', 'show', 'xticks')
('show', 'title', 'xticks')
('fit', 'print', 'train_test_split')
('fit', 'predict', 'train_test_split')
('plot', 'set_title', 'sum')
('set_title', 'set_ylabel', 'sum')
('groupby', 'set_title', 'sum')
('groupby', 'set_title', 'set_xlabel')
('groupby', 'set_title', 'set_xlabel', 'set_ylabel')
('groupby', 'plot', 'set_title', 'set_xlabel')
('groupby', 'plot', 'set_title', 'set_xlabel', 'set_ylabel')
('groupby', 'plot', 'set_title')
('groupby', 'set_title', 'set_ylabel')
('groupby', 'plot', 'set_title', 'set_ylabel')
('set_title', 'set_ylabel', 'subplots')
('plot', 'set_title', 'set_xlabel')
('plot', 'set_title', 'set_ylabel')
('plot', 'set_title', 'set_xlabel', 'set_ylabel')
('set_title', 'set_xlabel', 'set_ylabel')
('LdaModel', 'doc2bow', 'print_topics')
('Dictionary', 'doc2bow', 'print_topics')
('Dictionary', 'LdaModel', 'print_topics')
('Dictionary', 'LdaModel', 'doc2bow', 'print_topics')
('agg', 'groupby', 'plot')
('groupby', 'reset_index', 'sum')
('groupby', 'plot', 'subplots')
('set_xlabel', 'set_ylabel', 'subplots')
('set_xlabel', 'set_ylabel', 'show')
('groupby', 'plot', 'set_ylabel')
('groupby', 'set_xlabel', 'set_ylabel')
('groupby', 'plot', 'set_xlabel', 'set_ylabel')
('plot', 'set_xlabel', 'set_ylabel')
('plot', 'set_xlabel', 'sum')
('groupby', 'set_xlabel', 'sum')
('groupby', 'plot', 'set_xlabel', 'sum')
('groupby', 'plot', 'set_xlabel')
('Dictionary', 'LdaModel', 'lower')
('LdaModel', 'doc2bow', 'lower')
('Dictionary', 'LdaModel', 'doc2bow', 'lower')
('Dictionary', 'doc2bow', 'lower')
('count', 'groupby', 'plot')
('Dictionary', 'LdaModel', 'len')
('LdaModel', 'doc2bow', 'len')
('Dictionary', 'doc2bow', 'len')
('Dictionary', 'LdaModel', 'doc2bow', 'len')
('Dictionary', 'LdaModel', 'doc2bow')
('fit', 'mean', 'predict')
('fit', 'mean', 'predict', 'print')
('mean', 'predict', 'print')
('fit', 'predict', 'print')
('groupby', 'sum', 'xlabel')
('groupby', 'show', 'xlabel')
('groupby', 'plot', 'xlabel')
('groupby', 'plot', 'show', 'xlabel')
('sum', 'title', 'xlabel')
('sum', 'title', 'xlabel', 'ylabel')
('show', 'sum', 'xlabel', 'ylabel')
('plot', 'show', 'sum', 'xlabel')
('plot', 'sum', 'xlabel')
('plot', 'sum', 'xlabel', 'ylabel')
('sum', 'xlabel', 'ylabel')
('figure', 'legend', 'xlabel')
('figure', 'legend', 'xlabel', 'ylabel')
('figure', 'title', 'xlabel')
('figure', 'title', 'xlabel', 'ylabel')
('figure', 'plot', 'xlabel')
('figure', 'plot', 'xlabel', 'ylabel')
('figure', 'xlabel', 'ylabel')
('legend', 'title', 'xlabel')
('legend', 'plot', 'title', 'xlabel')
('legend', 'title', 'xlabel', 'ylabel')
('legend', 'plot', 'title', 'xlabel', 'ylabel')
('legend', 'show', 'xlabel')
('legend', 'plot', 'show', 'xlabel')
('legend', 'plot', 'xlabel')
('legend', 'plot', 'xlabel', 'ylabel')
('legend', 'xlabel', 'ylabel')
('show', 'title', 'xlabel')
('show', 'title', 'xlabel', 'ylabel')
('plot', 'show', 'title', 'xlabel')
('plot', 'show', 'title', 'xlabel', 'ylabel')
('show', 'xlabel', 'ylabel')
('plot', 'show', 'xlabel', 'ylabel')
('plot', 'show', 'xlabel')
('plot', 'title', 'xlabel')
('title', 'xlabel', 'ylabel')
('plot', 'title', 'xlabel', 'ylabel')
('plot', 'xlabel', 'ylabel')
('groupby', 'legend', 'plot')
('figure', 'legend', 'ylabel')
('legend', 'show', 'ylabel')
('legend', 'show', 'title')
('legend', 'plot', 'show')
('legend', 'title', 'ylabel')
('legend', 'plot', 'title', 'ylabel')
('legend', 'plot', 'ylabel')
('legend', 'plot', 'title')
('figure', 'groupby', 'title')
('figure', 'groupby', 'sum')
('figure', 'title', 'ylabel')
('figure', 'plot', 'ylabel')
('figure', 'sum', 'title')
('figure', 'show', 'title')
('groupby', 'plot', 'ylabel')
('groupby', 'title', 'ylabel')
('show', 'sum', 'ylabel')
('show', 'sum', 'title', 'ylabel')
('plot', 'sum', 'ylabel')
('plot', 'show', 'sum', 'ylabel')
('sum', 'title', 'ylabel')
('show', 'title', 'ylabel')
('plot', 'show', 'title', 'ylabel')
('plot', 'show', 'ylabel')
('plot', 'title', 'ylabel')
('fit', 'mean', 'print')
('bar', 'range', 'show')
('bar', 'show', 'title')
('isnull', 'print', 'sum')
('groupby', 'plot', 'show', 'title')
('groupby', 'show', 'sum', 'title')
('groupby', 'plot', 'show', 'sum', 'title')
('groupby', 'plot', 'title')
('groupby', 'plot', 'sum', 'title')
('groupby', 'sum', 'title')
('plot', 'sum', 'title')
('plot', 'show', 'sum', 'title')
('show', 'sum', 'title')
('plot', 'show', 'title')
('append', 'len', 'range')
('groupby', 'show', 'sum')
('groupby', 'plot', 'show', 'sum')
('groupby', 'plot', 'show')
('plot', 'show', 'sum')
('apply', 'groupby', 'sum')
('groupby', 'head', 'sum')
('groupby', 'plot', 'sum')
In [139]:
def cell_to_vec(cell, classes):
    cell_vec = [0] * len(classes)
    for node in cell:
        cell_vec[classes[node[0]]] = node[1]
    return cell_vec
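As a quick sanity check, the itemset printed earlier, [(Import, 4), (ImportFrom, 2), (Expr, 1)], should map to a 12-dimensional count vector with 4, 2, and 1 in whatever slots classes assigned to those node types, and zeros elsewhere:

# Worked example: vectorize the itemset shown in the earlier output.
print(cell_to_vec(itemsets[1], classes))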
In [140]:
import numpy as np
cell_vecs = []
for cell in itemsets:
    cv = np.array(cell_to_vec(cell, classes))
    cell_vecs.append(cv)
cell_vecs = np.array(cell_vecs)
print(cell_vecs.shape)
(9366, 12)
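A design note: KMeans is run below on raw node counts, so long cells dominate the Euclidean distances. Standardizing the columns first is a common alternative worth comparing; a minimal sketch (not what was run here):

from sklearn.preprocessing import StandardScaler
# Rescale each AST-node-count column to zero mean and unit variance before clustering.
scaled_vecs = StandardScaler().fit_transform(cell_vecs)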
In [180]:
# Cluster the cell vectors with KMeans and inspect the centers
from sklearn.cluster import KMeans
nclust = 6
kmeans = KMeans(n_clusters=nclust, random_state=0).fit(cell_vecs)
np.set_printoptions(precision=4)
#kmeans.cluster_centers_
for cluster in kmeans.cluster_centers_:
    print("\n\nCLUSTER")
    for i in range(len(cluster)):
        print(class_lookup[i], ': %.2f' % cluster[i])
CLUSTER
<class '_ast.If'> : 0.00
<class '_ast.ClassDef'> : -0.00
<class '_ast.FunctionDef'> : 0.08
<class '_ast.Assign'> : 0.22
<class '_ast.Expr'> : 0.58
<class '_ast.Try'> : 0.00
<class '_ast.Delete'> : 0.03
<class '_ast.For'> : 0.04
<class '_ast.ImportFrom'> : 0.06
<class '_ast.With'> : 0.00
<class '_ast.Import'> : 0.05
<class '_ast.AugAssign'> : 0.00
CLUSTER
<class '_ast.If'> : -0.00
<class '_ast.ClassDef'> : -0.00
<class '_ast.FunctionDef'> : 0.05
<class '_ast.Assign'> : 2.92
<class '_ast.Expr'> : 0.45
<class '_ast.Try'> : -0.00
<class '_ast.Delete'> : 0.02
<class '_ast.For'> : 0.11
<class '_ast.ImportFrom'> : 0.05
<class '_ast.With'> : 0.00
<class '_ast.Import'> : 0.02
<class '_ast.AugAssign'> : 0.00
CLUSTER
<class '_ast.If'> : -0.00
<class '_ast.ClassDef'> : -0.00
<class '_ast.FunctionDef'> : 0.26
<class '_ast.Assign'> : 3.30
<class '_ast.Expr'> : 13.80
<class '_ast.Try'> : -0.00
<class '_ast.Delete'> : 0.00
<class '_ast.For'> : 0.20
<class '_ast.ImportFrom'> : 0.02
<class '_ast.With'> : -0.00
<class '_ast.Import'> : 0.11
<class '_ast.AugAssign'> : 0.13
CLUSTER
<class '_ast.If'> : 0.00
<class '_ast.ClassDef'> : 0.00
<class '_ast.FunctionDef'> : 0.06
<class '_ast.Assign'> : 0.90
<class '_ast.Expr'> : 4.52
<class '_ast.Try'> : 0.00
<class '_ast.Delete'> : 0.00
<class '_ast.For'> : 0.03
<class '_ast.ImportFrom'> : 0.02
<class '_ast.With'> : 0.00
<class '_ast.Import'> : 0.04
<class '_ast.AugAssign'> : 0.00
CLUSTER
<class '_ast.If'> : -0.00
<class '_ast.ClassDef'> : -0.00
<class '_ast.FunctionDef'> : 0.04
<class '_ast.Assign'> : 9.66
<class '_ast.Expr'> : 1.94
<class '_ast.Try'> : -0.00
<class '_ast.Delete'> : 0.17
<class '_ast.For'> : 0.24
<class '_ast.ImportFrom'> : 0.14
<class '_ast.With'> : -0.00
<class '_ast.Import'> : 0.01
<class '_ast.AugAssign'> : 0.00
CLUSTER
<class '_ast.If'> : -0.00
<class '_ast.ClassDef'> : 0.01
<class '_ast.FunctionDef'> : 0.05
<class '_ast.Assign'> : 0.14
<class '_ast.Expr'> : 1.35
<class '_ast.Try'> : -0.00
<class '_ast.Delete'> : -0.00
<class '_ast.For'> : -0.00
<class '_ast.ImportFrom'> : 3.92
<class '_ast.With'> : 0.02
<class '_ast.Import'> : 5.84
<class '_ast.AugAssign'> : 0.00
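The choice of nclust = 6 is fixed by hand. A quick way to sanity-check it is to sweep the number of clusters and look for an elbow in the inertia curve; a minimal sketch:

# Within-cluster sum of squares for a range of cluster counts;
# a flattening of this curve suggests a reasonable choice of nclust.
inertias = [KMeans(n_clusters=k, random_state=0).fit(cell_vecs).inertia_ for k in range(2, 13)]
print(inertias)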
In [200]:
from helper_classes.cond_computer import CondComputer
class_to_cells = {}
node_list = []
for i, nb in enumerate(a.nb_features):
    node_list.append('start')
    for cell in nb.get_all_cells():
        t = cell_to_vec(cell.get_feature('nodes'), classes)
        t = kmeans.predict([t])[0]
        node_list.append(t)
        if t not in class_to_cells:
            class_to_cells[t] = []
        class_to_cells[t].append(cell.get_feature('code'))
    node_list.append('end')
cc = CondComputer(node_list)
keys = ['start']
for i in range(nclust):
    keys.append(i)
keys.append('end')
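CondComputer is a local helper, so its internals aren't shown here. Assuming compute_probabilities estimates first-order transition probabilities over node_list (an assumption, not the helper's actual code), the core computation would look roughly like:

from collections import Counter, defaultdict
# Count label-to-label transitions along the concatenated notebook sequences.
# Note this also counts the artificial 'end' -> 'start' hops between notebooks.
counts = defaultdict(Counter)
for prev, nxt in zip(node_list, node_list[1:]):
    counts[prev][nxt] += 1
# Normalize each row into P(next cluster | current cluster).
trans_probs = {prev: {nxt: n / sum(c.values()) for nxt, n in c.items()}
               for prev, c in counts.items()}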
In [182]:
arr, arr_names = cc.compute_probabilities(cc.count_totals, 0.01, keys=keys)
In [183]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['figure.figsize'] = (20, 10)
cc.plot_bar(arr, arr_names, 'Probability per Node type')
In [184]:
cc.plot_conditional_bar(arr, arr_names, 0, 'Probability per Node type', x_labels=keys)
start
In [185]:
cc.plot_conditional_bar(arr, arr_names, 1, 'Probability per Node type')
0
In [186]:
cc.plot_conditional_bar(arr, arr_names, 2, 'Probability per Node type')
1
In [187]:
cc.plot_conditional_bar(arr, arr_names, 3, 'Probability per Node type')
3
In [188]:
cc.plot_conditional_bar(arr, arr_names, 4, 'Probability per Node type')
4
In [189]:
cc.plot_conditional_bar(arr, arr_names, 5, 'Probability per Node type')
5
In [205]:
# Comment cells or short one-liners
for i, cell in enumerate(class_to_cells[0]):
    if i > 20:
        break
    print(cell)
# Warm up:
# coding: utf-8
# In[ ]:
df_epfl.head(2)
# coding: utf-8
# In[ ]:
df_epfl.size
# coding: utf-8
# In[ ]:
df_epfl_dw.size
# Data Wrangling:
# coding: utf-8
# In[ ]:
# First check for NaN values
df_epfl.isnull().sum()
# coding: utf-8
# In[ ]:
# Machine Learning :
# coding: utf-8
# In[ ]:
get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None # default='warn'
import warnings
warnings.filterwarnings('ignore')
# coding: utf-8
# In[ ]:
type(t1)
# coding: utf-8
# In[ ]:
t1[0]
# coding: utf-8
# In[ ]:
# FOR EPFL FIRST select only the values that we need
# coding: utf-8
# In[ ]:
epfl_df
# coding: utf-8
# In[ ]:
# ETH Whole Dataframe
# coding: utf-8
# In[ ]:
# I have already done the first part of data wrangling by reducing the dataframe to only the columns i possibly need
# coding: utf-8
# In[ ]:
epfl_df.head()
# coding: utf-8
# In[ ]:
small_epfl = epfl_df[epfl_df['tweetId'] % 10 == 3]
# coding: utf-8
# In[ ]:
small_epfl.shape
# coding: utf-8
# In[ ]:
small_epfl.columns
In [207]:
# Short lists of (mainly) assignments
for i, cell in enumerate(class_to_cells[1]):
    if i > 20:
        break
    print(cell)
# coding: utf-8
# In[ ]:
#Load the data in pandas Dataframe
df_epfl = pd.read_json("epfl_en.json")
df_eth = pd.read_json("eth_en.json")
# coding: utf-8
# In[ ]:
#Build the two down sampled Dataframe
df_epfl_dw = df_epfl[df_epfl['id'] % 10 == 2]
df_eth_dw = df_eth[df_eth['id'] % 10 == 2]
# coding: utf-8
# In[ ]:
#It's hard to tell which null values will affect future work at this point, but it seems like the id field is
# never null which is good.
#The other thing that we shall do is to use a proper indexing
df_epfl.index = df_epfl['id']
df_epfl_dw.index = df_epfl_dw['id']
df_eth.index = df_eth['id']
df_eth_dw.index = df_eth_dw['id']
#Also some fields are always null so we can drop them but it doesn't affect future works at this moment.
# coding: utf-8
# In[ ]:
df_epfl['user'] #is a dictionnary we want the user's id
#let's build a df with only the tweeter user's id
#df_epfl['user'].iloc[0]['id']
epfl_user = pd.DataFrame()
for i in range(0, df_epfl.shape[0]):
    epfl_user = epfl_user.append([df_epfl['user'].iloc[i]['id']])
#I got tricked and there is only one user as said on exam
unique = epfl_user.drop_duplicates()
# coding: utf-8
# In[ ]:
#lets check the id in eth:
eth_user = pd.DataFrame()
for i in range(0, df_eth.shape[0]):
    eth_user = eth_user.append([df_eth['user'].iloc[i]['id']])
#I got tricked and there is only one user tweeting as said on exam
unique2 = eth_user.drop_duplicates()
unique2
# coding: utf-8
# In[ ]:
# First let's count number of overall favorite and retweets
favorites_epfl = df_epfl['retweet_count'].sum()
retweet_epfl = df_epfl['favorite_count'].sum()
favorites_eth = df_eth['retweet_count'].sum()
retweet_eth = df_eth['favorite_count'].sum()
#Let's plot this
plt.figure();
df_show = pd.DataFrame(data=[['EPFL', favorites_epfl+retweet_epfl], ['ETH', favorites_eth + retweet_eth]], columns=['Uni', 'Number of Tweet + Likes'], index=['EPFL', 'ETH'])
df_show.plot(kind='bar')
print(favorites_epfl + retweet_epfl)
#It appears EPFL is more present in the twitter game
#We could also have used a pie chart to show this data effectively
# coding: utf-8
# In[ ]:
# Let's groupby hashtag now
#let's add a column with only the hashtag from the entities column
df_epfl_hashtag = df_epfl
df_hashtags = pd.DataFrame()
for i in range(0, df_epfl_hashtag.shape[0]):
    hashtag_i = df_epfl_hashtag['entities'].iloc[i]['hashtags']
    if (hashtag_i != ''):
        hashtags = hashtag_i[0]['text']
        df_hashtags['hashtags'] = df_hashtags.append([hashtags])
    else:
        df_hashtags['hashtags'] = df_hashtags.append(['No_tag'])
#df_epfl_hashtag
#df_epfl_hashtag = df_epfl.groupby(df_epfl['hashtag']).sum()
# Will fix later if TIME
# coding: utf-8
# In[ ]:
t1 = pd.read_json('epfl_en.json', typ='dataframe')
t2 = pd.read_json('eth_en.json', typ='dataframe')
# coding: utf-8
# In[ ]:
columns_with_nan = pd.isnull(small_epfl).sum() > 0
columns_with_nan = list(columns_with_nan[columns_with_nan].index)
print(columns_with_nan)
# coding: utf-8
# In[ ]:
small_epfl['id_str'] = small_epfl['id_str'].astype('category')
cat_cols = small_epfl.select_dtypes(['category']).columns
small_epfl[cat_cols] = small_epfl[cat_cols].apply(lambda col : col.cat.codes)
# coding: utf-8
# In[ ]:
small_eth['id_str'] = small_eth['id_str'].astype('category')
cat_cols = small_eth.select_dtypes(['category']).columns
small_eth[cat_cols] = small_eth[cat_cols].apply(lambda col : col.cat.codes)
# coding: utf-8
# In[ ]:
columns_with_nan = pd.isnull(small_eth).sum() > 0
columns_with_nan = list(columns_with_nan[columns_with_nan].index)
print(columns_with_nan)
# coding: utf-8
# In[ ]:
list_of_results = []
neighbors = range(2,50)
for n in neighbors:
    # define cv object
    cv = cross_validation_KFold(X.shape[0], shuffle = True, n_folds=10, random_state=4)
    # initialize classifier
    neigh = KNeighborsRegressor(n_neighbors=n)
    # estimate classifier using CV
    avg_test_accuracy = np.mean(cross_val_score(neigh, X, y, cv=cv))
    list_of_results.append(avg_test_accuracy)
    print(avg_test_accuracy)
# coding: utf-8
# In[ ]:
max_test_accuracy = np.max(list_of_results)
max_pos = list_of_results.index(max_test_accuracy)
opt_n = neighbors[max_pos]
opt_n
# coding: utf-8
# In[ ]:
X = small_eth[['favorite','place','id_str']]
y = small_eth['retweet_count']
list_of_results = []
neighbors = range(2,50)
for n in neighbors:
    # define cv object
    cv = cross_validation_KFold(X.shape[0], shuffle = True, n_folds=10, random_state=4)
    # initialize classifier
    neigh = KNeighborsRegressor(n_neighbors=n)
    # estimate classifier using CV
    avg_test_accuracy = np.mean(cross_val_score(neigh, X, y, cv=cv))
    list_of_results.append(avg_test_accuracy)
    print(avg_test_accuracy)
# coding: utf-8
# In[ ]:
max_test_accuracy = np.max(list_of_results)
max_pos = list_of_results.index(max_test_accuracy)
opt_n = neighbors[max_pos]
opt_n
# coding: utf-8
# In[ ]:
epfl_en['year'] = [data.year for data in epfl_en.created_at]
epfl_en['month'] = [data.month for data in epfl_en.created_at]
epfl_en['hour'] = [data.hour for data in epfl_en.created_at]
# coding: utf-8
# In[ ]:
eth_en['year'] = [data.year for data in eth_en.created_at]
eth_en['month'] = [data.month for data in eth_en.created_at]
eth_en['hour'] = [data.hour for data in eth_en.created_at]
# coding: utf-8
# In[ ]:
num_topics = 5
num_top_words = 7
# coding: utf-8
# In[ ]:
mod_tokens = mod_text.apply(lambda tweet: tokenizer(tweet))
m_lower_case = mod_tokens.apply(lambda row : to_lower_case(row))
m_filtered_tokens = lower_case.apply(lambda row: filter_tokens(row))
m_lemmas = m_filtered_tokens.apply(lambda row: lemmantizer(row))
# coding: utf-8
# In[ ]:
dictionary = corpora.Dictionary(m_lemmas)
m_corpus = get_corpus(m_lemmas)
In [208]:
# Long lists of (mainly) function calls. A lot of plotting happening here
for i, cell in enumerate(class_to_cells[2]):
    if i > 20:
        break
    print(cell)
# coding: utf-8
# In[ ]:
print('Favorites per year:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Favorites')
plt.legend()
plt.show()
print('Retweets per year:')
plt.plot(group_by_and_aggregate(epfl[['retweet_count', 'created_at']], lambda dt: dt.year), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['retweet_count', 'created_at']], lambda dt: dt.year), color='b', label='ETHZ')
plt.xlabel('Year')
plt.ylabel('Retweets')
plt.legend()
plt.show()
# coding: utf-8
# In[ ]:
epflRetCountYear={}
epflRetCountMonth={}
epflRetCountHour={}
epflRetMeanYear=0
epflRetMeanMonth=0
epflRetMeanHour=0
for year in sorted(epfl_en.year.unique()):
    epflRetCountYear[year]=epfl_en[epfl_en['year']==year]['retweet_count'].sum()
    epflRetMeanYear+=epflRetCountYear[year]
for month in sorted(epfl_en.month.unique()):
    epflRetCountMonth[month]=epfl_en[epfl_en['month']==month]['retweet_count'].sum()
    epflRetMeanMonth+=epflRetCountMonth[month]
for hour in sorted(epfl_en.hour.unique()):
    epflRetCountHour[hour]=epfl_en[epfl_en['hour']==hour]['retweet_count'].sum()
    epflRetMeanHour+=epflRetCountHour[hour]
epflRetMeanHour/=len(epfl_en.hour.unique())
epflRetMeanYear/=len(epfl_en.year.unique())
epflRetMeanMonth/=len(epfl_en.month.unique())
# Year Plot Count
ind = np.arange(len(epfl_en.year.unique()))
# # width = 0.65
fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(ind, epflRetCountYear.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.year.unique()),rotation='vertical')
ax.axhline(epflRetMeanYear,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','year'],loc='best')
plt.title('retweets per year')
plt.tight_layout()
plt.show()
# Year Month Count
ind = np.arange(len(epfl_en.month.unique()))
# # width = 0.65
fig = plt.figure()
ax = fig.add_subplot(122)
ax.bar(ind, epflRetCountMonth.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.month.unique()),rotation='vertical')
ax.axhline(epflRetMeanMonth,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','month'],loc='best')
plt.title('retweets per month')
plt.tight_layout()
plt.show()
# # Year Hour Count
ind = np.arange(len(epfl_en.hour.unique()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(133)
ax.bar(ind, epflRetCountHour.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.hour.unique()),rotation='vertical')
ax.axhline(epflRetMeanHour,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','hour'],loc='best')
plt.title('retweets per hour')
plt.tight_layout()
plt.show()
# coding: utf-8
# In[ ]:
epflFavCountYear={}
epflFavCountMonth={}
epflFavCountHour={}
epflFavMeanYear=0
epflFavMeanMonth=0
epflFavMeanHour=0
for year in sorted(epfl_en.year.unique()):
    epflFavCountYear[year]=epfl_en[epfl_en['year']==year]['favorite_count'].sum()
    epflFavMeanYear+=epflFavCountYear[year]
for month in sorted(epfl_en.month.unique()):
    epflFavCountMonth[month]=epfl_en[epfl_en['month']==month]['favorite_count'].sum()
    epflFavMeanMonth+=epflFavCountMonth[month]
for hour in sorted(epfl_en.hour.unique()):
    epflFavCountHour[hour]=epfl_en[epfl_en['hour']==hour]['favorite_count'].sum()
    epflFavMeanHour+=epflFavCountHour[hour]
epflFavMeanHour/=len(epfl_en.hour.unique())
epflFavMeanYear/=len(epfl_en.year.unique())
epflFavMeanMonth/=len(epfl_en.month.unique())
# Year Plot Count
ind = np.arange(len(epfl_en.year.unique()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(144)
ax.bar(ind, epflFavCountYear.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.year.unique()),rotation='vertical')
ax.axhline(epflFavMeanYear,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','year'],loc='best')
plt.title('favorites per year')
plt.tight_layout()
plt.show()
# Year Month Count
ind = np.arange(len(epfl_en.month.unique()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(155)
ax.bar(ind, epflFavCountMonth.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.month.unique()))
ax.axhline(epflFavMeanMonth,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','month'],loc='best')
plt.title('favorites per month')
plt.tight_layout()
plt.show()
# # Year Hour Count
ind = np.arange(len(epfl_en.hour.unique()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(166)
ax.bar(ind, epflFavCountHour.values(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.hour.unique()))
ax.axhline(epflFavMeanHour,color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','hour'],loc='best')
plt.title('favorites per hour')
plt.tight_layout()
plt.show()
# epfl_en.hour.unique()
# coding: utf-8
# In[ ]:
f, axarr = plt.subplots(3,2, figsize=(10,10), sharey=True)
axarr[0,0].plot(epfl_by_year)
axarr[0,0].set_title('EPFL Favorites and retweets by year')
axarr[0,0].legend(['favorite_count','retweet_count'], loc='best')
axarr[0,1].plot(ethz_by_year)
axarr[0,1].set_title('ETHZ Favorites and retweets by year')
axarr[0,1].legend(['favorite_count','retweet_count'], loc='best')
axarr[1,0].plot(epfl_by_month)
axarr[1,0].set_title('EPFL Favorites and retweets by month')
axarr[1,0].legend(['favorite_count','retweet_count'], loc='best')
axarr[1,1].plot(ethz_by_month)
axarr[1,1].set_title('ETHZ Favorites and retweets by month')
axarr[1,1].legend(['favorite_count','retweet_count'], loc='best')
axarr[2,0].plot(epfl_by_hour)
axarr[2,0].set_title('EPFL Favorites and retweets by hour')
axarr[2,0].legend(['favorite_count','retweet_count'], loc='best')
axarr[2,1].plot(ethz_by_hour)
axarr[2,1].set_title('ETHZ Favorites and retweets by hour')
axarr[2,1].legend(['favorite_count','retweet_count'], loc='best')
f.subplots_adjust(hspace=0.5, wspace=1)
# coding: utf-8
# In[ ]:
# year
import matplotlib.pyplot as plt
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
data_retweet = []
data_favourite = []
for year in reversed(df_epfl.year.unique()):
    data_retweet.append([list_epfl_retweet['year'][year], list_eth_retweet['year'][year]])
    data_favourite.append([list_epfl_favourite['year'][year], list_eth_favourite['year'][year]])
pd.DataFrame(data_retweet, columns=['epfl','eth'], index=reversed(df_epfl.year.unique())).plot(kind='bar', ax = ax1)
pd.DataFrame(data_favourite, columns=['epfl','eth'], index=reversed(df_epfl.year.unique())).plot(kind='bar', ax = ax2)
ax1.set_title("Nb retweets vs year")
ax1.set_xlabel("Nb retweets")
ax1.set_ylabel("Year")
ax2.set_title("Nb favourites vs year")
ax2.set_xlabel("Nb retweets")
ax2.set_ylabel("Year")
plt.show()
# coding: utf-8
# In[ ]:
# month
import matplotlib.pyplot as plt
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
data_retweet = []
data_favourite = []
for month in reversed(df_epfl.month.unique()):
    data_retweet.append([list_epfl_retweet['month'][month], list_eth_retweet['month'][month]])
    data_favourite.append([list_epfl_favourite['month'][month], list_eth_favourite['month'][month]])
pd.DataFrame(data_retweet, columns=['epfl','eth'], index=reversed(df_epfl.month.unique())).plot(kind='bar', ax = ax1)
pd.DataFrame(data_favourite, columns=['epfl','eth'], index=reversed(df_epfl.month.unique())).plot(kind='bar', ax = ax2)
ax1.set_title("Nb retweets vs months")
ax1.set_xlabel("Nb retweets")
ax1.set_ylabel("Month")
ax2.set_title("Nb favourites vs months")
ax2.set_xlabel("Nb retweets")
ax2.set_ylabel("Month")
plt.show()
# coding: utf-8
# In[ ]:
# hour
import matplotlib.pyplot as plt
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
data_retweet = []
data_favourite = []
for hour in reversed(df_epfl.hour.unique()):
    data_retweet.append([list_epfl_retweet['hour'][hour], list_eth_retweet['hour'][hour]])
    data_favourite.append([list_epfl_favourite['hour'][hour], list_eth_favourite['hour'][hour]])
pd.DataFrame(data_retweet, columns=['epfl','eth'], index=reversed(df_epfl.hour.unique())).plot(kind='bar', ax = ax1)
pd.DataFrame(data_favourite, columns=['epfl','eth'], index=reversed(df_epfl.hour.unique())).plot(kind='bar', ax = ax2)
ax1.set_title("Nb retweets vs hours")
ax1.set_xlabel("Nb retweets")
ax1.set_ylabel("Hour")
ax2.set_title("Nb favourites vs hours")
ax2.set_xlabel("Nb retweets")
ax2.set_ylabel("Hour")
plt.show()
# coding: utf-8
# In[ ]:
stopwordlist = stopwords.words(['english', 'french'])
stopwordlist.append("—")
stopwordlist.append("•")
stopwordlist.append("--")
stopwordlist.append("-")
stopwordlist.append("A")
stopwordlist.append("The")
stopwordlist.append("&")
stopwordlist.append("...")
stopwordlist.append("–")
stopwordlist.append("...\"")
stopwordlist.append("&")
stopwordlist.append("\"\"The")
stopwordlist.append("I")
stopwordlist.append("via")
stopwordlist.append("(via")
stopwordlist.append("We")
stopwordlist.append(":)")
stopwordlist.append("|")
stopwordlist.append("!")
# coding: utf-8
# In[ ]:
fig1 = plt.figure()
sub = fig1.add_subplot(111)
x = df_eth_year['year']
y1 = df_eth_year['Total Favorites']
y2 = df_eth_year['Total Retweets']
y3 = df_epfl_year['Total Favorites']
y4 = df_epfl_year['Total Retweets']
sub.plot(x,y1,c='b',label='ETH Favorites')
sub.plot(x,y2,c='r', label='ETH Retweets')
sub.plot(x,y3,c='g', label='EPFL Favorites')
sub.plot(x,y4,c='y', label='EPFL Retweets')
plt.legend(loc='upper left')
plt.ylabel('Total')
plt.xlabel('Year')
plt.title('Yearly Growth of Retweets and Favorites at EPFL and ETH')
plt.show()
# coding: utf-8
# In[ ]:
#plot monthly trends
fig2 = plt.figure()
sub2 = fig2.add_subplot(211)
x = df_eth_month['month']
y1 = df_eth_month['Monthly Favorites']
y2 = df_eth_month['Monthly Retweets']
y3 = df_epfl_month['Monthly Favorites']
y4 = df_epfl_month['Monthly Retweets']
sub2.plot(x,y1,'bo',label='ETH Favorites')
sub2.plot(x,y2,'ro', label='ETH Retweets')
sub2.plot(x,y3,'go', label='EPFL Favorites')
sub2.plot(x,y4,'yo', label='EPFL Retweets')
sub2.legend(bbox_to_anchor=(1.5, 1.5))
plt.ylabel('Total')
plt.xlabel('Month')
plt.title('Monthly Trends of Retweets and Favorites at EPFL and ETH')
plt.show()
# coding: utf-8
# In[ ]:
#plot monthly trends with adjusted axis
fig3 = plt.figure()
sub3 = fig3.add_subplot(211)
x = df_eth_month['month']
y1 = df_eth_month['Monthly Favorites']
y2 = df_eth_month['Monthly Retweets']
y3 = df_epfl_month['Monthly Favorites']
y4 = df_epfl_month['Monthly Retweets']
sub3.plot(x,y1,'bo',label='ETH Favorites')
sub3.plot(x,y2,'ro', label='ETH Retweets')
sub3.plot(x,y3,'go', label='EPFL Favorites')
sub3.plot(x,y4,'yo', label='EPFL Retweets')
plt.axis([0, 13, 0, 2500])
sub3.legend(bbox_to_anchor=(1.5, 1.5))
plt.ylabel('Total')
plt.xlabel('Month')
plt.title('Monthly Trends of Retweets and Favorites at EPFL and ETH')
plt.show()
# coding: utf-8
# In[ ]:
#plot hourly trends
fig4 = plt.figure()
sub4 = fig4.add_subplot(211)
x1 = df_eth_hour['hour']
x2 = df_epfl_hour['hour']
y1 = df_eth_hour['Hourly Favorites']
y2 = df_eth_hour['Hourly Retweets']
y3 = df_epfl_hour['Hourly Favorites']
y4 = df_epfl_hour['Hourly Retweets']
sub4.plot(x1,y1, 'ro',label='ETH Favorites')
sub4.plot(x1,y2, 'bo',label='ETH Retweets')
sub4.plot(x2,y3, 'go',label='EPFL Favorites')
sub4.plot(x2,y4, 'yo',label='EPFL Retweets')
sub4.legend(bbox_to_anchor=(1.5, 1.5))
plt.ylabel('Total')
plt.xlabel('Hour')
plt.title('Hourly Trends of Retweets and Favorites at EPFL and ETH')
plt.show()
# coding: utf-8
# In[ ]:
eth.drop(['contributors'],inplace=True,axis=1,errors='ignore')
eth.drop(['coordinates'],inplace=True,axis=1,errors='ignore')
eth.drop(['geo'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_screen_name'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_status_id'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_status_id_str'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_user_id'],inplace=True,axis=1,errors='ignore')
eth.drop(['in_reply_to_user_id_str'],inplace=True,axis=1,errors='ignore')
eth.drop(['place'],inplace=True,axis=1,errors='ignore')
eth.drop(['quoted_status'],inplace=True,axis=1,errors='ignore')
eth.drop(['quoted_status_id'],inplace=True,axis=1,errors='ignore')
eth.drop(['quoted_status_id_str'],inplace=True,axis=1,errors='ignore')
eth.count()
# coding: utf-8
# In[ ]:
epfl_df = pd.read_json('epfl_en.json')
ds_epfl = epfl_df.copy()
ds_epfl = ds_epfl[(epfl_df.id % 10) == 0]
#remove all values with no info
epfl_df.dropna(axis = 0,how = 'all',inplace = True)
epfl_df.columns
epfl_df.coordinates.replace('None', 'NaN',inplace = True)
epfl_df.describe()
epfl_df.drop('contributors',axis = 1,inplace = True)
epfl_df.describe()
epfl_df.columns
#remove those with strings therefore duplicated info and those without extra value such as language
epfl_df = epfl_df.iloc[:,[0,1,2,4,5,6,7,9,10,12,16,17,19,20,21,22,23,25,27]]
epfl_df.columns
epfl_df.describe()
#I did not remove na's so as to retain as much data as possible for further processing. however the None
#were replaced with NaN incase I needed to use fill Na later on
epfl_df.iloc[:,[8,9,12,13]].replace('None', 'NaN',inplace = True)
#remove location information as not doing question 2 and informations that are strings if missed previously
epfl_df.drop(['coordinates','quoted_status_id_str','geo','in_reply_to_screen_name','place','possibly_sensitive'],axis=1,inplace = True)
epfl_df.describe()
# coding: utf-8
# In[ ]:
eth_df = pd.read_json('eth_en.json')
ds_eth = eth_df.copy()
ds_eth = ds_eth[(epfl_df.id % 10) == 0]
eth_df.dropna(axis = 0,how = 'all',inplace = True)
eth_df.columns
eth_df.coordinates.replace('None', 'NaN',inplace = True)
eth_df.describe()
eth_df.drop('contributors',axis = 1,inplace = True)
eth_df.describe()
eth_df.columns
#remove those with strings therefore duplicated info and those without extra value such as language
eth_df = eth_df.iloc[:,[0,1,2,4,5,6,7,9,10,12,16,17,19,20,21,22,23,25,27]]
eth_df.columns
eth_df.describe()
eth_df.iloc[:,[8,9,12,13]].replace('None', 'NaN',inplace = True)
#remove location information as not doing question 2
eth_df.drop(['coordinates','geo','in_reply_to_screen_name','place','possibly_sensitive'],axis=1,inplace = True)
eth_df.head()
# coding: utf-8
# In[ ]:
#remove all values with no info
ds_epfl.dropna(axis = 0,how = 'all',inplace = True)
ds_epfl.columns
ds_epfl.coordinates.replace('None', 'NaN',inplace = True)
ds_epfl.describe()
ds_epfl.drop('contributors',axis = 1,inplace = True)
ds_epfl.describe()
ds_epfl.columns
#remove those with strings therefore duplicated info and those without extra value such as language
ds_epfl = ds_epfl.iloc[:,[0,1,2,4,5,6,7,9,10,12,16,17,19,20,21,22,23,25,27]]
ds_epfl.columns
ds_epfl.describe()
ds_epfl.iloc[:,[8,9,12,13]].replace('None', 'NaN',inplace = True)
#remove location information as not doing question 2
ds_epfl.drop(['coordinates','geo','in_reply_to_screen_name','place','possibly_sensitive'],axis=1,inplace = True)
ds_epfl.head()
# coding: utf-8
# In[ ]:
ds_eth.dropna(axis = 0,how = 'all',inplace = True)
ds_eth.columns
ds_eth.coordinates.replace('None', 'NaN',inplace = True)
ds_eth.describe()
ds_eth.drop('contributors',axis = 1,inplace = True)
ds_eth.describe()
ds_eth.columns
#remove those with strings therefore duplicated info and those without extra value such as language
ds_eth = ds_eth.iloc[:,[0,1,2,4,5,6,7,9,10,12,16,17,19,20,21,22,23,25,27]]
ds_eth.columns
ds_eth.describe()
ds_eth.iloc[:,[8,9,12,13]].replace('None', 'NaN',inplace = True)
#remove location information as not doing question 2
ds_eth.drop(['coordinates','geo','in_reply_to_screen_name','place','possibly_sensitive'],axis=1,inplace = True)
ds_eth.head()
# coding: utf-8
# In[ ]:
print('unique values per month')
print(epfl2['month'].unique())
print(eth2['month'].unique())
print('unique values per year')
print(epfl2['year'].unique())
print(eth2['year'].unique())
print('unique values per daily hour')
print(epfl2['hour_of_day'].unique())
print(eth2['hour_of_day'].unique())
print('unique universities')
print(epfl2['uni'].unique())
print(eth2['uni'].unique())
# coding: utf-8
# In[ ]:
#Tests for the values of the columns :
print('Contributors EPFL (null entries):',Counter(df_epfl.contributors.isnull().values))
print('Contributors ETH (null entries) :',Counter(df_ethz.contributors.isnull().values))
print('\nCoordinates EPFL (null entries) :', Counter(df_epfl.coordinates.isnull().values))
print('Coordinates ETH (null entries) :', Counter(df_ethz.coordinates.isnull().values))
print('\nGeo Coordinates EPFL (null entries) :', Counter(df_epfl.geo.isnull().values))
print('Geo Coordinates ETH (null entries) :', Counter(df_ethz.geo.isnull().values))
print('\nIn Reply to Status EPFL (null entries) :', Counter(df_epfl.in_reply_to_status_id.isnull().values))
print('In Reply to Status ETH (null entries) :', Counter(df_ethz.in_reply_to_status_id.isnull().values))
print('\nIn Reply to User EPFL (null entries) :', Counter(df_epfl.in_reply_to_user_id_str.isnull().values))
print('In Reply to User ETH (null entries) :', Counter(df_ethz.in_reply_to_user_id_str.isnull().values))
print('\nIs Quote EPFL :', Counter(df_epfl.is_quote_status.values))
print('Is Quote ETH :', Counter(df_ethz.is_quote_status.values))
print('\nQuoted Status EPFL (null entries) :', Counter(df_epfl.quoted_status.isnull().values))
print('Quoted Status ETH (null entries) :', Counter(df_ethz.quoted_status.isnull().values))
print('\nQuoted Status ID EPFL (null entries) :', Counter(df_epfl.quoted_status_id.isnull().values))
print('Quoted Status ID ETH (null entries) :', Counter(df_ethz.quoted_status_id.isnull().values))
print('\nPlace EPFL (null entries) :', Counter(df_epfl.place.isnull().values))
print('Place ETH (null entries) :', Counter(df_ethz.place.isnull().values))
print('\nPossibly Sensitive EPFL :', Counter(df_epfl.possibly_sensitive.isnull().values))
print('Possibly Sensitive ETH :', Counter(df_ethz.possibly_sensitive.isnull().values))
print('\nExtended Entities EPFL (null entries) :', Counter(df_epfl.extended_entities.isnull().values))
print('Extended Entities ETH (null entries) :', Counter(df_ethz.extended_entities.isnull().values))
# coding: utf-8
# In[ ]:
#Tests for the values of the columns :
print('Contributors EPFL (null entries):',Counter(df_epfl_ds.contributors.isnull().values))
print('Contributors ETH (null entries) :',Counter(df_ethz_ds.contributors.isnull().values))
print('\nCoordinates EPFL (null entries) :', Counter(df_epfl_ds.coordinates.isnull().values))
print('Coordinates ETH (null entries) :', Counter(df_ethz_ds.coordinates.isnull().values))
print('\nGeo Coordinates EPFL (null entries) :', Counter(df_epfl_ds.geo.isnull().values))
print('Geo Coordinates ETH (null entries) :', Counter(df_ethz_ds.geo.isnull().values))
print('\nIn Reply to Status EPFL (null entries) :', Counter(df_epfl_ds.in_reply_to_status_id.isnull().values))
print('In Reply to Status ETH (null entries) :', Counter(df_ethz_ds.in_reply_to_status_id.isnull().values))
print('\nIn Reply to User EPFL (null entries) :', Counter(df_epfl_ds.in_reply_to_user_id_str.isnull().values))
print('In Reply to User ETH (null entries) :', Counter(df_ethz_ds.in_reply_to_user_id_str.isnull().values))
print('\nIs Quote EPFL :', Counter(df_epfl_ds.is_quote_status.values))
print('Is Quote ETH :', Counter(df_ethz_ds.is_quote_status.values))
print('\nQuoted Status EPFL (null entries) :', Counter(df_epfl_ds.quoted_status.isnull().values))
print('Quoted Status ETH (null entries) :', Counter(df_ethz_ds.quoted_status.isnull().values))
print('\nQuoted Status ID EPFL (null entries) :', Counter(df_epfl_ds.quoted_status_id.isnull().values))
print('Quoted Status ID ETH (null entries) :', Counter(df_ethz_ds.quoted_status_id.isnull().values))
print('\nPlace EPFL (null entries) :', Counter(df_epfl_ds.place.isnull().values))
print('Place ETH (null entries) :', Counter(df_ethz_ds.place.isnull().values))
print('\nPossibly Sensitive EPFL :', Counter(df_epfl_ds.possibly_sensitive.isnull().values))
print('Possibly Sensitive ETH :', Counter(df_ethz_ds.possibly_sensitive.isnull().values))
print('\nExtended Entities EPFL (null entries) :', Counter(df_epfl_ds.extended_entities.isnull().values))
print('Extended Entities ETH (null entries) :', Counter(df_ethz_ds.extended_entities.isnull().values))
# coding: utf-8
# In[ ]:
df_epf_year = df_epf.groupby('year')[['retweet_count','favorite_count']].sum()
df_epf_sub_year = df_epf_subsample.groupby('year')[['retweet_count','favorite_count']].sum()
df_epf_year.head(3)
df_eth_year = df_eth.groupby('year')[['retweet_count','favorite_count']].sum()
df_eth_sub_year = df_eth_subsample.groupby('year')[['retweet_count','favorite_count']].sum()
df_eth_year.head(3)
plt.plot(df_epf_year.index, df_epf_year.retweet_count, 'r')
plt.plot(df_epf_year.index, df_epf_year.favorite_count, 'r--')
plt.ticklabel_format(style='plain', axis='x')
plt.plot(df_eth_year.index, df_eth_year.retweet_count, 'b')
plt.plot(df_eth_year.index, df_eth_year.favorite_count, 'b--')
plt.ticklabel_format(style='plain', axis='x')
print('We can see here the comparison bestween retween (line) and favorits (dashed line) between ' +
'EPFL (red) and ETHZ (blue)')
plt.show
In [209]:
# Shorter lists of (mainly) function calls; some plotting, some miscellaneous code
for i, cell in enumerate(class_to_cells[3]):
    if i > 20:
        break
    print(cell)
# coding: utf-8
# In[ ]:
df_g_hour.plot.bar()
plt.title('ETHZ tweets - variation over the years')
plt.show()
df_f_hour.plot.bar()
plt.title('EPFL tweets - variation over the years')
plt.show()
# coding: utf-8
# In[ ]:
df_g_year.plot.bar()
plt.title('ETHZ tweets - variation over the years')
plt.show()
df_f_year.plot.bar()
plt.title('EPFL tweets - variation over the years')
plt.show()
# coding: utf-8
# In[ ]:
ax_year_retweet = grouped_year_retweet_count['sum'].plot(kind='bar', color = 'blue')
ax_year_retweet.set_xlabel("Year", fontsize=10)
ax_year_favorite = grouped_year_favorite_count['sum'].plot(kind='bar', color='grey', title ="# of retweets (blue) and # liked tweets (gray)", figsize=(5, 5), fontsize=10)
ax_year_favorite.set_xlabel("Year", fontsize=10)
plt.show()
# coding: utf-8
# In[ ]:
plt.plot(rmses,'b')
plt.title("RMSES / estimators")
plt.xlabel("Number of estimators")
plt.ylabel("RMSE")
plt.show()
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.countplot(x='year', hue='university', data=engagement)
plt.suptitle('Count of tweets by year', size=20)
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.barplot(x='year', y='retweet_count', hue='university', data=engagement)
plt.suptitle('Mean of retweet_count by year', size=20)
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.barplot(x='year', y='favorite_count', hue='university', data=engagement)
plt.suptitle('Mean of favorite_count by year', size=20)
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.countplot(x='month', hue='university', data=engagement)
plt.suptitle('Count of tweets by month', size=20)
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.barplot(x='month', y='retweet_count', hue='university', data=engagement)
plt.suptitle('Mean of retweet_count by month', size=20)
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.pointplot(x='month', y='retweet_count', data=summed_l, join=False)
sns.pointplot(x='month', y='retweet_count', data=summed_z, join=False, color=palette[2])
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.pointplot(x='month', y='favorite_count', data=summed_l, join=False)
sns.pointplot(x='month', y='favorite_count', data=summed_z, join=False, color=palette[2])
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.countplot(x='hour', hue='university', data=engagement)
plt.suptitle('Count of tweets by year', size=20)
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.pointplot(x='hour', y='retweet_count', data=summed_l, join=False)
sns.pointplot(x='hour', y='retweet_count', data=summed_z, join=False, color=palette[2])
# coding: utf-8
# In[ ]:
plt.subplots(figsize=(8,6))
sns.pointplot(x='hour', y='favorite_count', data=summed_l, join=False)
sns.pointplot(x='hour', y='favorite_count', data=summed_z, join=False, color=palette[2])
# coding: utf-8
# In[ ]:
wc = WordCloud().generate(raw_corpus)
plt.subplots(figsize=(10,15))
plt.imshow(wc)
plt.axis("off")
plt.show()
# coding: utf-8
# In[ ]:
def plot_stats(epfl, ethz, labels):
    ind = np.arange(N)
    fig = plt.figure()
    width = 0.2
    ax = fig.add_subplot(111)
    yvals = [epfl.favorite_count.sum(), ethz.favorite_count.sum()]
    rects1 = ax.bar(ind, yvals, width, color='r')
    zvals = [epfl.retweet_count.sum(), ethz.retweet_count.sum()]
    rects2 = ax.bar(ind+width, zvals, width, color='g')
    ax.set_ylabel('Count')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(labels)
    ax.legend((rects1[0], rects2[0]), ('Favorites', 'Retweets'))
    plt.show()
print('Full dataset:')
plot_stats(epfl, ethz, labels=['EPFL', 'ETHZ'])
print('Reduced dataset:')
plot_stats(epfl_small, ethz_small, labels=['EPFL', 'ETHZ'])
# coding: utf-8
# In[ ]:
print('Favorites per month:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.month), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.month), color='b', label='ETHZ')
plt.xlabel('Month')
plt.ylabel('Favorites')
plt.legend()
plt.show()
# coding: utf-8
# In[ ]:
print('Favorites per hour:')
plt.plot(group_by_and_aggregate(epfl[['favorite_count', 'created_at']], lambda dt: dt.hour), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['favorite_count', 'created_at']], lambda dt: dt.hour), color='b', label='ETHZ')
plt.xlabel('Hour')
plt.ylabel('Favorites')
plt.legend()
plt.show()
# coding: utf-8
# In[ ]:
print('Retweets per hour:')
plt.plot(group_by_and_aggregate(epfl[['retweet_count', 'created_at']], lambda dt: dt.hour), color='r', label='EPFL')
plt.plot(group_by_and_aggregate(ethz[['retweet_count', 'created_at']], lambda dt: dt.hour), color='b', label='ETHZ')
plt.xlabel('Hour')
plt.ylabel('Retweets')
plt.legend()
plt.show()
# coding: utf-8
# In[ ]:
ind = np.arange(len(epfl_en.year.value_counts()))
# # width = 0.65
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
ax.bar(ind, epfl_en.year.value_counts().sort_index(), color='#deb0b0')
ax.set_xticklabels(sorted(epfl_en.year.unique()))
ax.axhline(epfl_en.year.value_counts().mean(),color='b',linewidth=2,linestyle='dashed')
ax.legend(['mean','year'],loc='best')
plt.title('tweets per year')
plt.tight_layout()
plt.show()
# coding: utf-8
# In[ ]:
p1 = plt.scatter(years, epfl_year_retweet[years],marker = 'x',color='b',s=60)
p2 = plt.scatter(years, eth_year_retweet[years],marker = 'o',color='r',s=60)
plt.xticks(years)
plt.legend((p1, p2), ('EPFL', 'ETHZ'), loc=2)
plt.title("Number of retweets per year")
In [210]:
# Longer mixtures of both assignments and function calls
for i, cell in enumerate(class_to_cells[4]):
    if i > 20:
        break
    print(cell)
# coding: utf-8
# In[ ]:
# Now let's check per year
df_epfl_year = df_epfl.groupby(df_epfl['created_at'].map(lambda x: x.year)).sum()
df_epfl_year = df_epfl_year[["favorite_count", "retweet_count"]]
df_epfl_year['total'] = df_epfl_year['favorite_count'] + df_epfl_year['retweet_count']
df_eth_year = df_eth.groupby(df_eth['created_at'].map(lambda x: x.year)).sum()
df_eth_year = df_eth_year[["favorite_count", "retweet_count"]]
df_eth_year['total'] = df_eth_year['favorite_count'] + df_eth_year['retweet_count']
fig = plt.figure() # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
width = 0.4
df_epfl_year.total.plot(kind='bar', color='blue', width=width, position=0)
df_eth_year.total.plot(kind='bar', color='red', width=width, position=1)
plt.show()
# We can see that the account got more active over time and that epfl is on top almost every year
#Need to fix label names , blue = EPFL, red = ETH
# coding: utf-8
# In[ ]:
# Now let's check per month
df_epfl_year = df_epfl.groupby(df_epfl['created_at'].map(lambda x: x.month)).sum()
df_epfl_year = df_epfl_year[["favorite_count", "retweet_count"]]
df_epfl_year['total'] = df_epfl_year['favorite_count'] + df_epfl_year['retweet_count']
df_eth_year = df_eth.groupby(df_eth['created_at'].map(lambda x: x.month)).sum()
df_eth_year = df_eth_year[["favorite_count", "retweet_count"]]
df_eth_year['total'] = df_eth_year['favorite_count'] + df_eth_year['retweet_count']
fig = plt.figure() # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
width = 0.4
df_epfl_year.total.plot(kind='bar', color='blue', width=width, position=0)
df_eth_year.total.plot(kind='bar', color='red', width=width, position=1)
plt.show()
#Need to fix label names , blue = EPFL, red = ETH
# coding: utf-8
# In[ ]:
# Now let's check per month
df_epfl_year = df_epfl.groupby(df_epfl['created_at'].map(lambda x: x.hour)).sum()
df_epfl_year = df_epfl_year[["favorite_count", "retweet_count"]]
df_epfl_year['total'] = df_epfl_year['favorite_count'] + df_epfl_year['retweet_count']
df_eth_year = df_eth.groupby(df_eth['created_at'].map(lambda x: x.hour)).sum()
df_eth_year = df_eth_year[["favorite_count", "retweet_count"]]
df_eth_year['total'] = df_eth_year['favorite_count'] + df_eth_year['retweet_count']
fig = plt.figure() # Create matplotlib figure
ax = fig.add_subplot(111) # Create matplotlib axes
width = 0.4
df_epfl_year.total.plot(kind='bar', color='blue', width=width, position=0)
df_eth_year.total.plot(kind='bar', color='red', width=width, position=1)
plt.show()
#At around noon, the twitter from EPFL is quite active
#Need to fix label names , blue = EPFL, red = ETH
# coding: utf-8
# In[ ]:
# First let's a really simple regressor the class to predict is the number of retweets, and the input will be the number of favorite
validation_size = 300 #about 10%
Y = df_epfl['retweet_count']
Y2 = df_eth['retweet_count']
Y = Y.append(Y2)
X = df_epfl['favorite_count']
X2 = df_eth['favorite_count']
X = X.append(X2)
# let's keep some part of it for valdiation
validation_features = X[:validation_size]
validation_labels = Y[:validation_size]
train_features = X[validation_size:]
train_labels = Y[validation_size:]
model = LinearRegression()
model.fit(train_features.values.reshape(-1,1), train_labels.values)
# now let's check accuracy on the validation set
y_predict = model.predict(validation_features.values.reshape(-1,1))
#Acc_score = metrics.accuracy_score(validation_labels.values.reshape(-1), np.round(y_predict))
print('R2 score given by model is', model.score(validation_features.values.reshape(-1,1), validation_labels.values))
#let's check the rmse error
from sklearn.metrics import mean_squared_error
from math import sqrt
rms = sqrt(mean_squared_error(validation_labels.values.reshape(-1), y_predict))
print('rmse error is', rms)
# Doesn't seem to be incredible with only one feature, as expected
# coding: utf-8
# In[ ]:
#Let's quickly check the difference in the downsampled with only one feature
validation_size = 100
Y = df_epfl_dw['retweet_count']
Y2 = df_eth_dw['retweet_count']
Y = Y.append(Y2)
X = df_epfl_dw['favorite_count']
X2 = df_eth_dw['favorite_count']
X = X.append(X2)
# let's keep some part of it for valdiation
validation_features = X[:validation_size]
validation_labels = Y[:validation_size]
train_features = X[validation_size:]
train_labels = Y[validation_size:]
model = LinearRegression()
model.fit(train_features.values.reshape(-1,1), train_labels.values)
# now let's check accuracy on the validation set
#y_predict = model.predict(validation_features.values.reshape(-1,1))
#Acc_score = metrics.accuracy_score(validation_labels.values.reshape(-1), np.round(y_predict))
print('R2 score given by model is', model.score(validation_features.values.reshape(-1,1), validation_labels.values))
#let's check the rmse error
from sklearn.metrics import mean_squared_error
from math import sqrt
rms = sqrt(mean_squared_error(validation_labels.values.reshape(-1), y_predict))
print('rmse error is', rms)
#The rmse is smaller but the R2 is worse, the down sampled regressor should be worse, as expected
# coding: utf-8
# In[ ]:
# Now let's also include the time
validation_size = 300 #about 10%
Y = df_epfl['retweet_count']
Y2 = df_eth['retweet_count']
Y = Y.append(Y2)
X = df_epfl[['favorite_count', 'created_at']]
X2 = df_eth[['favorite_count', 'created_at']]
#Now I'll replace with hour when the tweet was created
X['created_at'] = df_epfl['created_at'].map(lambda x: x.hour)
X2['created_at'] = df_eth['created_at'].map(lambda x: x.hour)
X = X.append(X2)
# let's keep some part of it for valdiation
validation_features = X[:validation_size]
validation_labels = Y[:validation_size]
train_features = X[validation_size:]
train_labels = Y[validation_size:]
model = LinearRegression()
model.fit(train_features.values, train_labels.values)
print('R2 score given by model is', model.score(validation_features.values, validation_labels.values))
#let's check the rmse error
from sklearn.metrics import mean_squared_error
from math import sqrt
rms = sqrt(mean_squared_error(validation_labels.values, y_predict))
print('rmse error is', rms)
#R2 is closer to 0 so the model is better now
# coding: utf-8
# In[ ]:
tweetId = [item['id'] for item in t1]
user_id = [item['user']['id'] for item in t1]
favorite = [item['favorite_count'] for item in t1]
date = [item['created_at'] for item in t1]
retweet_count = [item['retweet_count'] for item in t1]
place = [item['place'] for item in t1]
id_str = [item['id_str'] for item in t1]
# user = [item['retweeted_status'] for item in t1]
epfl_df = pd.DataFrame({'tweetId': tweetId , 'id_str':id_str,'user_id': user_id,'favorite': favorite,'date': date,'retweet_count': retweet_count,'place':place})
# coding: utf-8
# In[ ]:
tweetId = [item['id'] for item in t2]
user_id = [item['user']['id'] for item in t2]
favorite = [item['favorite_count'] for item in t2]
date = [item['created_at'] for item in t2]
retweet_count = [item['retweet_count'] for item in t2]
place = [item['place'] for item in t2]
id_str = [item['id_str'] for item in t2]
# user = [item['retweeted_status'] for item in t2]
eth_df = pd.DataFrame({'tweetId': tweetId , 'id_str':id_str,'user_id': user_id,'favorite': favorite,'date': date,'retweet_count': retweet_count,'place':place})
# coding: utf-8
# In[ ]:
df_f = df_big[df_big['Univ'] == 'EPFL']
df_g = df_big[df_big['Univ'] == 'ETHZ']
df_f_year = df_f[['year_tweet','favorite_count','retweet_count']].groupby(['year_tweet']).sum()
df_g_year = df_g[['year_tweet','favorite_count','retweet_count']].groupby(['year_tweet']).sum()
df_f_month = df_f[['month_num_tweet','favorite_count','retweet_count']].groupby(['month_num_tweet']).sum()
df_g_month = df_g[['month_num_tweet','favorite_count','retweet_count']].groupby(['month_num_tweet']).sum()
df_f_hour = df_f[['hour_tweet','favorite_count','retweet_count']].groupby(['hour_tweet']).sum()
df_g_hour = df_g[['hour_tweet','favorite_count','retweet_count']].groupby(['hour_tweet']).sum()
# coding: utf-8
# In[ ]:
EPFL_TWEETS = 'epfl_en.json'
ETH_TWEETS = 'eth_en.json'
epfl_data = open(EPFL_TWEETS).read()
epfl_data = json.loads(epfl_data)
eth_data = open(ETH_TWEETS).read()
eth_data = json.loads(eth_data)
len_epfl = len(epfl_data)
len_eth = len(eth_data)
# coding: utf-8
# In[ ]:
tweets_regression = tweets.copy()
all_tags = []
followers_count = []
friends_count = []
for index, row in tweets_regression.iterrows():
    tags = []
    for hashtags in row['entities']['hashtags']:
        tags.append(hashtags['text'])
    all_tags.append(tags)
    followers_count.append(row['user']['followers_count'])
    friends_count.append(row['user']['friends_count'])
#tweets_regression['tags'] = all_tags
#tweets_regression['friends_count'] = friends_count
#tweets_regression['followers_count'] = followers_count
columns_regression_drop = ['id_str', 'entities', 'user', 'created_at', 'text']
tweets_regression = tweets_regression.drop(columns_regression_drop, axis=1)
labels = tweets_regression.pop('retweet_count')
tweets_regression.head()
# coding: utf-8
# In[ ]:
#encode string
eth_df_needed_col['in_reply_to_screen_name'] = eth_df_needed_col['in_reply_to_screen_name'].fillna('Unknown')
epfl_df_needed_col['in_reply_to_screen_name'] = epfl_df_needed_col['in_reply_to_screen_name'].fillna('Unknown')
eth_df_needed_col['month'] = eth_df_needed_col['month'].fillna('Unknown')
epfl_df_needed_col['month'] = epfl_df_needed_col['month'].fillna('Unknown')
le = preprocessing.LabelEncoder()
le.fit(eth_df_needed_col['in_reply_to_screen_name'])
eth_df_needed_col['in_reply_to_screen_name'] = le.transform(eth_df_needed_col['in_reply_to_screen_name'])
le.fit(eth_df_needed_col['month'])
eth_df_needed_col['month'] = le.transform(eth_df_needed_col['month'])
le.fit(epfl_df_needed_col['in_reply_to_screen_name'])
epfl_df_needed_col['in_reply_to_screen_name'] = le.transform(epfl_df_needed_col['in_reply_to_screen_name'])
le.fit(epfl_df_needed_col['month'])
epfl_df_needed_col['month'] = le.transform(epfl_df_needed_col['month'])
# coding: utf-8
# In[ ]:
data_epfl_new = data_epfl.drop('retweet_count', axis=1)
data_epfl_new['Month'] = data_epfl_new.created_at.dt.month
data_epfl_new['Year'] = data_epfl_new.created_at.dt.year
data_epfl_new['Hour'] = data_epfl_new.created_at.dt.hour
data_epfl_new = data_epfl_new.drop('created_at', axis=1)
data_epfl_new = data_epfl_new.drop('entities', axis=1)
data_epfl_new = data_epfl_new.drop('source', axis=1)
data_epfl_new = data_epfl_new.drop('text', axis=1)
f_count = []
for i, row in data_epfl.iterrows():
    f_count.append(row["user"]["followers_count"])
data_epfl_new['followers_count'] = pd.Series(f_count)
data_epfl_new = data_epfl_new.drop('user', axis=1)
data_epfl_new = data_epfl_new.drop('hashtags', axis=1)
data_epfl_new = data_epfl_new.drop('id', axis=1)
data_epfl_new = data_epfl_new.drop('id_str', axis=1)
data_epfl_new.dropna(inplace=True)
data_epfl_new
# coding: utf-8
# In[ ]:
data_eth_new = data_eth.drop('retweet_count', axis=1)
data_eth_new['Month'] = data_eth_new.created_at.dt.month
data_eth_new['Year'] = data_eth_new.created_at.dt.year
data_eth_new['Hour'] = data_eth_new.created_at.dt.hour
data_eth_new = data_eth_new.drop('created_at', axis=1)
data_eth_new = data_eth_new.drop('entities', axis=1)
data_eth_new = data_eth_new.drop('source', axis=1)
data_eth_new = data_eth_new.drop('text', axis=1)
f_count = []
for i, row in data_eth.iterrows():
    f_count.append(row["user"]["followers_count"])
data_eth_new['followers_count'] = pd.Series(f_count)
data_eth_new = data_eth_new.drop('user', axis=1)
data_eth_new = data_eth_new.drop('hashtags', axis=1)
data_eth_new = data_eth_new.drop('id', axis=1)
data_eth_new = data_eth_new.drop('id_str', axis=1)
data_eth_new.dropna(inplace=True)
# coding: utf-8
# In[ ]:
epfl['created_at_year'] = epfl['created_at'].apply(lambda x: x.year)
epfl['created_at_month'] = epfl['created_at'].apply(lambda x: x.month)
epfl['created_at_day'] = epfl['created_at'].apply(lambda x: x.day)
epfl['created_at_hour'] = epfl['created_at'].apply(lambda x: x.hour)
ethz['created_at_year'] = ethz['created_at'].apply(lambda x: x.year)
ethz['created_at_month'] = ethz['created_at'].apply(lambda x: x.month)
ethz['created_at_day'] = ethz['created_at'].apply(lambda x: x.day)
ethz['created_at_hour'] = ethz['created_at'].apply(lambda x: x.hour)
# coding: utf-8
# In[ ]:
p = figure(title='Favorites across years for EPFL and ETHZ',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Year'
p.yaxis[0].axis_label = 'Number of favorites'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_year,epfl_favorites,color='blue',size=8)
p.line(epfl_year,epfl_favorites,color='blue',legend='epfl')
p.circle(eth_year,eth_favorites,color='red',size=8)
p.line(eth_year,eth_favorites,color='red',legend='ethz')
show(p)
# coding: utf-8
# In[ ]:
p = figure(title='Retweets across years for EPFL and ETHZ',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Year'
p.yaxis[0].axis_label = 'Number of retweets'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_year,epfl_retweets,color='blue',size=8)
p.line(epfl_year,epfl_retweets,color='blue',legend='epfl')
p.circle(eth_year,eth_retweets,color='red',size=8)
p.line(eth_year,eth_retweets,color='red',legend='ethz')
show(p)
# coding: utf-8
# In[ ]:
p = figure(title='Favorites across months for EPFL and ETHZ (2016)',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Month'
p.yaxis[0].axis_label = 'Number of favorites'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_month,epfl_favorites,color='blue',size=8)
p.line(epfl_month,epfl_favorites,color='blue',legend='epfl')
p.circle(eth_month,eth_favorites,color='red',size=8)
p.line(eth_month,eth_favorites,color='red',legend='ethz')
show(p)
# coding: utf-8
# In[ ]:
p = figure(title='Retweets across months for EPFL and ETHZ (2016)',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Month'
p.yaxis[0].axis_label = 'Number of retweets'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_month,epfl_retweets,color='blue',size=8)
p.line(epfl_month,epfl_retweets,color='blue',legend='epfl')
p.circle(eth_month,eth_retweets,color='red',size=8)
p.line(eth_month,eth_retweets,color='red',legend='ethz')
show(p)
# coding: utf-8
# In[ ]:
p = figure(title='Favorites across hours of day for EPFL and ETHZ (2016)',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Hour'
p.yaxis[0].axis_label = 'Number of favorites'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_day,epfl_favorites,color='blue',size=8)
p.line(epfl_day,epfl_favorites,color='blue',legend='epfl')
p.circle(eth_day,eth_favorites,color='red',size=8)
p.line(eth_day,eth_favorites,color='red',legend='ethz')
show(p)
# coding: utf-8
# In[ ]:
p = figure(title='Retweets across hours of day for EPFL and ETHZ (2016)',plot_height=500,plot_width=500)
p.title.align = "center"
p.title.text_font_size = "18px"
p.xaxis[0].axis_label = 'Hour'
p.yaxis[0].axis_label = 'Number of retweets'
p.xaxis[0].axis_label_text_font_size = '16px'
p.yaxis[0].axis_label_text_font_size = '16px'
p.circle(epfl_day,epfl_retweets,color='blue',size=8)
p.line(epfl_day,epfl_retweets,color='blue',legend='epfl')
p.circle(eth_day,eth_retweets,color='red',size=8)
p.line(eth_day,eth_retweets,color='red',legend='ethz')
show(p)
In [211]:
# Import Cells
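# Print the first 21 cells assigned to cluster 5; as the output below shows,
# this cluster collects the notebooks' import cells.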
for i, cell in enumerate(class_to_cells[5]):
    if i > 20:
        break
    print(cell)
# coding: utf-8
# In[ ]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
# coding: utf-8
# In[ ]:
get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
import nltk
import json
import seaborn as sns
sns.set_context('notebook')
# coding: utf-8
# In[ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import string
import re
import operator
from gensim import corpora, models, similarities
from gensim.models import hdpmodel, ldamodel
import nltk
from nltk.probability import FreqDist
# coding: utf-8
# In[ ]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas.io.json import json_normalize
import warnings
warnings.filterwarnings('ignore')
get_ipython().magic('matplotlib inline')
sns.set_style('darkgrid')
palette = sns.color_palette()
# coding: utf-8
# In[ ]:
import pandas as pd
import json
import warnings
warnings.filterwarnings('ignore')
get_ipython().magic('matplotlib inline')
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from functools import partial
from scipy.stats import skewtest
import numpy as np
# coding: utf-8
# In[ ]:
from os import path
from collections import Counter
#import pycountry
import nltk
import re # for removing numbers
from nltk.sentiment import SentimentIntensityAnalyzer
from gensim import models,corpora
get_ipython().magic('matplotlib inline')
import re # for removing numbers
# coding: utf-8
# In[ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
import requests
from bs4 import BeautifulSoup
import pickle
import scipy.stats as stats
import math
import folium
import json
from sklearn.ensemble import RandomForestClassifier
from numpy.core.defchararray import split
from sklearn.preprocessing import robust_scale
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from wordcloud import WordCloud
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pycountry
import os
from operator import itemgetter
from gensim import corpora
from gensim.models import ldamodel
import networkx as nx
import community
get_ipython().magic('matplotlib inline')
# coding: utf-8
# In[ ]:
# Data pocessing
import pandas as pd
import numpy as np
from dateutil.parser import parse
# Machine Learning
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error
# Interactive viz
import folium
import matplotlib.pyplot as plt
import seaborn as sns
# Text processing
import nltk.data
import pycountry
import random
import re
from nltk.tokenize import TweetTokenizer
from collections import Counter
from gensim import models, corpora
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from os import path
from os.path import exists
from PIL import Image
# Utils
import json
import itertools
import collections
import requests as rq
from bs4 import BeautifulSoup as bfs
import math
import scipy.stats as stats
from geopy.geocoders import GeoNames, Nominatim, GoogleV3
get_ipython().magic('matplotlib inline')
# coding: utf-8
# In[ ]:
# This might be useful
import pandas as pd
# Question 2 - Plotly graphs vizualisation
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
# Generate an API key to use Plotly. I trust you! You can replace it with another key, since this one is limited to a daily number of visualizations.
# Also, each of the following graphs comes with a link to its online version, so you should be able to view it.
plotly.tools.set_credentials_file(username='Merinorus', api_key='cfLSx6xqd6yBJawneBfu')
# Question 3 - Machine learning with random forest classifier
from sklearn.ensemble import RandomForestClassifier
# coding: utf-8
# In[ ]:
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
import seaborn as sns
import numpy as np
from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.charts import BoxPlot, output_file, show
import scipy.stats as stats
import math
from bokeh.charts import Histogram, output_file, show
import networkx as nx
output_notebook()
# coding: utf-8
# In[ ]:
get_ipython().magic('matplotlib inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re
from IPython.core.display import display  # display(df) rather than print()
import seaborn as sns
import nltk
from nltk.tokenize import TweetTokenizer
sns.set_context('notebook')
# coding: utf-8
# In[ ]:
import nltk
import string
import re
import itertools
import gensim
import pyLDAvis.gensim as gensimvis
import pyLDAvis
# coding: utf-8
# In[ ]:
import contextlib
import sys
import os
import json
from collections import OrderedDict
import itertools
import pandas as pd
import numpy as np
get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error
# coding: utf-8
# In[ ]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import folium
import sklearn
import scipy
import bokeh
import nltk
# coding: utf-8
# In[ ]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import stop_words
import string
# coding: utf-8
# In[ ]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
import warnings
warnings.filterwarnings("ignore")
# coding: utf-8
# In[ ]:
import pandas as pd
import json
import numpy as np
get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
# coding: utf-8
# In[ ]:
import folium
import numpy as np
import pandas as pd
import scipy.stats as stats
import requests
import re
import matplotlib
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
get_ipython().magic('load_ext autoreload')
get_ipython().magic('autoreload 1')
from bs4 import BeautifulSoup
# coding: utf-8
# In[ ]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import silhouette_score
from sklearn.cluster import MiniBatchKMeans, SpectralClustering
from sklearn.metrics import confusion_matrix
import seaborn as sns
# coding: utf-8
# In[ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import show
import itertools
get_ipython().magic('matplotlib inline')
sns.set_context('notebook')
pd.options.mode.chained_assignment = None # default='warn'
# coding: utf-8
# In[ ]:
import pandas as pd
import numpy as np
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import nltk.sentiment.util
import gensim
import json
get_ipython().magic('matplotlib inline')
get_ipython().magic('load_ext autoreload')
get_ipython().magic('autoreload 2')
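In [ ]:
# A possible follow-up (a sketch, not part of the original analysis above):
# tally which top-level modules the cluster-5 cells import. This assumes each
# element of class_to_cells[5] is a string of cell source, consistent with the
# print loop above.
import ast
from collections import Counter
module_counts = Counter()
for src in class_to_cells[5]:
    try:
        tree = ast.parse(src)
    except SyntaxError:
        continue  # skip any cell that is not plain Python
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            # 'import a.b' counts as 'a'
            module_counts.update(alias.name.split('.')[0] for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.module is not None:
            # 'from a.b import c' counts as 'a'
            module_counts[node.module.split('.')[0]] += 1
print(module_counts.most_common(10))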