In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter
%pylab inline
matplotlib.style.use('ggplot')
In [2]:
# Staging directories, resolved against the current working directory:
# 'stg' is where the input pickles are read from, 'out' is where CSVs and
# figures are written.
dir_df, dir_out = (os.path.join(os.path.abspath(''), sub) for sub in ('stg', 'out'))
In [3]:
# Salvini dataset: load the staged pickle of posts with their like counts.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
df_filename = r'df_posts_likes_salvini.pkl'
df_fullpath = os.path.join(dir_df, df_filename)
df_posts_salvini = pd.read_pickle(df_fullpath)
# Statistics
# Number of posts (count of non-null ID values)
df_posts_salvini['ID'].count()
Out[3]:
In [4]:
# Total number of likes across all Salvini posts
df_posts_salvini['Likes'].sum()
Out[4]:
In [5]:
# Renzi dataset: load the staged pickle of posts with their like counts.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
df_filename = r'df_posts_likes_renzi.pkl'
df_fullpath = os.path.join(dir_df, df_filename)
df_posts_renzi = pd.read_pickle(df_fullpath)
# Statistics
# Number of posts (count of non-null ID values)
df_posts_renzi['ID'].count()
Out[5]:
In [6]:
# Total number of likes across all Renzi posts
df_posts_renzi['Likes'].sum()
Out[6]:
In [7]:
# M5S dataset: load the staged pickle of posts with their like counts.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
df_filename = r'df_posts_likes_m5s.pkl'
df_fullpath = os.path.join(dir_df, df_filename)
df_posts_m5s = pd.read_pickle(df_fullpath)
# Statistics
# Number of posts (count of non-null ID values)
df_posts_m5s['ID'].count()
Out[7]:
In [8]:
# Total number of likes across all M5S posts
df_posts_m5s['Likes'].sum()
Out[8]:
In [9]:
# Time dimension -> YEAR: keep only the first four characters of each frame's
# own Post_Date string (assumes the strings start with the year -- TODO confirm).
# Fix: the Salvini and Renzi frames used to copy the M5S dates, which pandas
# aligns by row index, so their posts were assigned another page's years.
df_posts_m5s['Post_Date'] = df_posts_m5s['Post_Date'].str[:4]
df_posts_salvini['Post_Date'] = df_posts_salvini['Post_Date'].str[:4]
df_posts_renzi['Post_Date'] = df_posts_renzi['Post_Date'].str[:4]
In [10]:
# Collapse each page's posts to one row per Post_Date value:
# ID -> number of posts, Likes -> total likes.
df_posts_m5s, df_posts_salvini, df_posts_renzi = (
    frame.groupby('Post_Date', as_index=False).agg({'ID': 'count', 'Likes': 'sum'})
    for frame in (df_posts_m5s, df_posts_salvini, df_posts_renzi)
)
In [11]:
# Give the M5S aggregates page-specific column names and index them by Post_Date
df_posts_m5s = (
    df_posts_m5s
    .rename(columns={'ID': 'Posts_M5S', 'Likes': 'Likes_M5S'})
    .set_index('Post_Date')
)
df_posts_m5s.head(2)
Out[11]:
In [12]:
# Give the Renzi aggregates page-specific column names and index them by Post_Date
df_posts_renzi = (
    df_posts_renzi
    .rename(columns={'ID': 'Posts_Renzi', 'Likes': 'Likes_Renzi'})
    .set_index('Post_Date')
)
df_posts_renzi.head(2)
Out[12]:
In [13]:
# Give the Salvini aggregates page-specific column names and index them by Post_Date
df_posts_salvini = (
    df_posts_salvini
    .rename(columns={'ID': 'Posts_Salvini', 'Likes': 'Likes_Salvini'})
    .set_index('Post_Date')
)
df_posts_salvini.head(2)
Out[13]:
In [14]:
# Number of posts: put the three pages side by side (aligned on Post_Date),
# drop the like totals and keep one short-named column per page.
result_post = (
    pd.concat([df_posts_renzi, df_posts_salvini, df_posts_m5s], axis=1)
    .drop(['Likes_Renzi', 'Likes_Salvini', 'Likes_M5S'], axis=1)
    .rename(columns={'Posts_Renzi': 'Renzi', 'Posts_Salvini': 'Salvini', 'Posts_M5S': 'M5S'})
)
In [15]:
# Bar chart of the post counts, one group of bars per Post_Date value
result_post.plot.bar()
Out[15]:
In [16]:
# Export the post counts, keeping the header row and the Post_Date index column
result_post.to_csv(os.path.join(dir_out, 'Distr_Posts.csv'), index=True, header=True)
In [17]:
# Number of likes: put the three pages side by side (aligned on Post_Date),
# drop the post counts and keep one short-named column per page.
result_likes = (
    pd.concat([df_posts_renzi, df_posts_salvini, df_posts_m5s], axis=1)
    .drop(['Posts_Renzi', 'Posts_Salvini', 'Posts_M5S'], axis=1)
    .rename(columns={'Likes_Renzi': 'Renzi', 'Likes_Salvini': 'Salvini', 'Likes_M5S': 'M5S'})
)
In [18]:
# Bar chart of the like totals, one group of bars per Post_Date value
result_likes.plot.bar()
Out[18]:
In [19]:
# Export the like totals, keeping the header row and the Post_Date index column
result_likes.to_csv(os.path.join(dir_out, 'Distr_Likes.csv'), index=True, header=True)
In [20]:
# Salvini dataset: reload the raw posts and build a monthly series of likes.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load trusted files.
df_filename = r'df_posts_likes_salvini.pkl'
df_fullpath = os.path.join(dir_df, df_filename)
df_posts = pd.read_pickle(df_fullpath)
# Keep only the first 10 characters of the Post_Date string (the date part)
df_posts['Post_Date'] = df_posts['Post_Date'].str[:10]
# Convert the strings to datetimes
df_posts['Post_Date'] = pd.to_datetime(df_posts['Post_Date'])
# Sort by date
df_posts = df_posts.sort_values(by='Post_Date')
# Keep the full per-post dataset, indexed by date, for the later drill-downs
df_posts_dett = df_posts.set_index(['Post_Date'])
# Aggregate per date: number of posts and total likes
df_posts = df_posts.groupby('Post_Date', as_index=False).agg({'ID': 'count', 'Likes': 'sum'})
# Drop the dates whose like total is not finite (posts without likes -- privacy?)
df_posts = df_posts[np.isfinite(df_posts['Likes'])]
# Use the date as index
df_posts = df_posts.set_index(['Post_Date'])
# Time series: re-aggregate the daily rows by month.
# pd.Grouper(freq=...) replaces pd.TimeGrouper, which was deprecated and later
# removed from pandas. Recent pandas releases spell the month-end alias "ME".
df_posts = df_posts.groupby(pd.Grouper(freq="M")).sum()
# Drop the post count; only the likes are analysed below
del df_posts['ID']
In [21]:
# OK, the numbers still add up after the processing above
df_posts['Likes'].sum()
Out[21]:
In [40]:
# Five rows of df_posts with the lowest 'Likes' values, in ascending order
df_posts.sort_values(by='Likes').head(5)
Out[40]:
In [75]:
# Build the chart: the goal is to inspect the peaks and trace each one back to
# the event it is linked to.
tp = df_posts.plot(
    marker='o',
    markersize=7,
    # positions of the points to mark; the x-axis runs from 0 to 84
    markevery=[58, 60, 63, 65, 70])
tp.set_xlabel("Data del Post")
# Thousands separators on the y axis. A formatter is used instead of
# set_yticklabels(get_yticks()) so the labels are computed from the ticks that
# are actually drawn, even if tight_layout()/savefig() change the tick positions.
tp.yaxis.set_major_formatter(FuncFormatter(lambda value, pos: '{:,.0f}'.format(value)))
fig_posts = tp.get_figure()
fig_posts.tight_layout()
fig_posts.savefig(os.path.join(dir_out,'Distr_Posts_Salvini.png'), format='png', dpi=300)
In [39]:
# Zoom on 2014: plot that year's slice of the series and save the figure
df_post_14 = df_posts.loc['20140101':'20141231']
tp_14 = df_post_14.plot()
fig_posts_14 = tp_14.figure
fig_posts_14.tight_layout()
fig_posts_14.savefig(os.path.join(dir_out, 'posts_2014.png'), dpi=300, format='png')
In [40]:
# 2014 detail: the single post with the most likes in that year.
# (The IDs are then looked up directly in Facebook's Graph API tool.)
df_posts_dett.loc['20140101':'20141231'].sort_values(by='Likes', ascending=False).head(1)
Out[40]:
Principali post tra Ottobre e Novembre
In [41]:
# Zoom on 2015: plot that year's slice of the series and save the figure
df_post_15 = df_posts.loc['20150101':'20151231']
tp_15 = df_post_15.plot()
fig_posts_15 = tp_15.figure
fig_posts_15.tight_layout()
fig_posts_15.savefig(os.path.join(dir_out, 'posts_2015.png'), dpi=300, format='png')
In [42]:
# 2015 check: the three posts with the most likes in that year.
# (The IDs are then looked up directly in Facebook's Graph API tool.)
df_posts_dett.loc['20150101':'20151231'].sort_values(by='Likes', ascending=False).head(3)
Out[42]:
In [43]:
# Zoom on 2016: plot that year's slice of the series and save the figure
df_post_16 = df_posts.loc['20160101':'20161231']
tp_16 = df_post_16.plot()
fig_posts_16 = tp_16.figure
fig_posts_16.tight_layout()
fig_posts_16.savefig(os.path.join(dir_out, 'posts_2016.png'), dpi=300, format='png')
In [ ]: