In [1]:
def testFacebookPageFeedData(page_id, access_token, limit=100):
    """Request one page of a Facebook page's feed from the Graph API (v2.10).

    Parameters
    ----------
    page_id : str
        Page identifier; concatenated into the URL path.
    access_token : str
        Access token, sent as the ``access_token`` query parameter.
    limit : int, optional
        Number of posts requested per call. Defaults to 100, the value that
        was previously hard-coded.

    Returns
    -------
    The result of ``json.loads`` applied to whatever
    ``request_until_succeed`` (defined outside this cell) returns for the URL.
    """
    # construct the URL string
    base = "https://graph.facebook.com/v2.10"
    node = "/" + page_id + "/feed"
    # One aliased total-count field per reaction type, in the same order as
    # the original hand-written query so the default URL is unchanged.
    reaction_types = ["LOVE", "WOW", "HAHA", "ANGRY", "SAD", "LIKE"]
    reaction_fields = [
        "reactions.type({}).limit(0).summary(total_count).as(reactions_{})".format(kind, kind.lower())
        for kind in reaction_types
    ]
    fields = ",".join(["message", "created_time"] + reaction_fields)
    parameters = "/?fields={}&limit={}&access_token={}".format(fields, limit, access_token)
    url = base + node + parameters
    # retrieve data
    return json.loads(request_until_succeed(url))
In [2]:
def Get_News(limit = 10):
    """Collect posts from up to ``limit`` pages of the feed into a DataFrame.

    The first page is fetched with ``testFacebookPageFeedData`` using the
    notebook-level ``page_id`` and ``access_token``; following pages are
    fetched from the ``paging.next`` URL of the previous response.

    Parameters
    ----------
    limit : int, optional
        Maximum number of result pages to request (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``dates``, ``messages`` and ``id``, one row per post that has
        all three fields. Posts lacking a field are printed and skipped.
    """
    # The original also stored result['angry'] from an undefined variable
    # (range_angry), which raised NameError on the first page; that column
    # was never populated and has been dropped.
    result = {'dates': [], 'messages': [], 'id': []}
    next_url = None
    for page_number in range(limit):
        if page_number == 0:
            data = testFacebookPageFeedData(page_id, access_token)
        else:
            data = json.loads(request_until_succeed(next_url))
        for post in data['data']:
            try:
                created, message, post_id = post['created_time'], post['message'], post['id']
            except KeyError:
                # Keep the three lists the same length: report the incomplete
                # post and move on (now applied to the first page as well).
                print(post)
                continue
            result['dates'].append(created)
            result['messages'].append(message)
            result['id'].append(post_id)
        # Look for the next page only after the current one has been read, so
        # the posts of the last page are no longer discarded.
        next_url = data.get('paging', {}).get('next')
        if next_url is None:
            break
    return pd.DataFrame(result)
In [3]:
import pandas as pd
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.set_option('chained_assignment',None)
# Load diario_libre_fb.csv from the working directory, decoding it as Latin-1.
diario_libre_fb = pd.read_csv('diario_libre_fb.csv',encoding='latin1')
In [6]:
def get_url(url):
    """Return the first http(s) URL found in ``url``.

    Parameters
    ----------
    url : str
        Text to scan. Any non-string value (for example ``None`` or a NaN
        from a DataFrame column) is treated as containing no URL.

    Returns
    -------
    str
        The first match, or the literal ``'Not found'`` when there is none.
    """
    if not isinstance(url, str):
        return 'Not found'
    # Raw string: the pattern contains ``\(`` and ``\)``, which are invalid
    # escape sequences in a normal string literal.
    match = re.search(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', url)
    return match.group(0) if match else 'Not found'
In [5]:
# Preview the first rows of the loaded CSV.
diario_libre_fb.head()
Out[5]:
In [17]:
import os
# Absolute paths of every CSV in the working directory, leaving out files
# whose name contains 'diario_libre_fb'.
# NOTE(review): os.listdir order is OS-dependent, yet a later cell labels the
# files by position in this list - confirm the order matches the labels.
path = os.getcwd()
csv_files = [
    os.path.join(path, entry)
    for entry in os.listdir(path)
    if entry.endswith(".csv") and 'diario_libre_fb' not in entry
]
In [18]:
from matplotlib import rcParams
# Plot defaults applied to matplotlib's global rcParams.
rcParams.update({
    'figure.figsize': (8, 4),       # Size of plot
    'figure.dpi': 100,              # Dots per inch of plot
    'lines.linewidth': 2,           # Width of lines of the plot
    'axes.facecolor': 'white',      # Color of the axes
    'font.size': 12,                # Size of the text
    'patch.edgecolor': 'white',     # Patch edge color
    'font.family': 'StixGeneral',   # Font of the plot text
})
In [19]:
# Display names of the newspapers; csv_files[i] is labelled with diarios[i].
diarios = ['Diario Libre','El Dia','Hoy','Listin Diario','El Nacional']
noticias_frames = []
for i, periodico in enumerate(csv_files):
    # Read the file of the current iteration (the previous code re-read
    # csv_files[0] every time), dropping the first column.
    noticias_df = pd.read_csv(periodico, encoding='latin1').iloc[:, 1:]
    noticias_df['Diario'] = diarios[i]
    noticias_frames.append(noticias_df)
# DataFrame.append was removed in pandas 2.0: concatenate once and renumber
# the rows, which also replaces the in-place reset_index.
noticias_df_all = pd.concat(noticias_frames, ignore_index=True)
noticias_df_all.describe()
Out[19]:
In [20]:
# Keep only the rows whose 'contenidos' value is present. The explicit copy
# makes this an independent frame, so the columns that later cells add to it
# are not written into a slice of noticias_df_all.
noticias_df_completas = noticias_df_all.loc[pd.notnull(noticias_df_all.contenidos)].copy()
noticias_df_completas.shape
Out[20]:
Convertir a minúscula: Santiago -> santiago
Eliminar caracteres no alfabéticos: No pararon. -> No pararon
Eliminar tildes: República Dominicana -> Republica Dominicana
Eliminar palabras sin ningún valor analítico: Falleció la mañana de este sábado -> Falleció mañana sábado
In [40]:
# Disable pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None
In [21]:
import nltk
# NLTK stores its stop-word lists under lowercase file names, so 'Spanish'
# only resolves on case-insensitive file systems; use the lowercase fileid.
spanish_stops = set(nltk.corpus.stopwords.words('spanish'))
# Sort before slicing: set iteration order is not stable between runs, which
# made this preview differ on every execution.
sorted(spanish_stops)[:10]
Out[21]:
In [38]:
import unicodedata
import re
def strip_accents(s):
    """Return ``s`` without combining marks.

    The string is decomposed with Unicode NFD and every character whose
    category is ``Mn`` (nonspacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def Clean_Text(text):
    """Normalise an article for word counting.

    Steps: lowercase, replace every run of non-letter characters with a
    space, drop stop words (``spanish_stops``) and one-character tokens,
    strip accents, and finally blank out any character outside ``a-z``.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The remaining words separated by single spaces.
    """
    # Compose first so an accented letter is a single code point and is not
    # split apart by the non-letter pattern below.
    lowered = unicodedata.normalize('NFC', text.lower())
    # Punctuation and digits are removed BEFORE the stop-word test; doing it
    # afterwards let tokens such as "de," or "la." slip past the filter.
    letters_only = re.sub(r"[\W\d_]+", " ", lowered)
    kept = [strip_accents(w) for w in letters_only.split()
            if w not in spanish_stops and len(w) != 1]
    # Same final guard as before: anything that is still not an ASCII letter
    # after accent stripping becomes a space.
    return re.sub("[^a-zA-Z]", " ", " ".join(kept))
In [41]:
# Store the Clean_Text result of every article body in 'contenido limpio'.
noticias_df_completas['contenido limpio'] = noticias_df_completas.contenidos.apply(Clean_Text)
# Show the raw and cleaned text side by side.
noticias_df_completas[['contenidos','contenido limpio']].head()
Out[41]:
In [42]:
from nltk.stem.snowball import SnowballStemmer
# Snowball stemmer for Spanish; stem_text in the next cell uses this object.
spanish_stemmer = SnowballStemmer("spanish")
# Print the stems of two forms of the same verb for comparison.
print(spanish_stemmer.stem("corriendo"))
print(spanish_stemmer.stem("correr"))
In [43]:
def stem_text(text):
    """Stem each whitespace-separated word of ``text`` with ``spanish_stemmer``.

    Returns the stems joined by single spaces.
    """
    return " ".join(map(spanish_stemmer.stem, text.split()))
# Stem the cleaned text and keep it in its own column, 'contenido stemmed'.
noticias_df_completas['contenido stemmed'] = noticias_df_completas['contenido limpio'].apply(stem_text)
noticias_df_completas.head()
Out[43]:
In [56]:
import itertools
def Create_ngrams(all_text,number=1):
    """Count word combinations of size ``number`` over a collection of texts.

    Each text is split on whitespace and one-character tokens are dropped.
    Every tuple produced by ``itertools.combinations(tokens, number)`` is
    counted, except tuples in which the same word occurs more than once.

    Returns a DataFrame with the columns ``"<number>-Combinations"`` (the
    tuples) and ``"Count"``, sorted by ``Count`` in descending order.
    """
    counts = {}
    for document in all_text:
        tokens = [token for token in document.split() if len(token) != 1]
        for combo in itertools.combinations(tokens, number):
            # A word repeated inside the tuple disqualifies the combination.
            if len(set(combo)) != len(combo):
                continue
            counts[combo] = counts.get(combo, 0) + 1
    column = str(number) + "-Combinations"
    table = pd.DataFrame({column: list(counts.keys()), "Count": list(counts.values())})
    return table.sort_values(by="Count", ascending=False)
In [57]:
# Single-word counts over the cleaned article text (number defaults to 1).
one_ngrams = Create_ngrams(noticias_df_completas['contenido limpio'])
one_ngrams.head()
Out[57]:
In [51]:
from matplotlib import rcParams
# Plot defaults written into matplotlib's global rcParams, one key at a time.
plot_defaults = [
    ('figure.figsize', (8, 4)),       # Size of plot
    ('figure.dpi', 100),              # Dots per inch of plot
    ('lines.linewidth', 2),           # Width of lines of the plot
    ('axes.facecolor', 'white'),      # Color of the axes
    ('font.size', 12),                # Size of the text
    ('patch.edgecolor', 'white'),     # Patch edge color
    ('font.family', 'StixGeneral'),   # Font of the plot text
]
for rc_key, rc_value in plot_defaults:
    rcParams[rc_key] = rc_value
In [52]:
import seaborn as sns
import matplotlib.pyplot as plt
def Plot_nCombination(comb_df,n,title):
    """Bar-plot the first ten rows of a combination-count table.

    Parameters
    ----------
    comb_df : pandas.DataFrame
        Table with the columns ``"<n>-Combinations"`` and ``"Count"``; only
        ``comb_df.head(10)`` is plotted.
    n : int
        Combination size, used to build the name of the x column.
    title : str
        Title of the figure.
    """
    sns.barplot(x=str(n) + "-Combinations", y="Count", data=comb_df.head(10))
    plt.title(title)
    plt.xlabel("Combination")
    plt.ylabel("Count")
    # rotation must be a number (or 'vertical' / 'horizontal'); the string
    # "75" used before is rejected by current matplotlib releases.
    plt.xticks(rotation=75)
    plt.show()
# Plot the first ten rows of the single-word count table.
Plot_nCombination(one_ngrams,1,"Top 10 palabras más comunes, noticias.")
In [58]:
# Count pairs of words. Create_ngrams uses itertools.combinations, so every
# ordered pair of words in a text is counted, not only adjacent words.
two_ngrams = Create_ngrams(noticias_df_completas['contenido limpio'],2)
Plot_nCombination(two_ngrams,2,"Top 10 pares de palabras más comunes.")
In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
def Calculate_tfidf(text, min_df=0.025, max_df=0.25):
    """Fit a ``TfidfVectorizer`` on ``text`` and rank terms by mean weight.

    Parameters
    ----------
    text : iterable of str
        The documents to vectorise.
    min_df, max_df : float, optional
        Passed straight to ``TfidfVectorizer``. The defaults are the values
        that were previously hard-coded.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        A table with columns ``Word`` and ``Score`` (the column-wise mean of
        the TF-IDF matrix) sorted by ``Score`` descending, and the dense
        TF-IDF matrix itself.
    """
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    vector_weights = vectorizer.fit_transform(text)
    weights = np.asarray(vector_weights.mean(axis=0)).ravel()
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = list(vectorizer.get_feature_names_out())
    else:
        words = vectorizer.get_feature_names()
    df = pd.DataFrame({"Word": words, "Score": weights})
    df = df.sort_values(by="Score", ascending=False)
    return df, vector_weights.toarray()
In [61]:
def Plot_Score(data,title):
    """Bar-plot the first ten rows of a ``Word`` / ``Score`` table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with the columns ``Word`` and ``Score``; only ``data.head(10)``
        is plotted.
    title : str
        Title of the figure.
    """
    sns.barplot(x="Word", y="Score", data=data.head(10))
    plt.title(title)
    plt.xlabel("Palabra")
    plt.ylabel("Score")
    # rotation must be a number (or 'vertical' / 'horizontal'); the string
    # "75" used before is rejected by current matplotlib releases.
    plt.xticks(rotation=75)
    plt.show()
# Score the cleaned articles; keep both the ranked table and the dense matrix.
Text_TfIdf,Text_Vector = Calculate_tfidf(noticias_df_completas['contenido limpio'])
Plot_Score(Text_TfIdf,"TF-IDF Top 10 palabras")
In [62]:
# Keep the rows that have a date, parse 'fechas' as datetimes and derive the
# month ('Mes') and year ('Año') columns from it.
noticias_df_completas = (
    noticias_df_completas
    .loc[lambda frame: frame['fechas'].notnull()]
    .assign(fechas=lambda frame: pd.to_datetime(frame['fechas']))
    .assign(**{
        'Mes': lambda frame: frame['fechas'].dt.month,
        'Año': lambda frame: frame['fechas'].dt.year,
    })
)
noticias_df_completas.head()
Out[62]:
In [63]:
from wordcloud import WordCloud
# Raise the global figure dpi from the 100 set earlier to 600; rcParams is
# global, so this applies to every figure created after this cell runs.
rcParams['figure.dpi'] = 600
def crear_wordcloud_mes_anio(data,mes,anio):
    """Show a word cloud of the articles of one month of one year.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have the columns ``Mes``, ``Año`` and ``contenido limpio``.
    mes : int
        Value matched against ``Mes``.
    anio : int
        Value matched against ``Año``.

    Prints how many rows matched. When no row matches, nothing is drawn.
    """
    data = data.loc[(data.Mes == mes) & (data.Año == anio)]
    print("Existen {} articulos en los datos para el mes {} del año {}.".format(data.shape[0],mes,anio))
    if data.empty:
        # With the real text there is nothing to draw; WordCloud needs at
        # least one word.
        return
    # Join the complete article texts. str(Series) - used before - yields
    # pandas' truncated repr (index labels, "...", the Name/dtype footer),
    # so the cloud was built from that repr instead of the articles.
    texto = " ".join(data['contenido limpio'].astype(str))
    wordcloud = WordCloud(background_color='white', max_words=200,
                          max_font_size=40, random_state=42).generate(texto)
    plt.figure(1)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
In [65]:
# Word cloud for month 9 of 2017.
crear_wordcloud_mes_anio(noticias_df_completas,9,2017)
In [ ]: