In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
There are problems, no matter what method you use.
This approach assumes a bag-of-words model: the order of the words does not matter. We use the work from Dodds et al. (http://www.pnas.org/content/112/8/2389.abstract). Each word has a happiness level; the happiness of the text is the average happiness of all the words involved.
In [87]:
def get_happiness_raw(string_study, lang="english"):
    """Score the average happiness (valence) of a text with the labMT word list.

    Available languages: german portuguese arabic hindi russian chinese
    indonesian spanish english korean urdu french pashto

    Parameters
    ----------
    string_study : str
        Text to score.
    lang : str
        Language of the labMT lexicon to load (default "english").

    Returns
    -------
    The average happiness score over all scored words, as returned by
    ``labMTsimple.storyLab.emotion``.
    """
    from labMTsimple.storyLab import emotionFileReader, emotion
    # stopval=0.0 keeps every word of the lexicon (no neutral-word filtering).
    lexicon, lexicon_vector, _word_list = emotionFileReader(
        stopval=0.0, lang=lang, returnVector=True)
    # Score the text; the per-word frequency vector is computed but unused here.
    valence, _freq_vec = emotion(
        string_study, lexicon, shift=True, happsList=lexicon_vector)
    return valence
def get_happiness_delete_neutral(string_study, lang="english"):
    """Score the average happiness of a text, excluding near-neutral words.

    Same as ``get_happiness_raw`` but words whose labMT score lies within
    1.0 of neutral are removed (labMT "stopper" filtering) before averaging.

    Available languages: german portuguese arabic hindi russian chinese
    indonesian spanish english korean urdu french pashto

    Parameters
    ----------
    string_study : str
        Text to score.
    lang : str
        Language of the labMT lexicon to load (default "english").

    Returns
    -------
    The average happiness score over the remaining (non-neutral) words.
    """
    from labMTsimple.storyLab import emotionFileReader, emotion, stopper, emotionV
    # stopval=0.0 loads the full lexicon; filtering happens afterwards.
    lexicon, lexicon_vector, word_list = emotionFileReader(
        stopval=0.0, lang=lang, returnVector=True)
    _raw_valence, freq_vec = emotion(
        string_study, lexicon, shift=True, happsList=lexicon_vector)
    # Drop words whose score is within 1.0 of neutral, then re-average.
    filtered_vec = stopper(freq_vec, lexicon_vector, word_list, stopVal=1.0)
    return emotionV(filtered_vec, lexicon_vector)
# Score every (country, language) pair from the tab-separated input file.
countries_happ = []
languages_happ = []
countries = []
with open("./data/country_language.csv") as f:
    for line in f:
        try:
            # BUG FIX: strip the trailing newline first — the original
            # split kept "\n" attached to the language field, so the
            # language string scored was e.g. "spanish\n".
            country, language = line.rstrip("\n").split("\t")
        except ValueError:
            # Skip malformed lines (not exactly two tab-separated fields);
            # the original bare `except` hid every other error as well.
            continue
        countries_happ.append(get_happiness_raw(country))
        languages_happ.append(get_happiness_raw(language))
        countries.append(country)
In [58]:
import pylab as plt
plt.plot(countries_happ,languages_happ,"o")
plt.ylabel("Perceived happiness of the language")
plt.xlabel("Perceived happiness of the country")
plt.show()
In [83]:
countries_happ_proc = []
languages_happ_proc = []
for i in range(len(countries_happ)):
if countries_happ[i] > 0 and languages_happ[i] > 0:
countries_happ_proc.append(countries_happ[i])
languages_happ_proc.append(languages_happ[i])
plt.plot(countries_happ_proc,languages_happ_proc,"o")
plt.ylabel("Perceived happiness of the language")
plt.xlabel("Perceived happiness of the country")
plt.show()
In [81]:
from random import random
countries_happ_proc = []
languages_happ_proc = []
countries_proc = []
for i in range(len(countries_happ)):
if countries_happ[i] > 0 and languages_happ[i] > 0:
countries_happ_proc.append(countries_happ[i])
languages_happ_proc.append(languages_happ[i])
countries_proc.append(countries[i])
plt.figure(figsize=(15,9))
for (x,y,name) in list(zip(countries_happ_proc,languages_happ_proc,countries_proc)):
plt.annotate(name,(x+0.02,y+random()/10-0.05))
plt.plot(countries_happ_proc,languages_happ_proc,"o")
plt.ylabel("Perceived happiness of the language")
plt.xlabel("Perceived happiness of the country")
plt.ylim((5,7))
plt.show()
In [90]:
from random import random
countries_happ_proc = []
languages_happ_proc = []
countries_proc = []
for i in range(len(countries_happ)):
if countries_happ[i] > 0 and languages_happ[i] > 0 and len(countries[i].split()) == 1:
countries_happ_proc.append(countries_happ[i])
languages_happ_proc.append(languages_happ[i])
countries_proc.append(countries[i])
plt.figure(figsize=(15,9))
for (x,y,name) in list(zip(countries_happ_proc,languages_happ_proc,countries_proc)):
plt.annotate(name,(x+0.02,y+random()/10-0.05))
plt.plot(countries_happ_proc,languages_happ_proc,"o")
plt.ylabel("Perceived happiness of the language")
plt.xlabel("Perceived happiness of the country")
plt.ylim((5,7))
plt.show()
In [ ]:
# All together
In [104]:
# Delete punctuation
def remove_punctuation(string_to_remove):
    """Remove all ASCII punctuation from a string and lowercase the result."""
    # A {codepoint: None} translation table deletes those characters.
    no_punct = string_to_remove.translate(
        {ord(ch): None for ch in string.punctuation})
    return no_punct.lower()
def remove_stop_words(text):
    """Drop every whitespace-separated word found in the module-level
    ``cached_stop`` set and rejoin the remainder with single spaces."""
    kept_words = [word for word in text.split() if word not in cached_stop]
    return ' '.join(kept_words)
In [2]:
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from labMTsimple.storyLab import emotionFileReader, emotion, stopper, emotionV
import string
from dateutil.parser import parse
from nltk.stem.snowball import SnowballStemmer
"""
Available languages:
german portuguese
arabic hindi
russian chinese
indonesian spanish
english korean
urdu french
pashto """
# --- Configuration for the news-article sentiment pipeline ---
language = "spanish"  # language of both the articles and the stop-word list
separator_csv = "\t" #tab
header = None  # input CSVs carry no header row
index_col = None  # no column is used as the DataFrame index
columns_csv = ["Date","Newspaper","Article"]  # names assigned to the three columns
column_text = "Article" #leave empty to not get the valence
column_date = "Date" #leave empty to not parse the date
dayfirst = True  # dates are written day-first (e.g. 31/12/2015)
# Names of the politicians/parties under study, lowercased and removed as
# extra stop words so they do not contribute to the happiness scores.
extra_stop_words = [_.lower() for _ in ["Podemos","Pablo","Iglesias","Ciudadanos","Albert","Rivera","Pedro","Sánchez","PSOE","Mariano","Rajoy","PP"]]
# One input CSV per party leader.
filenames = ["./data/Pedro_Sanchez_PSOE.csv","./data/Pablo_Iglesias_Podemos.csv","./data/Albert_Rivera_Ciudadanos.csv","./data/Mariano_Rajoy_PP.csv"]
# Combined stop-word set: NLTK's list for `language` plus the names above.
cached_stop = set(stopwords.words(language) + extra_stop_words)
## Fill up
# For each input CSV: score every article's happiness, parse its date,
# and write the result next to the original as "<name>_formatted.csv".
for filename in filenames:
    print(filename)
    df = pd.read_csv(filename, sep=separator_csv, header=header, index_col=index_col)
    df.columns = columns_csv
    if column_text:
        valences_articles = []
        for article in df[column_text]:
            try:
                t_article = remove_punctuation(article)
                t_article = remove_stop_words(t_article)
                valence = get_happiness_delete_neutral(t_article, language)
            except Exception:
                # Unscorable article (e.g. missing/NaN text): record NaN.
                valence = np.NaN
            valences_articles.append(valence)
        df["Valence"] = valences_articles
    if column_date:
        dates = []
        for date in df[column_date]:
            try:
                date = parse(date, fuzzy=True, dayfirst=dayfirst)
            except (ValueError, TypeError, OverflowError):
                # BUG FIX: the original assigned `valence = pd.NaT` here,
                # which left the raw, unparsed string in `date` and
                # appended it to the column; mark the date itself missing.
                date = pd.NaT
            dates.append(date)
        df["Date_parsed"] = dates
    df.to_csv(filename[:-4] + "_formatted.csv", sep=separator_csv, index=index_col)
In [164]:
from statsmodels.nonparametric.smoothers_lowess import lowess
def plot_valence(filename, color="blue", smoothing=2/3, print_points=True):
    """Plot a lowess-smoothed daily-mean valence curve for one input file.

    Reads the "<name>_formatted.csv" produced by the formatting loop, keeps
    only rows with a positive Valence, averages per day, and draws the
    smoothed trend on the current figure — optionally with the raw daily
    points on top.

    Parameters
    ----------
    filename : str
        Path to the ORIGINAL csv; "_formatted.csv" is derived from it.
    color : str
        Matplotlib color for both curve and points.
    smoothing : float
        `frac` parameter passed to statsmodels' lowess.
    print_points : bool
        Whether to also plot the unsmoothed daily means.
    """
    formatted_name = filename[:-4] + "_formatted.csv"
    df = pd.read_csv(formatted_name, sep=separator_csv, index_col=index_col)
    # Dates were serialized as text; parse them back into datetimes.
    df["Date_parsed"] = df["Date_parsed"].apply(
        lambda x: parse(x, yearfirst=True, dayfirst=dayfirst))
    df = df.loc[df["Valence"] > 0, :]
    df = df.groupby("Date_parsed").mean().reset_index()
    smoothed = lowess(df["Valence"], df["Date_parsed"],
                      frac=smoothing, return_sorted=False)
    plt.plot(df["Date_parsed"], smoothed, color=color)
    if print_points:
        plt.plot(df["Date_parsed"], df["Valence"], "o", color=color)
smoothing = 0.2
plt.figure(figsize=(15,5))
filename = "./data/Pablo_Iglesias_Podemos.csv"
plot_valence(filename,color="#912CEE",smoothing=smoothing,print_points=print_points)
filename = "./data/Albert_Rivera_Ciudadanos.csv"
plot_valence(filename,color="orange",smoothing=smoothing,print_points=print_points)
filename = "./data/Mariano_Rajoy_PP.csv"
plot_valence(filename,color="#00BFFF",smoothing=smoothing,print_points=print_points)
filename = "./data/Pedro_Sanchez_PSOE.csv"
plot_valence(filename,color="red",smoothing=smoothing,print_points=print_points)
elections = parse("Dec 20, 2015")
plt.plot([elections,elections],[6.1,6.4],color="grey")
plt.show()