In [1]:
import os
import sys
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from scipy.stats import spearmanr
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
%matplotlib inline
from emoint.featurizers.emoint_featurizer import EmoIntFeaturizer
from sklearn.preprocessing import StandardScaler, RobustScaler
from emoint.ensembles.blending import blend
from sklearn.linear_model import Ridge
In [2]:
def get_xy(path, tokenizer, featurizer):
df = pd.read_csv(path, header=None, sep='\t')
tweets = df[1]
intensities = df[3]
X = []
for t in tweets:
tokens = tokenizer.tokenize(t)
features = featurizer.featurize(tokens)
X.append(features)
X, y = np.array(X), np.array(intensities)
print("Shapes X: {}, y: {}".format(X.shape, y.shape))
return X, y
def metrics(y_pred, y, print_metrics=False):
p1 = pearsonr(y_pred, y)[0]
s1 = spearmanr(y_pred, y)[0]
ind = np.where(y >= 0.5)
ydt = np.take(y_pred, ind).reshape(-1)
ydpt = np.take(y, ind).reshape(-1)
p2 = pearsonr(ydt, ydpt)[0]
s2 = spearmanr(ydt, ydpt)[0]
if print_metrics:
print("Validation Pearsonr: {}".format(p1))
print("Validation Spearmanr: {}".format(s1))
print("Validation Pearsonr >= 0.5: {}".format(p2))
print("Validation Spearmanr >= 0.5: {}".format(s2))
return np.array((p1, s1, p2, s2))
def train(train_path, dev_path, featurizer, tokenizer):
X_train, y_train = get_xy(train_path, tokenizer, featurizer)
X_dev, y_dev = get_xy(dev_path, tokenizer, featurizer)
regr = GradientBoostingRegressor()
regr.fit(X_train, y_train)
y_dev_pred = regr.predict(X_dev)
metrics(y_dev_pred, y_dev, True)
return regr, featurizer.features
In [3]:
featurizer = EmoIntFeaturizer()
tokenizer = TweetTokenizer()
print("---------------- ANGER --------------------")
anger_regr, anger_features = train('../resources/emoint/anger-ratings-0to1.train.txt',
'../resources/emoint/anger-ratings-0to1.dev.gold.txt', featurizer, tokenizer)
print("---------------- FEAR --------------------")
fear_regr, fear_features = train('../resources/emoint/fear-ratings-0to1.train.txt',
'../resources/emoint/fear-ratings-0to1.dev.gold.txt', featurizer, tokenizer)
print("---------------- JOY --------------------")
joy_regr, joy_features = train('../resources/emoint/joy-ratings-0to1.train.txt',
'../resources/emoint/joy-ratings-0to1.dev.gold.txt', featurizer, tokenizer)
print("---------------- SADNESS --------------------")
sadness_regr, sadness_features = train('../resources/emoint/sadness-ratings-0to1.train.txt',
'../resources/emoint/sadness-ratings-0to1.dev.gold.txt', featurizer, tokenizer)
In [4]:
def plot_fig(f, regr, labels, title, num, subnum, cnt):
ax = f.add_subplot(subnum)
f.subplots_adjust(hspace=.4)
if subnum % 2 == 0:
ax.yaxis.tick_right()
indices = np.argsort(regr.feature_importances_)[-10:]
plt.barh(np.arange(len(indices)), regr.feature_importances_[indices], color=colors[cnt])
plt.yticks(np.arange(len(indices)) + 0.75/2 , np.array(labels)[indices])
plt.xlabel('Relative importance')
plt.title(title)
In [5]:
colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3']
f = plt.figure(figsize=(10, 6))
plot_fig(f, anger_regr, anger_features, 'Anger', 1, 221, 0)
plot_fig(f, fear_regr, fear_features, 'FEAR', 1, 222, 1)
plot_fig(f, joy_regr, joy_features, 'JOY', 2, 223, 2)
plot_fig(f, sadness_regr, sadness_features, 'SADNESS', 2, 224, 3)
In [6]:
def blend_train(train_path, dev_path, featurizer, tokenizer, clfs):
featurizer = EmoIntFeaturizer()
tokenizer = TweetTokenizer()
X_train, y_train = get_xy(train_path, tokenizer, featurizer)
X_dev, y_dev = get_xy(dev_path, tokenizer, featurizer)
y_dev_pred = blend(X_train, y_train, X_dev, clfs, regr=True, blend_clf=Ridge())
metrics(y_dev_pred, y_dev, True)
In [7]:
clfs = [RandomForestRegressor(), ExtraTreesRegressor(), BaggingRegressor(), GradientBoostingRegressor()]
print("---------------- ANGER --------------------")
blend_train('../resources/emoint/anger-ratings-0to1.train.txt',
'../resources/emoint/anger-ratings-0to1.dev.gold.txt', featurizer, tokenizer, clfs)
print("---------------- FEAR --------------------")
blend_train('../resources/emoint/fear-ratings-0to1.train.txt',
'../resources/emoint/fear-ratings-0to1.dev.gold.txt', featurizer, tokenizer, clfs)
print("---------------- JOY --------------------")
blend_train('../resources/emoint/joy-ratings-0to1.train.txt',
'../resources/emoint/joy-ratings-0to1.dev.gold.txt', featurizer, tokenizer, clfs)
print("---------------- SADNESS --------------------")
blend_train('../resources/emoint/sadness-ratings-0to1.train.txt',
'../resources/emoint/sadness-ratings-0to1.dev.gold.txt', featurizer, tokenizer, clfs)