In [ ]:
import time
import json
import pickle
import re
import html
import sys
import gc
import gensim
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, scale
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
import pandas as pd
from pandas import DataFrame
from scipy.stats import spearmanr, pearsonr
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding, Input
from keras.layers.wrappers import Bidirectional
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adagrad
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D, AveragePooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize.casual import TweetTokenizer
from IPython.display import display, HTML
In [ ]:
def browser_alert(message):
display(HTML('<script type="text/javascript">alert("' + message + '");</script>'))
def browser_notify(message):
display(HTML('<script type="text/javascript">var notification=new Notification("Jupyter Notification",{icon:"http://blog.jupyter.org/content/images/2015/02/jupyter-sq-text.png",body:"' + message + '"});</script>'))
In [ ]:
word_vector_path = "/home/v2john/"
wassa_home = "/home/v2john/WASSA-Task/"
In [ ]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    model = {}
    with open(gloveFile, 'r') as f:
        for num, line in enumerate(f, start=1):
            try:
                splitLine = line.split()
                word = splitLine[0]
                embedding = [float(val) for val in splitLine[1:]]
                model[word] = np.array(embedding)
            except Exception as e:
                print("Failed at line " + str(num) + ": " + str(e))
    print("Done.", len(model), " words loaded!")
    return model
In [ ]:
# Google news pretrained vectors
wv_model_path = word_vector_path + "GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True, unicode_errors='ignore')
In [ ]:
# Twitter pretrained vectors
wv_model_path_1 = word_vector_path + "word2vec_twitter_model.bin"
wv_model_1 = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path_1, binary=True, unicode_errors='ignore')
In [ ]:
wv_model_path_2 = word_vector_path + "glove.twitter.27B.200d.txt"
wv_model_2 = loadGloveModel(wv_model_path_2)
In [ ]:
wv_model_path_3 = word_vector_path + "glove.6B.300d.txt"
wv_model_3 = loadGloveModel(wv_model_path_3)
In [ ]:
wv_model_path_4 = word_vector_path + "glove.42B.300d.txt"
wv_model_4 = loadGloveModel(wv_model_path_4)
In [ ]:
wv_model_path_5 = word_vector_path + "glove.840B.300d.txt"
wv_model_5 = loadGloveModel(wv_model_path_5)
In [ ]:
w2v_dimensions = len(wv_model['word'])
w2v_dimensions_1 = len(wv_model_1['word'])
w2v_dimensions_2 = len(wv_model_2['word'])
w2v_dimensions_3 = len(wv_model_3['word'])
w2v_dimensions_4 = len(wv_model_4['word'])
w2v_dimensions_5 = len(wv_model_5['word'])
print(w2v_dimensions, w2v_dimensions_1, w2v_dimensions_2, w2v_dimensions_3, w2v_dimensions_4, w2v_dimensions_5)
In [ ]:
def get_word2vec_embedding(word, model, dimensions):
vec_rep = np.zeros(dimensions)
if word in model:
vec_rep = model[word]
return vec_rep
In [ ]:
# get_word2vec_embedding("charm", wv_model_2, 200)
In [ ]:
wnl = WordNetLemmatizer()
tknzr = TweetTokenizer()
In [ ]:
def remove_stopwords(string):
    # build the stopword set once per call instead of re-reading it for every token
    stop_words = set(stopwords.words('english'))
    split_string = [word for word in string.split()
                    if word not in stop_words]
    return " ".join(split_string)
In [ ]:
def clean_str(string):
string = html.unescape(string)
string = string.replace("\\n", " ")
string = string.replace("_NEG", "")
string = string.replace("_NEGFIRST", "")
string = re.sub(r"@[A-Za-z0-9_(),!?\'\`]+", " ", string) # removing any twitter handle mentions
string = re.sub(r"\d+", " ", string) # removing any words with numbers
string = re.sub(r"_", " ", string)
string = re.sub(r":", " ", string)
string = re.sub(r"/", " ", string)
string = re.sub(r"#", " ", string)
string = re.sub(r"\.", " ", string)
string = re.sub(r"\*", " ", string)
string = re.sub(r"\'s", " ", string)
string = re.sub(r"\'m", " am", string)
string = re.sub(r"\'ve", " have", string)
string = re.sub(r"n\'t", " not", string)
string = re.sub(r"n\’t", " not", string)
string = re.sub(r"\'re", " are", string)
string = re.sub(r"\’re", " are", string)
string = re.sub(r"\'d", " would", string)
string = re.sub(r"\’d", " would", string)
string = re.sub(r"\'ll", " will", string)
string = re.sub(r"\’ll", " will", string)
string = re.sub(r"'", " ", string)
string = re.sub(r",", "", string)
string = re.sub(r"!", " !", string)
string = re.sub(r"\(", "", string)
string = re.sub(r"\)", "", string)
string = re.sub(r"\?", " ?", string)
string = re.sub(r"-", " ", string)
string = re.sub(r"<", " ", string)
string = re.sub(r">", " ", string)
string = re.sub(r";", " ", string)
string = re.sub(r"\s{2,}", " ", string)
return remove_stopwords(string.strip().lower())
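In [ ]:
# Illustrative sanity check of the cleaning pipeline (hypothetical input; assumes the
# NLTK 'stopwords' corpus has been downloaded). Mentions and '#' are stripped,
# contractions expanded, punctuation spaced out, and stopwords dropped, so
# clean_str("I'm so furious!! @someone #rage") should come out roughly as "furious ! ! rage".
# clean_str("I'm so furious!! @someone #rage")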
In [ ]:
class Tweet(object):
def __init__(self, id, text, emotion, intensity):
self.id = id
self.text = text
self.emotion = emotion
self.intensity = intensity
    def __repr__(self):
        return \
            "id: " + str(self.id) + \
            ", text: " + str(self.text) + \
            ", emotion: " + str(self.emotion) + \
            ", intensity: " + str(self.intensity)
In [ ]:
def read_training_data(training_data_file_path):
train_list = list()
with open(training_data_file_path) as input_file:
for line in input_file:
line = line.strip()
array = line.split('\t')
train_list.append(Tweet(array[0], list(tknzr.tokenize(clean_str(array[1]))),
array[2], float(array[3])))
return train_list
def read_training_data_verbatim(training_data_file_path):
train_list = list()
with open(training_data_file_path) as input_file:
for line in input_file:
line = line.strip()
array = line.split('\t')
train_list.append(Tweet(array[0], array[1], array[2], float(array[3])))
return train_list
def read_test_data(training_data_file_path):
test_list = list()
with open(training_data_file_path) as input_file:
for line in input_file:
line = line.strip()
array = line.split('\t')
test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
return test_list
In [ ]:
non_linear_factor = PolynomialFeatures(3)
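In [ ]:
# PolynomialFeatures(3) expands each lexicon feature vector with all monomials up to
# degree 3 (including a bias term), e.g. a single score s becomes [1, s, s^2, s^3].
# Quick illustration with a hypothetical score of 0.5:
# non_linear_factor.fit_transform([[0.5]])[0]  # -> array([1.0, 0.5, 0.25, 0.125])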
In [ ]:
emotion = "anger"
In [ ]:
training_data_file_path = \
wassa_home + "dataset/" + \
emotion + "-ratings-0to1.train.txt"
predictions_file_path = \
wassa_home + "predictions/" + \
emotion + "-pred.txt"
dev_set_path = \
wassa_home + "dataset/dev-set/" + \
emotion + "-ratings-0to1.dev.gold.txt"
test_data_file_path = \
wassa_home + "dataset/test-set/" + \
emotion + "-ratings-0to1.test.gold.txt"
debug_file_path = \
wassa_home + "dataset/test-set/debug/" + \
emotion + ".tsv"
word_embeddings_path = "/home/v2john/" + emotion + "-word-embeddings.pkl"
In [ ]:
with open(wassa_home + 'lexicons/emoji_map.json') as emoji_file:
emoji_list = json.load(emoji_file)
emoji_dict = dict()
for emoji in emoji_list:
emoji_dict[emoji["emoji"]] = (emoji["name"], emoji["polarity"])
In [ ]:
def get_emoji_intensity(word):
score = 0.0
if word in emoji_dict.keys():
score = float(emoji_dict[word][1])
    vec_rep = np.array([score])
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
get_emoji_intensity("💯")
In [ ]:
affect_intensity_file_path = \
wassa_home + \
"lexicons/NRC-AffectIntensity-Lexicon.txt"
def get_word_affect_intensity_dict(emotion):
word_intensities = dict()
with open(affect_intensity_file_path) as affect_intensity_file:
for line in affect_intensity_file:
word_int_array = line.replace("\n", "").split("\t")
if (word_int_array[2] == emotion):
word_intensities[word_int_array[0]] = float(word_int_array[1])
return word_intensities
In [ ]:
word_intensities = get_word_affect_intensity_dict(emotion)
In [ ]:
def get_emo_int_vector(word):
score = 0.0
if word in word_intensities.keys():
score = float(word_intensities[word])
vec_rep = np.array([score])
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
# get_emo_int_vector('fury')
In [ ]:
def get_sentiwordnetscore(word):
vec_rep = np.zeros(2)
synsetlist = list(swn.senti_synsets(word))
if synsetlist:
vec_rep[0] = synsetlist[0].pos_score()
vec_rep[1] = synsetlist[0].neg_score()
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
# get_sentiwordnetscore("fury")
In [ ]:
sentiment_emotion_lex_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emotion-Lexicon-v0.92/" + \
"NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
def get_affect_presence_list(emotion):
word_list = list()
with open(sentiment_emotion_lex_file_path) as sentiment_emotion_lex_file:
for line in sentiment_emotion_lex_file:
word_array = line.replace("\n", "").split("\t")
if (word_array[1] == emotion and word_array[2] == '1'):
word_list.append(word_array[0])
return word_list
In [ ]:
sentiment_emotion_lex_word_list = get_affect_presence_list(emotion)
In [ ]:
def get_sentiment_emotion_feature(word):
score = 0.0
if word in sentiment_emotion_lex_word_list:
score = 1.0
vec_rep = np.array([score])
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
# get_sentiment_emotion_feature("fury")
In [ ]:
hashtag_emotion_lex_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/" + \
"NRC-Hashtag-Emotion-Lexicon-v0.2.txt"
def get_hashtag_emotion_intensity(emotion):
    hashtag_intensities = dict()
    with open(hashtag_emotion_lex_file_path) as hashtag_emotion_lex_file:
        for line in hashtag_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if (word_array[0] == emotion):
                hashtag_intensities[word_array[1]] = float(word_array[2])
    return hashtag_intensities
In [ ]:
hashtag_emotion_intensities = get_hashtag_emotion_intensity(emotion)
In [ ]:
def get_hashtag_emotion_vector(word):
score = 0.0
if word in hashtag_emotion_intensities.keys():
score = float(hashtag_emotion_intensities[word])
vec_rep = np.array([score])
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
# get_hashtag_emotion_vector("#fury")
In [ ]:
emoticon_lexicon_unigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-unigrams.txt"
emoticon_lexicon_bigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-bigrams.txt"
emoticon_lexicon_unigrams = dict()
emoticon_lexicon_bigrams = dict()
def get_emoticon_lexicon_unigram_dict():
with open(emoticon_lexicon_unigrams_file_path) as emoticon_lexicon_file:
for line in emoticon_lexicon_file:
word_array = line.replace("\n", "").split("\t")
emoticon_lexicon_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return emoticon_lexicon_unigrams
def get_emoticon_lexicon_bigram_dict():
with open(emoticon_lexicon_bigrams_file_path) as emoticon_lexicon_file:
for line in emoticon_lexicon_file:
word_array = line.replace("\n", "").split("\t")
emoticon_lexicon_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return emoticon_lexicon_bigrams
In [ ]:
emoticon_lexicon_unigram_dict = get_emoticon_lexicon_unigram_dict()
In [ ]:
emoticon_lexicon_bigram_dict = get_emoticon_lexicon_bigram_dict()
In [ ]:
def get_unigram_sentiment_emoticon_lexicon_vector(word):
vec_rep = np.zeros(3)
if word in emoticon_lexicon_unigram_dict.keys():
vec_rep = emoticon_lexicon_unigram_dict[word]
return non_linear_factor.fit_transform([vec_rep])[0]
def get_bigram_sentiment_emoticon_lexicon_vector(word):
vec_rep = np.zeros(3)
if word in emoticon_lexicon_bigram_dict.keys():
vec_rep = emoticon_lexicon_bigram_dict[word]
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
get_unigram_sentiment_emoticon_lexicon_vector("fury")
In [ ]:
get_bigram_sentiment_emoticon_lexicon_vector("add everyone")
In [ ]:
emoticon_afflex_unigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
"Emoticon-AFFLEX-NEGLEX-unigrams.txt"
emoticon_afflex_bigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
"Emoticon-AFFLEX-NEGLEX-bigrams.txt"
emoticon_afflex_unigrams = dict()
emoticon_afflex_bigrams = dict()
def get_emoticon_afflex_unigram_dict():
with open(emoticon_afflex_unigrams_file_path) as emoticon_lexicon_file:
for line in emoticon_lexicon_file:
word_array = line.replace("\n", "").split("\t")
emoticon_afflex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return emoticon_afflex_unigrams
def get_emoticon_afflex_bigram_dict():
with open(emoticon_afflex_bigrams_file_path) as emoticon_lexicon_file:
for line in emoticon_lexicon_file:
word_array = line.replace("\n", "").split("\t")
emoticon_afflex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return emoticon_afflex_bigrams
In [ ]:
emoticon_afflex_unigram_dict = get_emoticon_afflex_unigram_dict()
In [ ]:
emoticon_afflex_bigram_dict = get_emoticon_afflex_bigram_dict()
In [ ]:
def get_unigram_sentiment_emoticon_afflex_vector(word):
vec_rep = np.zeros(3)
if word in emoticon_afflex_unigram_dict.keys():
vec_rep = emoticon_afflex_unigram_dict[word]
return non_linear_factor.fit_transform([vec_rep])[0]
def get_bigram_sentiment_emoticon_afflex_vector(word):
vec_rep = np.zeros(3)
if word in emoticon_afflex_bigram_dict.keys():
vec_rep = emoticon_afflex_bigram_dict[word]
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
get_unigram_sentiment_emoticon_afflex_vector("fury")
In [ ]:
# get_bigram_sentiment_emoticon_afflex_vector("pay vip")
In [ ]:
hashtag_affneglex_unigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
"HS-AFFLEX-NEGLEX-unigrams.txt"
hashtag_affneglex_bigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
"HS-AFFLEX-NEGLEX-bigrams.txt"
hashtag_affneglex_unigrams = dict()
hashtag_affneglex_bigrams = dict()
def get_hashtag_affneglex_unigram_dict():
with open(hashtag_affneglex_unigrams_file_path) as hashtag_sent_lex_file:
for line in hashtag_sent_lex_file:
word_array = line.replace("\n", "").split("\t")
hashtag_affneglex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return hashtag_affneglex_unigrams
def get_hashtag_affneglex_bigram_dict():
with open(hashtag_affneglex_bigrams_file_path) as hashtag_sent_lex_file:
for line in hashtag_sent_lex_file:
word_array = line.replace("\n", "").split("\t")
hashtag_affneglex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return hashtag_affneglex_bigrams
In [ ]:
hashtag_affneglex_unigram_dict = get_hashtag_affneglex_unigram_dict()
In [ ]:
hashtag_affneglex_bigram_dict = get_hashtag_affneglex_bigram_dict()
In [ ]:
def get_unigram_sentiment_hashtag_affneglex_vector(word):
vec_rep = np.zeros(3)
if word in hashtag_affneglex_unigram_dict.keys():
vec_rep = hashtag_affneglex_unigram_dict[word]
return non_linear_factor.fit_transform([vec_rep])[0]
def get_bigram_sentiment_hashtag_affneglex_vector(word):
vec_rep = np.zeros(3)
if word in hashtag_affneglex_bigram_dict.keys():
vec_rep = hashtag_affneglex_bigram_dict[word]
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
get_unigram_sentiment_hashtag_affneglex_vector("#great")
In [ ]:
get_bigram_sentiment_hashtag_affneglex_vector("#good luck")
In [ ]:
hash_sent_lex_unigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-unigrams.txt"
hash_sent_lex_bigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-bigrams.txt"
def get_hash_sent_lex_unigram_dict():
hash_sent_lex_unigrams = dict()
with open(hash_sent_lex_unigrams_file_path) as hash_sent_lex_file:
for line in hash_sent_lex_file:
word_array = line.replace("\n", "").split("\t")
if clean_str(word_array[0]):
hash_sent_lex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return hash_sent_lex_unigrams
def get_hash_sent_lex_bigram_dict():
hash_sent_lex_bigrams = dict()
with open(hash_sent_lex_bigrams_file_path) as hash_sent_lex_file:
for line in hash_sent_lex_file:
word_array = line.replace("\n", "").split("\t")
if clean_str(word_array[0]):
hash_sent_lex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return hash_sent_lex_bigrams
In [ ]:
hash_sent_lex_unigram_dict = get_hash_sent_lex_unigram_dict()
In [ ]:
hash_sent_lex_bigram_dict = get_hash_sent_lex_bigram_dict()
In [ ]:
def get_unigram_sentiment_hash_sent_lex_vector(word):
vec_rep = np.zeros(3)
if word in hash_sent_lex_unigram_dict.keys():
vec_rep = hash_sent_lex_unigram_dict[word]
return non_linear_factor.fit_transform([vec_rep])[0]
def get_bigram_sentiment_hash_sent_lex_vector(word):
vec_rep = np.zeros(3)
if word in hash_sent_lex_bigram_dict.keys():
vec_rep = hash_sent_lex_bigram_dict[word]
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
# get_unigram_sentiment_hash_sent_lex_vector("#fabulous")
In [ ]:
# get_bigram_sentiment_hash_sent_lex_vector(". #perfection")
In [ ]:
depeche_mood_file_path = \
wassa_home + \
"lexicons/DepecheMood_V1.0/DepecheMood_normfreq.txt"
In [ ]:
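# DepecheMood rows look like "lemma#PoS" followed by its emotion scores (eight columns,
# matching get_depeche_mood_vector below); the PoS suffix is stripped here so lookups
# can be done on the bare lemma.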
def get_depeche_vector_dict():
depeche_vector_dict = dict()
with open(depeche_mood_file_path) as depeche_mood_file:
for line in depeche_mood_file:
word_array = line.replace("\n", "").split("\t")
depeche_vector_dict[word_array[0].split("#")[0]] = np.array([float(val) for val in word_array[1:]])
return depeche_vector_dict
In [ ]:
depeche_vector_dict = get_depeche_vector_dict()
In [ ]:
def get_depeche_mood_vector(word):
vec_rep = np.zeros(8)
if word in depeche_vector_dict.keys():
vec_rep = np.array(depeche_vector_dict[word])
return non_linear_factor.fit_transform([vec_rep])[0]
In [ ]:
# get_depeche_mood_vector("120th")
In [ ]:
# print(embedding_features.shape)
# lexicon_features = get_unigram_embedding(word, embedding_info[0], unigram_feature_string)
# poly_lexicon_features = non_linear_factor.fit_transform([lexicon_features])[0]
# print(poly_lexicon_features.shape)
# final_features = np.concatenate((embedding_features, poly_lexicon_features))
# print(final_features.shape)
In [ ]:
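# Each word is mapped to a dict of 16 candidate feature vectors: six pre-trained word
# embeddings (indices 0-5) plus ten lexicon-derived features (indices 6-15). A binary
# mask string (see unigram_feature_string below) later selects which of these are
# concatenated into the final per-word embedding.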
def is_active_vector_method(string):
return int(string)
def learn_unigram_word_embedding(word):
word_feature_embedding_dict = dict()
'''Pre-trained Word embeddings'''
index = 0
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model, w2v_dimensions)
index = 1
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_1, w2v_dimensions_1)
index = 2
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_2, w2v_dimensions_2)
index = 3
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_3, w2v_dimensions_3)
index = 4
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_4, w2v_dimensions_4)
index = 5
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_5, w2v_dimensions_5)
'''NRC Emotion Intensity Lexicon'''
index = 6
word_feature_embedding_dict[index] = get_emo_int_vector(word)
'''WordNet'''
index = 7
word_feature_embedding_dict[index] = get_sentiwordnetscore(word)
'''NRC Sentiment Lexica'''
index = 8
word_feature_embedding_dict[index] = get_sentiment_emotion_feature(word)
index = 9
word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_lexicon_vector(word)
index = 10
word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_afflex_vector(word)
'''NRC Hashtag Lexica'''
index = 11
word_feature_embedding_dict[index] = get_hashtag_emotion_vector(word)
index = 12
word_feature_embedding_dict[index] = get_unigram_sentiment_hash_sent_lex_vector(word)
index = 13
word_feature_embedding_dict[index] = get_unigram_sentiment_hashtag_affneglex_vector(word)
'''Emoji Polarities'''
index = 14
word_feature_embedding_dict[index] = get_emoji_intensity(word)
'''Depeche Mood'''
index = 15
word_feature_embedding_dict[index] = get_depeche_mood_vector(word)
return word_feature_embedding_dict
def learn_bigram_word_embedding(word):
word_feature_embedding_dict = dict()
'''NRC Sentiment Lexica'''
index = 0
word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_lexicon_vector(word)
index = 1
word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_afflex_vector(word)
'''NRC Hashtag Lexica'''
index = 2
word_feature_embedding_dict[index] = get_bigram_sentiment_hash_sent_lex_vector(word)
index = 3
word_feature_embedding_dict[index] = get_bigram_sentiment_hashtag_affneglex_vector(word)
return word_feature_embedding_dict
In [ ]:
def get_unigram_embedding(word, word_embedding_dict, bin_string):
word_feature_embedding_dict = word_embedding_dict[word]
final_embedding = np.array([])
for i in range(16):
if is_active_vector_method(bin_string[i]):
final_embedding = np.append(final_embedding, word_feature_embedding_dict[i])
return final_embedding
def get_bigram_embedding(bigram, word_embedding_dict, bin_string):
    word_feature_embedding_dict = word_embedding_dict[bigram]
final_embedding = np.array([])
for i in range(4):
if is_active_vector_method(bin_string[i]):
final_embedding = np.append(final_embedding, word_feature_embedding_dict[i])
return final_embedding
In [ ]:
unigram_feature_string = "1111111111111111"
bigram_feature_string = "1111"
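# Each character of these strings toggles one feature from the learn_*_word_embedding
# dicts: "1" keeps the feature at that index, "0" drops it. For example, a hypothetical
# ablation that keeps only the six pre-trained word embeddings would be:
# unigram_feature_string = "1111110000000000"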
In [ ]:
training_tweets = read_training_data(training_data_file_path)
dev_tweets = read_training_data(dev_set_path)
score_train = list()
tweet_train = list()
for tweet in training_tweets:
tweet_train.append(tweet.text)
score_train.append(float(tweet.intensity))
for tweet in dev_tweets:
tweet_train.append(tweet.text)
score_train.append(float(tweet.intensity))
print(len(score_train))
score_train = np.asarray(score_train)
In [ ]:
raw_test_tweets = read_training_data_verbatim(test_data_file_path)
test_tweets = read_training_data(test_data_file_path)
tweet_test_raw = list()
tweet_test = list()
y_gold = list()
for tweet in raw_test_tweets:
tweet_test_raw.append(tweet.text)
for tweet in test_tweets:
tweet_test.append(tweet.text)
y_gold.append(float(tweet.intensity))
print(len(y_gold))
In [ ]:
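# Pre-computes the per-word feature dicts for every token in the corpus (an expensive,
# one-off pass over all the lexica and embedding models) and records the longest tweet,
# which later becomes MAX_SEQUENCE_LENGTH.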
def build_word_embeddings(tweets):
max_tweet_length = -1
word_embedding_dict = dict()
for tweet in tweets:
if len(tweet) > max_tweet_length:
max_tweet_length = len(tweet)
for token in tweet:
if token not in word_embedding_dict.keys():
word_embedding_dict[token] = learn_unigram_word_embedding(token)
return word_embedding_dict, max_tweet_length
In [ ]:
# all_tweets = tweet_train + tweet_test
# embedding_info = build_word_embeddings(all_tweets)
In [ ]:
# # Save vectors
# with open(word_embeddings_path, 'wb') as word_embeddings_file:
# pickle.dump(embedding_info, word_embeddings_file)
In [ ]:
browser_notify("Persisted to disk")
In [ ]:
# Restore vectors
with open(word_embeddings_path, 'rb') as word_embeddings_file:
embedding_info = pickle.load(word_embeddings_file)
In [ ]:
embeddings_index = embedding_info[0]
MAX_SEQUENCE_LENGTH = embedding_info[1]
MAX_NB_WORDS = 20000
EMBEDDING_DIM = len(get_unigram_embedding("glad", embedding_info[0], unigram_feature_string))
print(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
In [ ]:
word_indices = dict()
current_index = 1
In [ ]:
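# Converts each token list into a sequence of integer word indices, assigning a new
# index the first time a word is seen. The same word_indices dict is shared across the
# train and test calls below so both use a consistent vocabulary.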
def sequence_tweets(tweets):
global current_index
vectors = list()
for tweet in tweets:
vector = list()
for word in tweet:
word_index = None
if word in word_indices:
word_index = word_indices[word]
else:
word_index = current_index
current_index += 1
word_indices[word] = word_index
vector.append(word_index)
vectors.append(vector)
return vectors
In [ ]:
x_train = sequence_tweets(tweet_train)
x_test = sequence_tweets(tweet_test)
In [ ]:
len(word_indices)
In [ ]:
# display(tweet_train)
In [ ]:
word_embedding_matrix = list()
word_embedding_matrix.append(np.zeros(EMBEDDING_DIM))
for word in sorted(word_indices, key=word_indices.get):
embedding_features = get_unigram_embedding(word, embedding_info[0], unigram_feature_string)
word_embedding_matrix.append(embedding_features)
word_embedding_matrix = np.asarray(word_embedding_matrix, dtype='f')
In [ ]:
word_embedding_matrix.shape
In [ ]:
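# Column-wise standardization of the embedding matrix; note this also shifts the
# all-zero padding row (index 0) away from zero.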
word_embedding_matrix = scale(word_embedding_matrix)
In [ ]:
browser_notify("Vectorization Done")
In [ ]:
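# Pad/truncate every sequence to a fixed length: first post-pad (and post-truncate) to
# MAX_SEQUENCE_LENGTH, then prepend `pre_padding` extra zeros, presumably to give the
# width-3 convolutions some leading context before the first real token.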
pre_padding = 6
x_train = sequence.pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")
x_test = sequence.pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")
x_train = sequence.pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH + pre_padding, padding="pre")
x_test = sequence.pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH + pre_padding, padding="pre")
In [ ]:
len(x_train), len(x_test), len(x_train[0])
In [ ]:
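# A pool of candidate layers for experimentation; get_rnn_model() below only wires up a
# subset (embedding, two convolutions, three stacked LSTMs, two dense layers), while the
# remaining convs, pools, GRUs, bidirectional wrappers and dropouts are alternatives to
# swap in.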
embed_1 = Embedding(len(word_indices) + 1, EMBEDDING_DIM, weights=[word_embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH + pre_padding, trainable=True)
conv_1 = Conv1D(128, 3, activation='relu', name='conv1')
conv_2 = Conv1D(128, 3, activation='relu', name='conv2')
conv_3 = Conv1D(256, 3, activation='relu', name='conv3')
conv_4 = Conv1D(256, 3, activation='relu', name='conv4')
conv_5 = Conv1D(256, 3, activation='relu', name='conv5')
conv_6 = Conv1D(1024, 3, activation='relu', name='conv6')
conv_7 = Conv1D(1024, 3, activation='relu', name='conv7')
conv_8 = Conv1D(1024, 3, activation='relu', name='conv8')
pool_1 = AveragePooling1D(pool_size=3, strides=2, name='pool1')
pool_2 = AveragePooling1D(pool_size=3, strides=2, name='pool2')
pool_3 = MaxPooling1D(pool_size=3, strides=2, name='pool3')
pool_4 = MaxPooling1D(pool_size=3, strides=2, name='pool4')
lstm_1 = LSTM(256, dropout=0.2, recurrent_dropout=0.2, name='lstm1', return_sequences=True)
lstm_2 = LSTM(128, dropout=0.2, recurrent_dropout=0.2, name='lstm2', return_sequences=True)
lstm_3 = LSTM(64, dropout=0.2, recurrent_dropout=0.2, name='lstm3')
gru_1 = GRU(256, dropout=0.25, recurrent_dropout=0.25, name='gru1', return_sequences=True)
gru_2 = GRU(256, dropout=0.25, recurrent_dropout=0.25, name='gru2', return_sequences=True)
gru_3 = GRU(256, dropout=0.25, recurrent_dropout=0.25, name='gru3')
bi_lstm_1 = Bidirectional(lstm_1, name='bilstm1')
bi_lstm_2 = Bidirectional(lstm_2, name='bilstm2')
bi_lstm_3 = Bidirectional(lstm_3, name='bilstm3')
dense_1 = Dense(20000, activation='relu', name='dense1')
dense_2 = Dense(1, activation='sigmoid', name='dense2')
drop_1 = Dropout(0.5, name='drop1')
drop_2 = Dropout(0.5, name='drop2')
In [ ]:
def get_rnn_model():
model = Sequential()
model.add(embed_1)
model.add(conv_1)
model.add(conv_2)
# model.add(pool_1)
# model.add(conv_3)
# model.add(conv_4)
# model.add(pool_2)
# model.add(conv_5)
model.add(lstm_1)
model.add(lstm_2)
model.add(lstm_3)
model.add(dense_1)
# model.add(drop_1)
model.add(dense_2)
model.compile(loss='mean_squared_error', optimizer="adam")
return model
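In [ ]:
# Optional sanity check of the architecture before training (uncomment to run; note that
# get_rnn_model() reuses the layer objects defined above, so building it more than once
# in the same session will share those layers).
# get_rnn_model().summary()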
In [ ]:
nn_model = KerasRegressor(build_fn=get_rnn_model, epochs=50, batch_size=64, verbose=1)
score_train = np.asarray(score_train)
# ml_model = AdaBoostRegressor(base_estimator=nn_model)
nn_model.fit(x_train, score_train) # , epochs=100, batch_size=64, verbose=1)
y_pred = nn_model.predict(x_test)
In [ ]:
browser_notify("NN Training Done")
In [ ]:
y_pred = np.reshape(y_pred, len(y_pred))
In [ ]:
print(emotion)
print(pearsonr(y_pred, y_gold))
print(spearmanr(y_pred, y_gold))
In [ ]:
# with open(predictions_file_path, 'w') as predictions_file:
# for i in range(len(y_pred)):
# predictions_file.write(
# str(raw_test_tweets[i].id) + "\t" + \
# raw_test_tweets[i].text + "\t" + \
# raw_test_tweets[i].emotion + "\t" + \
# str(y_pred[i]) + "\n"
# )
In [ ]:
browser_notify("Neural Net Trained")
In [ ]:
def get_predictions_list(y_pred, y_gold):
predictions = list()
for i in range(len(y_gold)):
prediction = {
"raw_tweet": tweet_test_raw[i],
"cleaned_tweet": tweet_test[i],
"prediction": y_pred[i],
"actual": y_gold[i],
"diff": (y_gold[i] - y_pred[i])
}
predictions.append(prediction)
return predictions
In [ ]:
predictions = get_predictions_list(y_pred, y_gold)
In [ ]:
pred_df = DataFrame(predictions)
pred_df = pred_df[["raw_tweet", "cleaned_tweet", "prediction", "actual", "diff"]]
pred_df = pred_df.sort_values(by=['diff'], ascending=[True])
In [ ]:
display(pred_df)
In [ ]:
with open(debug_file_path, 'w') as debug_file:
pred_df.to_csv(debug_file, sep='\t')
In [ ]: