Define evaluation logic


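The evaluation used at the end of this notebook is the Pearson and Spearman correlation between predicted and gold intensity scores. A minimal helper sketch (illustrative only; the Analysis section calls `pearsonr`/`spearmanr` directly):

In [ ]:
from scipy.stats import pearsonr, spearmanr

def evaluate_intensity_predictions(y_pred, y_gold):
    # Returns (Pearson r, Spearman rho) between predicted and gold intensities.
    pearson_corr, _ = pearsonr(y_pred, y_gold)
    spearman_corr, _ = spearmanr(y_pred, y_gold)
    return pearson_corr, spearman_corr
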
In [ ]:
import time
import json
import pickle
import re
import html
import sys
import gc

import gensim

from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures, scale
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

import pandas as pd
from pandas import DataFrame

from scipy.stats import spearmanr, pearsonr
import numpy as np

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding, Input
from keras.layers.wrappers import Bidirectional
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adagrad
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D, AveragePooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from nltk import word_tokenize
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize.casual import TweetTokenizer

from IPython.display import display, HTML

In [ ]:
def browser_alert(message):
    display(HTML('<script type="text/javascript">alert("' + message + '");</script>'))
    
def browser_notify(message):
    display(HTML('<script type="text/javascript">var notification=new Notification("Jupyter Notification",{icon:"http://blog.jupyter.org/content/images/2015/02/jupyter-sq-text.png",body:"' + message + '"});</script>'))

In [ ]:
word_vector_path = "/home/v2john/"
wassa_home = "/home/v2john/WASSA-Task/"

Word2Vec + GloVe


In [ ]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    model = {}
    with open(gloveFile, 'r') as f:
        for num, line in enumerate(f, start=1):
            try:
                splitLine = line.split()
                word = splitLine[0]
                embedding = [float(val) for val in splitLine[1:]]
                model[word] = np.array(embedding)
            except Exception:
                print("Failed at line " + str(num))
    print("Done. " + str(len(model)) + " words loaded!")
    return model

In [ ]:
# Google news pretrained vectors
wv_model_path = word_vector_path + "GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True, unicode_errors='ignore')

In [ ]:
# Twitter pretrained vectors
wv_model_path_1 = word_vector_path + "word2vec_twitter_model.bin"
wv_model_1 = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path_1, binary=True, unicode_errors='ignore')

In [ ]:
wv_model_path_2 = word_vector_path + "glove.twitter.27B.200d.txt"
wv_model_2 = loadGloveModel(wv_model_path_2)

In [ ]:
wv_model_path_3 = word_vector_path + "glove.6B.300d.txt"
wv_model_3 = loadGloveModel(wv_model_path_3)

In [ ]:
wv_model_path_4 = word_vector_path + "glove.42B.300d.txt"
wv_model_4 = loadGloveModel(wv_model_path_4)

In [ ]:
wv_model_path_5 = word_vector_path + "glove.840B.300d.txt"
wv_model_5 = loadGloveModel(wv_model_path_5)

In [ ]:
w2v_dimensions = len(wv_model['word'])
w2v_dimensions_1 = len(wv_model_1['word'])
w2v_dimensions_2 = len(wv_model_2['word'])
w2v_dimensions_3 = len(wv_model_3['word'])
w2v_dimensions_4 = len(wv_model_4['word'])
w2v_dimensions_5 = len(wv_model_5['word'])

print(w2v_dimensions, w2v_dimensions_1, w2v_dimensions_2, w2v_dimensions_3, w2v_dimensions_4, w2v_dimensions_5)

In [ ]:
def get_word2vec_embedding(word, model, dimensions):

    vec_rep = np.zeros(dimensions)
    if word in model:
        vec_rep = model[word]
    
    return vec_rep

In [ ]:
# get_word2vec_embedding("charm", wv_model_2, 200)

Data Cleaning


In [ ]:
wnl = WordNetLemmatizer()
tknzr = TweetTokenizer()

In [ ]:
english_stopwords = set(stopwords.words('english'))

def remove_stopwords(string):
    split_string = \
        [word for word in string.split()
         if word not in english_stopwords]
    
    return " ".join(split_string)

In [ ]:
def clean_str(string):  
    string = html.unescape(string)
    string = string.replace("\\n", " ")
    string = string.replace("_NEG", "")
    string = string.replace("_NEGFIRST", "")
    string = re.sub(r"@[A-Za-z0-9_(),!?\'\`]+", " ", string) # removing any twitter handle mentions
    string = re.sub(r"\d+", " ", string) # removing any words with numbers
    string = re.sub(r"_", " ", string)
    string = re.sub(r":", " ", string)
    string = re.sub(r"/", " ", string)
    string = re.sub(r"#", " ", string)
    string = re.sub(r"\.", " ", string)
    string = re.sub(r"\*", " ", string)
    string = re.sub(r"\'s", " ", string)
    string = re.sub(r"\'m", " am", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"n\’t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\’re", " are", string)
    string = re.sub(r"\'d", " would", string)
    string = re.sub(r"\’d", " would", string)
    string = re.sub(r"\'ll", " will", string)
    string = re.sub(r"\’ll", " will", string)
    string = re.sub(r"'", " ", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"!", " !", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\?", " ?", string)
    string = re.sub(r"-", " ", string)
    string = re.sub(r"<", " ", string)
    string = re.sub(r">", " ", string)
    string = re.sub(r";", " ", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())
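
A quick illustration (hedged, since the exact output depends on the installed NLTK stopword list) of what the cleaning pipeline does to a raw tweet:

In [ ]:
# Hypothetical example: the mention and hashtag symbol are stripped, the
# contraction is expanded, punctuation is spaced or removed, text is
# lower-cased, and English stopwords are dropped.
print(clean_str("@friend I can't believe it's SO good!!! #blessed"))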

Metadata and Class Definitions


In [ ]:
class Tweet(object):

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        return \
            "id: " + str(self.id) + \
            ", text: " + str(self.text) + \
            ", emotion: " + str(self.emotion) + \
            ", intensity: " + str(self.intensity)

In [ ]:
def read_training_data(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], list(tknzr.tokenize(clean_str(array[1]))), 
                                    array[2], float(array[3])))
    return train_list
            
def read_training_data_verbatim(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], array[1], array[2], float(array[3])))
    return train_list
    
def read_test_data(test_data_file_path):

    test_list = list()
    with open(test_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
    return test_list

In [ ]:
non_linear_factor = PolynomialFeatures(3)

In [ ]:
emotion = "anger"

In [ ]:
training_data_file_path = \
    wassa_home + "dataset/" + \
    emotion + "-ratings-0to1.train.txt"
predictions_file_path = \
    wassa_home + "predictions/" + \
    emotion + "-pred.txt"
dev_set_path = \
    wassa_home + "dataset/dev-set/" + \
    emotion + "-ratings-0to1.dev.gold.txt"
test_data_file_path = \
    wassa_home + "dataset/test-set/" + \
    emotion + "-ratings-0to1.test.gold.txt"
debug_file_path = \
    wassa_home + "dataset/test-set/debug/" + \
    emotion + ".tsv"
word_embeddings_path = "/home/v2john/" + emotion + "-word-embeddings.pkl"

Feature Extraction Snippets

Emoji Intensity


In [ ]:
with open(wassa_home + 'lexicons/emoji_map.json') as emoji_file:
    emoji_list = json.load(emoji_file)
    
emoji_dict = dict()

for emoji in emoji_list:
    emoji_dict[emoji["emoji"]] = (emoji["name"], emoji["polarity"])

In [ ]:
def get_emoji_intensity(word):
    
    score = 0.0
    if word in emoji_dict.keys():
        score = float(emoji_dict[word][1])
    
    vec_rep = np.array([score])
    
    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
get_emoji_intensity("💯")

Emotion Intensity Lexicon


In [ ]:
affect_intensity_file_path = \
    wassa_home + \
    "lexicons/NRC-AffectIntensity-Lexicon.txt"
    
def get_word_affect_intensity_dict(emotion):
    word_intensities = dict()

    with open(affect_intensity_file_path) as affect_intensity_file:
        for line in affect_intensity_file:
            word_int_array = line.replace("\n", "").split("\t")

            if (word_int_array[2] == emotion):
                word_intensities[word_int_array[0]] = float(word_int_array[1])

    return word_intensities

In [ ]:
word_intensities = get_word_affect_intensity_dict(emotion)

In [ ]:
def get_emo_int_vector(word):
    
    score = 0.0
    if word in word_intensities.keys():
        score = float(word_intensities[word])
        
    vec_rep = np.array([score])
    
    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
# get_emo_int_vector('fury')

SentiWordNet


In [ ]:
def get_sentiwordnetscore(word):
    
    vec_rep = np.zeros(2)
    
    synsetlist = list(swn.senti_synsets(word))

    if synsetlist:
        vec_rep[0] = synsetlist[0].pos_score()
        vec_rep[1] = synsetlist[0].neg_score()

    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
# get_sentiwordnetscore("fury")

Sentiment Emotion Presence Lexicon


In [ ]:
sentiment_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emotion-Lexicon-v0.92/" + \
    "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

def get_affect_presence_list(emotion):
    word_list = list()
    
    with open(sentiment_emotion_lex_file_path) as sentiment_emotion_lex_file:
        for line in sentiment_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")

            if (word_array[1] == emotion and word_array[2] == '1'):
                word_list.append(word_array[0])
                
    return word_list

In [ ]:
sentiment_emotion_lex_word_list = get_affect_presence_list(emotion)

In [ ]:
def get_sentiment_emotion_feature(word):
    
    score = 0.0
    if word in sentiment_emotion_lex_word_list:
        score = 1.0
    vec_rep = np.array([score])
    
    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
# get_sentiment_emotion_feature("fury")

Hashtag Emotion Intensity


In [ ]:
hashtag_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/" + \
    "NRC-Hashtag-Emotion-Lexicon-v0.2.txt"
    
def get_hashtag_emotion_intensity(emotion):
    hashtag_intensities = dict()
    
    with open(hashtag_emotion_lex_file_path) as hashtag_emotion_lex_file:
        for line in hashtag_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if (word_array[0] == emotion):
                hashtag_intensities[word_array[1]] = float(word_array[2])
                
    return hashtag_intensities

In [ ]:
hashtag_emotion_intensities = get_hashtag_emotion_intensity(emotion)

In [ ]:
def get_hashtag_emotion_vector(word):
    
    score = 0.0
    
    if word in hashtag_emotion_intensities.keys():
        score = float(hashtag_emotion_intensities[word])
        
    vec_rep = np.array([score])
            
    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
# get_hashtag_emotion_vector("#fury")

Emoticon Sentiment Lexicon


In [ ]:
emoticon_lexicon_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-unigrams.txt"
emoticon_lexicon_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-bigrams.txt"
    
emoticon_lexicon_unigrams = dict()
emoticon_lexicon_bigrams = dict()

def get_emoticon_lexicon_unigram_dict():
    with open(emoticon_lexicon_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_unigrams

def get_emoticon_lexicon_bigram_dict():
    with open(emoticon_lexicon_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_bigrams

In [ ]:
emoticon_lexicon_unigram_dict = get_emoticon_lexicon_unigram_dict()

In [ ]:
emoticon_lexicon_bigram_dict = get_emoticon_lexicon_bigram_dict()

In [ ]:
def get_unigram_sentiment_emoticon_lexicon_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_lexicon_unigram_dict.keys():
        vec_rep = emoticon_lexicon_unigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

def get_bigram_sentiment_emoticon_lexicon_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_lexicon_bigram_dict.keys():
        vec_rep = emoticon_lexicon_bigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
get_unigram_sentiment_emoticon_lexicon_vector("fury")

In [ ]:
get_bigram_sentiment_emoticon_lexicon_vector("add everyone")

Emoticon Sentiment Aff-Neg Lexicon


In [ ]:
emoticon_afflex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
    "Emoticon-AFFLEX-NEGLEX-unigrams.txt"
emoticon_afflex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
    "Emoticon-AFFLEX-NEGLEX-bigrams.txt"
    
emoticon_afflex_unigrams = dict()
emoticon_afflex_bigrams = dict()

def get_emoticon_afflex_unigram_dict():
    with open(emoticon_afflex_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_unigrams

def get_emoticon_afflex_bigram_dict():
    with open(emoticon_afflex_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_bigrams

In [ ]:
emoticon_afflex_unigram_dict = get_emoticon_afflex_unigram_dict()

In [ ]:
emoticon_afflex_bigram_dict = get_emoticon_afflex_bigram_dict()

In [ ]:
def get_unigram_sentiment_emoticon_afflex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_afflex_unigram_dict.keys():
        vec_rep = emoticon_afflex_unigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

def get_bigram_sentiment_emoticon_afflex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_afflex_bigram_dict.keys():
        vec_rep = emoticon_afflex_bigram_dict[word]
    
    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
get_unigram_sentiment_emoticon_afflex_vector("fury")

In [ ]:
# get_bigram_sentiment_emoticon_afflex_vector("pay vip")

Hashtag Sentiment Aff-Neg Lexicon


In [ ]:
hashtag_affneglex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
    "HS-AFFLEX-NEGLEX-unigrams.txt"
hashtag_affneglex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
    "HS-AFFLEX-NEGLEX-bigrams.txt"
    
hashtag_affneglex_unigrams = dict()
hashtag_affneglex_bigrams = dict()

def get_hashtag_affneglex_unigram_dict():
    with open(hashtag_affneglex_unigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hashtag_affneglex_unigrams

def get_hashtag_affneglex_bigram_dict():
    with open(hashtag_affneglex_bigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])

    return hashtag_affneglex_bigrams

In [ ]:
hashtag_affneglex_unigram_dict = get_hashtag_affneglex_unigram_dict()

In [ ]:
hashtag_affneglex_bigram_dict = get_hashtag_affneglex_bigram_dict()

In [ ]:
def get_unigram_sentiment_hashtag_affneglex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in hashtag_affneglex_unigram_dict.keys():
        vec_rep = hashtag_affneglex_unigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

def get_bigram_sentiment_hashtag_affneglex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in hashtag_affneglex_bigram_dict.keys():
        vec_rep = hashtag_affneglex_bigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
get_unigram_sentiment_hashtag_affneglex_vector("#great")

In [ ]:
get_bigram_sentiment_hashtag_affneglex_vector("#good luck")

Hashtag Sentiment Lexicon


In [ ]:
hash_sent_lex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-unigrams.txt"
hash_sent_lex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-bigrams.txt"

def get_hash_sent_lex_unigram_dict():
    
    hash_sent_lex_unigrams = dict()
    with open(hash_sent_lex_unigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_unigrams

def get_hash_sent_lex_bigram_dict():

    hash_sent_lex_bigrams = dict()
    with open(hash_sent_lex_bigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_bigrams

In [ ]:
hash_sent_lex_unigram_dict = get_hash_sent_lex_unigram_dict()

In [ ]:
hash_sent_lex_bigram_dict = get_hash_sent_lex_bigram_dict()

In [ ]:
def get_unigram_sentiment_hash_sent_lex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in hash_sent_lex_unigram_dict.keys():
        vec_rep = hash_sent_lex_unigram_dict[word]
        
    return non_linear_factor.fit_transform([vec_rep])[0]


def get_bigram_sentiment_hash_sent_lex_vector(word):

    vec_rep = np.zeros(3)
    if word in hash_sent_lex_bigram_dict.keys():
        vec_rep = hash_sent_lex_bigram_dict[word]
            
    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
# get_unigram_sentiment_hash_sent_lex_vector("#fabulous")

In [ ]:
# get_bigram_sentiment_hash_sent_lex_vector(". #perfection")

Depeche Mood


In [ ]:
depeche_mood_file_path = \
    wassa_home + \
    "lexicons/DepecheMood_V1.0/DepecheMood_normfreq.txt"

In [ ]:
def get_depeche_vector_dict():
    depeche_vector_dict = dict()
    with open(depeche_mood_file_path) as depeche_mood_file:
        for line in depeche_mood_file:
            word_array = line.replace("\n", "").split("\t")
            depeche_vector_dict[word_array[0].split("#")[0]] = np.array([float(val) for val in word_array[1:]])
    
    return depeche_vector_dict

In [ ]:
depeche_vector_dict = get_depeche_vector_dict()

In [ ]:
def get_depeche_mood_vector(word):
    
    vec_rep = np.zeros(8)
    if word in depeche_vector_dict.keys():
        vec_rep = np.array(depeche_vector_dict[word])

    return non_linear_factor.fit_transform([vec_rep])[0]

In [ ]:
# get_depeche_mood_vector("120th")

Reading & Vectorizing Data


In [ ]:
#     print(embedding_features.shape)
    
#     lexicon_features = get_unigram_embedding(word, embedding_info[0], unigram_feature_string)
#     poly_lexicon_features = non_linear_factor.fit_transform([lexicon_features])[0]
#     print(poly_lexicon_features.shape)

#     final_features = np.concatenate((embedding_features, poly_lexicon_features))
#     print(final_features.shape)

In [ ]:
def is_active_vector_method(string):
    return int(string)

def learn_unigram_word_embedding(word):
    
    word_feature_embedding_dict = dict()
    
    '''Pre-trained Word embeddings'''
    index = 0
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model, w2v_dimensions)

    index = 1
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_1, w2v_dimensions_1)

    index = 2
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_2, w2v_dimensions_2)

    index = 3
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_3, w2v_dimensions_3)

    index = 4
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_4, w2v_dimensions_4)

    index = 5
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_5, w2v_dimensions_5)

    '''NRC Emotion Intensity Lexicon'''
    index = 6
    word_feature_embedding_dict[index] = get_emo_int_vector(word)

    '''WordNet'''
    index = 7
    word_feature_embedding_dict[index] = get_sentiwordnetscore(word)

    '''NRC Sentiment Lexica'''
    index = 8
    word_feature_embedding_dict[index] = get_sentiment_emotion_feature(word)

    index = 9
    word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_lexicon_vector(word)

    index = 10
    word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_afflex_vector(word)

    '''NRC Hashtag Lexica'''
    index = 11
    word_feature_embedding_dict[index] = get_hashtag_emotion_vector(word)

    index = 12
    word_feature_embedding_dict[index] = get_unigram_sentiment_hash_sent_lex_vector(word)

    index = 13
    word_feature_embedding_dict[index] = get_unigram_sentiment_hashtag_affneglex_vector(word)

    '''Emoji Polarities'''
    index = 14
    word_feature_embedding_dict[index] = get_emoji_intensity(word)
    
    '''Depeche Mood'''
    index = 15
    word_feature_embedding_dict[index] = get_depeche_mood_vector(word)

    return word_feature_embedding_dict


def learn_bigram_word_embedding(word):
    
    word_feature_embedding_dict = dict()
    
    '''NRC Sentiment Lexica'''

    index = 0
    word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_lexicon_vector(word)

    index = 1
    word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_afflex_vector(word)

    '''NRC Hashtag Lexica'''
    index = 2
    word_feature_embedding_dict[index] = get_bigram_sentiment_hash_sent_lex_vector(word)

    index = 3
    word_feature_embedding_dict[index] = get_bigram_sentiment_hashtag_affneglex_vector(word)

    return word_feature_embedding_dict

In [ ]:
def get_unigram_embedding(word, word_embedding_dict, bin_string):
    
    word_feature_embedding_dict = word_embedding_dict[word]
    final_embedding = np.array([])
    
    for i in range(16):
        if is_active_vector_method(bin_string[i]):
            final_embedding = np.append(final_embedding, word_feature_embedding_dict[i])
    
    return final_embedding

def get_bigram_embedding(bigram, word_embedding_dict, bin_string):
    
    word_feature_embedding_dict = word_embedding_dict[bigram]
    final_embedding = np.array([])
    
    for i in range(4):
        if is_active_vector_method(bin_string[i]):
            final_embedding = np.append(final_embedding, word_feature_embedding_dict[i])
    
    return final_embedding

In [ ]:
unigram_feature_string = "1111111111111111"
bigram_feature_string = "1111"

In [ ]:
training_tweets = read_training_data(training_data_file_path)
dev_tweets = read_training_data(dev_set_path)

score_train = list()
tweet_train = list()
for tweet in training_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))

for tweet in dev_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))
    
print(len(score_train))
score_train = np.asarray(score_train)

In [ ]:
raw_test_tweets = read_training_data_verbatim(test_data_file_path)
test_tweets = read_training_data(test_data_file_path)

tweet_test_raw = list()
tweet_test = list()
y_gold = list()

for tweet in raw_test_tweets:
    tweet_test_raw.append(tweet.text)

for tweet in test_tweets:
    tweet_test.append(tweet.text)
    y_gold.append(float(tweet.intensity))
    
print(len(y_gold))

In [ ]:
def build_word_embeddings(tweets):
    
    max_tweet_length = -1
    word_embedding_dict = dict()

    for tweet in tweets:
        if len(tweet) > max_tweet_length:
            max_tweet_length = len(tweet)

        for token in tweet:
            if token not in word_embedding_dict.keys():
                word_embedding_dict[token] = learn_unigram_word_embedding(token)
                
    return word_embedding_dict, max_tweet_length

In [ ]:
# all_tweets = tweet_train + tweet_test
# embedding_info = build_word_embeddings(all_tweets)

In [ ]:
# # Save vectors
# with open(word_embeddings_path, 'wb') as word_embeddings_file:
#     pickle.dump(embedding_info, word_embeddings_file)

In [ ]:
browser_notify("Persisted to disk")

In [ ]:
# Restore vectors
with open(word_embeddings_path, 'rb') as word_embeddings_file:
    embedding_info = pickle.load(word_embeddings_file)

In [ ]:
embeddings_index = embedding_info[0]
MAX_SEQUENCE_LENGTH = embedding_info[1]
MAX_NB_WORDS = 20000
EMBEDDING_DIM = len(get_unigram_embedding("glad", embedding_info[0], unigram_feature_string))
print(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

In [ ]:
word_indices = dict()
current_index = 1

In [ ]:
def sequence_tweets(tweets):
    global current_index
    vectors = list()
    for tweet in tweets:        
        vector = list()
        for word in tweet:
            word_index = None
            
            if word in word_indices:
                word_index = word_indices[word]
            else:
                word_index = current_index
                current_index += 1
                word_indices[word] = word_index
            
            vector.append(word_index)
        
        vectors.append(vector)

    return vectors
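
For intuition, a hypothetical example of the index assignment (assuming a fresh word_indices dict and current_index of 1):

In [ ]:
# Illustrative only:
# sequence_tweets([["so", "angry"], ["angry", "tweet"]])
# would yield [[1, 2], [2, 3]], with "angry" mapped to index 2 in both tweets.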

In [ ]:
x_train = sequence_tweets(tweet_train)
x_test = sequence_tweets(tweet_test)

In [ ]:
len(word_indices)

In [ ]:
# display(tweet_train)

In [ ]:
# Row 0 is the all-zeros padding vector; subsequent rows follow word-index order.
word_embedding_matrix = list()
word_embedding_matrix.append(np.zeros(EMBEDDING_DIM))

for word in sorted(word_indices, key=word_indices.get):
    embedding_features = get_unigram_embedding(word, embedding_info[0], unigram_feature_string)
    word_embedding_matrix.append(embedding_features)

word_embedding_matrix = np.asarray(word_embedding_matrix, dtype='f')

In [ ]:
word_embedding_matrix.shape

In [ ]:
word_embedding_matrix = scale(word_embedding_matrix)

In [ ]:
browser_notify("Vectorization Done")

Recurrent Neural Network Implementation in Keras


In [ ]:
pre_padding = 6

# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH with zeros at the end,
# then prepend pre_padding additional zeros to each sequence.
x_train = sequence.pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")
x_test = sequence.pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")

x_train = sequence.pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH + pre_padding, padding="pre")
x_test = sequence.pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH + pre_padding, padding="pre")

In [ ]:
len(x_train), len(x_test), len(x_train[0])

In [ ]:
embed_1 = Embedding(len(word_indices) + 1, EMBEDDING_DIM, weights=[word_embedding_matrix], 
                    input_length=MAX_SEQUENCE_LENGTH + pre_padding, trainable=True)

conv_1 = Conv1D(128, 3, activation='relu', name='conv1')
conv_2 = Conv1D(128, 3, activation='relu', name='conv2')
conv_3 = Conv1D(256, 3, activation='relu', name='conv3')
conv_4 = Conv1D(256, 3, activation='relu', name='conv4')
conv_5 = Conv1D(256, 3, activation='relu', name='conv5')
conv_6 = Conv1D(1024, 3, activation='relu', name='conv6')
conv_7 = Conv1D(1024, 3, activation='relu', name='conv7')
conv_8 = Conv1D(1024, 3, activation='relu', name='conv8')

pool_1 = AveragePooling1D(pool_size=3, strides=2, name='pool1')
pool_2 = AveragePooling1D(pool_size=3, strides=2, name='pool2')
pool_3 = MaxPooling1D(pool_size=3, strides=2, name='pool3')
pool_4 = MaxPooling1D(pool_size=3, strides=2, name='pool4')

lstm_1 = LSTM(256, dropout=0.2, recurrent_dropout=0.2, name='lstm1', return_sequences=True)
lstm_2 = LSTM(128, dropout=0.2, recurrent_dropout=0.2, name='lstm2', return_sequences=True)
lstm_3 = LSTM(64, dropout=0.2, recurrent_dropout=0.2, name='lstm3')

gru_1 = GRU(256, dropout=0.25, recurrent_dropout=0.25, name='gru1', return_sequences=True)
gru_2 = GRU(256, dropout=0.25, recurrent_dropout=0.25, name='gru2', return_sequences=True)
gru_3 = GRU(256, dropout=0.25, recurrent_dropout=0.25, name='gru3')

bi_lstm_1 = Bidirectional(lstm_1, name='bilstm1')
bi_lstm_2 = Bidirectional(lstm_2, name='bilstm2')
bi_lstm_3 = Bidirectional(lstm_3, name='bilstm3')

dense_1 = Dense(20000, activation='relu', name='dense1')
dense_2 = Dense(1, activation='sigmoid', name='dense2')

drop_1 = Dropout(0.5, name='drop1')
drop_2 = Dropout(0.5, name='drop2')

In [ ]:
def get_rnn_model():
    
    model = Sequential()
    
    model.add(embed_1)
    
    model.add(conv_1)
    model.add(conv_2)
#     model.add(pool_1)
    
#     model.add(conv_3)
#     model.add(conv_4)
#     model.add(pool_2)
    
#     model.add(conv_5)
    
    model.add(lstm_1)
    model.add(lstm_2)
    model.add(lstm_3)
    
    model.add(dense_1)
#     model.add(drop_1)
    
    model.add(dense_2)

    model.compile(loss='mean_squared_error', optimizer="adam")
    
    return model
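
An optional sanity check (a sketch, not part of the original pipeline): the architecture and parameter counts can be inspected before fitting.

In [ ]:
# Optional: inspect layer output shapes and parameter counts.
# Note the layers above are shared module-level objects, so avoid building
# the model more than once before training.
# get_rnn_model().summary()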

In [ ]:
nn_model = KerasRegressor(build_fn=get_rnn_model, epochs=50, batch_size=64, verbose=1)

score_train = np.asarray(score_train)

# ml_model = AdaBoostRegressor(base_estimator=nn_model)

nn_model.fit(x_train, score_train) # , epochs=100, batch_size=64, verbose=1)
y_pred = nn_model.predict(x_test)

In [ ]:
browser_notify("NN Training Done")

In [ ]:
y_pred = np.reshape(y_pred, len(y_pred))

In [ ]:
print(emotion)
print(pearsonr(y_pred, y_gold))
print(spearmanr(y_pred, y_gold))

In [ ]:
# with open(predictions_file_path, 'w') as predictions_file:
#     for i in range(len(y_pred)):        
#         predictions_file.write(
#             str(raw_test_tweets[i].id) + "\t" + \
#             raw_test_tweets[i].text + "\t" + \
#             raw_test_tweets[i].emotion + "\t" + \
#             str(y_pred[i]) + "\n"
#         )

In [ ]:
browser_notify("Neural Net Trained")

Analysis


In [ ]:
def get_predictions_list(y_pred, y_gold):
    predictions = list()
    for i in range(len(y_gold)):
        prediction = {
            "raw_tweet": tweet_test_raw[i], 
            "cleaned_tweet": tweet_test[i], 
            "prediction": y_pred[i], 
            "actual": y_gold[i], 
            "diff": (y_gold[i] - y_pred[i])
        }
        predictions.append(prediction)
        
    return predictions

In [ ]:
predictions = get_predictions_list(y_pred, y_gold)

In [ ]:
pred_df = DataFrame(predictions)
pred_df = pred_df[["raw_tweet", "cleaned_tweet", "prediction", "actual", "diff"]]
pred_df = pred_df.sort_values(by=['diff'], ascending=[True])

In [ ]:
display(pred_df)

In [ ]:
with open(debug_file_path, 'w') as debug_file:
    pred_df.to_csv(debug_file, sep='\t')
