Define Evaluation Logic


In [ ]:
import time
import json
import pickle
import re
import html
import sys
import gc

import gensim

from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBRegressor

import pandas as pd
from pandas import DataFrame

import scipy.stats as st
import numpy as np

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding, Input
from keras import optimizers
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adagrad
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.recurrent import LSTM

from nltk import word_tokenize
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn

from IPython.display import display, HTML

In [ ]:
def browser_alert(message):
    display(HTML('<script type="text/javascript">alert("' + message + '");</script>'))
    
def browser_notify(message):
    display(HTML('<script type="text/javascript">var notification=new Notification("Jupyter Notification",{icon:"http://blog.jupyter.org/content/images/2015/02/jupyter-sq-text.png",body:"' + message + '"});</script>'))

In [ ]:
word_vector_path = "/home/v2john/"
wassa_home = "/home/v2john/WASSA-Task/"

In [ ]:
def evaluate(pred, gold):

    with open(pred, "rb") as f:
        pred_lines = f.readlines()

    with open(gold, "rb") as f:
        gold_lines = f.readlines()

    if len(pred_lines) == len(gold_lines):
        # align tweet ids with gold scores and predictions
        data_dic = {}

        for line in gold_lines:
            parts = line.decode().split('\t')
            if len(parts) == 4:
                data_dic[int(parts[0])] = [float(parts[3])]
            else:
                raise ValueError('Format problem.')

        for line in pred_lines:
            parts = line.decode().split('\t')
            if len(parts) == 4:
                if int(parts[0]) in data_dic:
                    try:
                        data_dic[int(parts[0])].append(float(parts[3]))
                    except ValueError:
                        # invalid predictions are replaced by a default value
                        data_dic[int(parts[0])].append(0.5)
                else:
                    raise ValueError('Invalid tweet id.')
            else:
                raise ValueError('Format problem.')

        # lists storing gold and prediction scores
        gold_scores = []
        pred_scores = []

        # lists storing gold and prediction scores where gold score >= 0.5
        gold_scores_range_05_1 = []
        pred_scores_range_05_1 = []

        for tweet_id in data_dic:
            if len(data_dic[tweet_id]) == 2:
                gold_scores.append(data_dic[tweet_id][0])
                pred_scores.append(data_dic[tweet_id][1])
                if data_dic[tweet_id][0] >= 0.5:
                    gold_scores_range_05_1.append(data_dic[tweet_id][0])
                    pred_scores_range_05_1.append(data_dic[tweet_id][1])
            else:
                raise ValueError('Repeated id in test data.')

        # return zero correlation if predictions are constant
        if np.std(pred_scores) == 0 or np.std(gold_scores) == 0:
            return (0, 0, 0, 0)

        pears_corr = st.pearsonr(pred_scores, gold_scores)[0]
        spear_corr = st.spearmanr(pred_scores, gold_scores)[0]

        pears_corr_range_05_1 = st.pearsonr(pred_scores_range_05_1, gold_scores_range_05_1)[0]
        spear_corr_range_05_1 = st.spearmanr(pred_scores_range_05_1, gold_scores_range_05_1)[0]

        return (pears_corr, spear_corr, pears_corr_range_05_1, spear_corr_range_05_1)
    else:
        raise ValueError('Predictions and gold data have different number of lines.')


def evaluate_lists(pred, gold):
    if len(pred) == len(gold):
        gold_scores = gold
        pred_scores = pred

        # lists storing gold and prediction scores where gold score >= 0.5
        gold_scores_range_05_1 = []
        pred_scores_range_05_1 = []

        for i in range(len(gold_scores)):
            if gold_scores[i] >= 0.5:
                gold_scores_range_05_1.append(gold_scores[i])
                pred_scores_range_05_1.append(pred_scores[i])

        # return zero correlation if predictions are constant
        if np.std(pred_scores) == 0 or np.std(gold_scores) == 0:
            return (0, 0, 0, 0)

        pears_corr = st.pearsonr(pred_scores, gold_scores)[0]
        spear_corr = st.spearmanr(pred_scores, gold_scores)[0]

        pears_corr_range_05_1 = st.pearsonr(pred_scores_range_05_1, gold_scores_range_05_1)[0]
        spear_corr_range_05_1 = st.spearmanr(pred_scores_range_05_1, gold_scores_range_05_1)[0]

        return np.array([pears_corr, spear_corr, pears_corr_range_05_1, spear_corr_range_05_1])
    else:
        raise ValueError('Predictions and gold data have different number of lines.')
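
A quick sanity check of evaluate_lists on toy score lists (a minimal sketch; the values below are made up purely for illustration):


In [ ]:
# Hypothetical toy example: returns Pearson, Spearman, and the same two
# correlations restricted to pairs whose gold score is >= 0.5
evaluate_lists([0.1, 0.4, 0.6, 0.9], [0.2, 0.3, 0.7, 0.8])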

Word2Vec + GloVe


In [ ]:
def loadGloveModel(gloveFile):
    print("Loading GloVe model")
    model = {}
    with open(gloveFile, 'r') as f:
        for num, line in enumerate(f, start=1):
            try:
                splitLine = line.split()
                word = splitLine[0]
                embedding = [float(val) for val in splitLine[1:]]
                model[word] = np.array(embedding)
            except Exception:
                print("Failed at line " + str(num))
    print("Done. " + str(len(model)) + " words loaded!")
    return model

In [ ]:
# Google news pretrained vectors
wv_model_path = word_vector_path + "GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True, unicode_errors='ignore')

In [ ]:
# Twitter pretrained vectors
wv_model_path_1 = word_vector_path + "word2vec_twitter_model.bin"
wv_model_1 = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path_1, binary=True, unicode_errors='ignore')

In [ ]:
wv_model_path_2 = word_vector_path + "glove.twitter.27B.200d.txt"
wv_model_2 = loadGloveModel(wv_model_path_2)

In [ ]:
wv_model_path_3 = word_vector_path + "glove.6B.300d.txt"
wv_model_3 = loadGloveModel(wv_model_path_3)

In [ ]:
wv_model_path_4 = word_vector_path + "glove.42B.300d.txt"
wv_model_4 = loadGloveModel(wv_model_path_4)

In [ ]:
wv_model_path_5 = word_vector_path + "glove.840B.300d.txt"
wv_model_5 = loadGloveModel(wv_model_path_5)

In [ ]:
w2v_dimensions = len(wv_model['word'])
w2v_dimensions_1 = len(wv_model_1['word'])
w2v_dimensions_2 = len(wv_model_2['word'])
w2v_dimensions_3 = len(wv_model_3['word'])
w2v_dimensions_4 = len(wv_model_4['word'])
w2v_dimensions_5 = len(wv_model_5['word'])
print(w2v_dimensions, w2v_dimensions_1, 
      w2v_dimensions_2, w2v_dimensions_3, w2v_dimensions_4,
      w2v_dimensions_5)

In [ ]:
def get_word2vec_embedding(word, model, dimensions):

    vec_rep = np.zeros(dimensions)
    if word in model:
        vec_rep = model[word]
    
    return vec_rep

In [ ]:
# get_word2vec_embedding("charm", wv_model_2, 200)

Data Cleaning


In [ ]:
def remove_stopwords(string):
    english_stopwords = set(stopwords.words('english'))
    split_string = \
        [word for word in string.split()
         if word not in english_stopwords]
    
    return " ".join(split_string)

In [ ]:
def clean_str(string):  
    string = html.unescape(string)
    string = string.replace("\\n", " ")
    string = string.replace("_NEG", "")
    string = string.replace("_NEGFIRST", "")
    string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string) # removing any twitter handle mentions
#     string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
#     string = re.sub(r"#", "", string)
    string = re.sub(r"\*", "", string)
    string = re.sub(r"\'s", "", string)
    string = re.sub(r"\'m", " am", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'d", " would", string)
    string = re.sub(r"\'ll", " will", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"!", " !", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\?", " ?", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())
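
A quick illustration of the cleaning pipeline on a made-up tweet (HTML entities are unescaped, @-mentions stripped, some contractions expanded, punctuation normalized, and English stopwords removed):


In [ ]:
# Hypothetical sample tweet, purely to illustrate clean_str
clean_str("@some_user I can't believe this... it is so ANNOYING!!! #fuming")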

Metadata and Class Definitions


In [ ]:
class Tweet(object):

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        return \
            "id: " + str(self.id) + \
            ", text: " + str(self.text) + \
            ", emotion: " + str(self.emotion) + \
            ", intensity: " + str(self.intensity)

In [ ]:
def read_training_data(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], clean_str(array[1]), array[2], float(array[3])))
    return train_list
            
def read_training_data_verbatim(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], array[1], array[2], float(array[3])))
    return train_list
    
def read_test_data(training_data_file_path):

    test_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
    return test_list

In [ ]:
emotion = "anger"

In [ ]:
training_data_file_path = \
    wassa_home + "dataset/" + \
    emotion + "-ratings-0to1.train.txt"
predictions_file_path = \
    wassa_home + "predictions/" + \
    emotion + "-pred.txt"
dev_set_path = \
    wassa_home + "dataset/dev-set/" + \
    emotion + "-ratings-0to1.dev.gold.txt"
test_data_file_path = \
    wassa_home + "dataset/test-set/" + \
    emotion + "-ratings-0to1.test.gold.txt"
debug_file_path = \
    wassa_home + "dataset/test-set/debug/" + \
    emotion + ".tsv"

Feature Extraction Snippets

Emoji Intensity


In [ ]:
with open(wassa_home + 'lexicons/emoji_map.json') as emoji_file:
    emoji_list = json.load(emoji_file)
    
emoji_dict = dict()
for emoji in emoji_list:
    emoji_dict[emoji["emoji"]] = (emoji["name"], emoji["polarity"])

In [ ]:
def get_emoji_intensity(word):
    
    score = 0.0
    if word in emoji_dict.keys():
        score = float(emoji_dict[word][1])
    
    vec_rep = np.array([score])
    
    return vec_rep

In [ ]:
# get_emoji_intensity("💯")

Emotion Intensity Lexicon


In [ ]:
affect_intensity_file_path = \
    wassa_home + \
    "lexicons/NRC-AffectIntensity-Lexicon.txt"

def get_word_affect_intensity_dict(emotion):
    word_intensities = dict()

    with open(affect_intensity_file_path) as affect_intensity_file:
        for line in affect_intensity_file:
            word_int_array = line.replace("\n", "").split("\t")

            if (word_int_array[2] == emotion):
                word_intensities[word_int_array[0]] = float(word_int_array[1])

    return word_intensities

In [ ]:
word_intensities = get_word_affect_intensity_dict(emotion)

In [ ]:
def get_emo_int_vector(word):
    
    score = 0.0
    if word in word_intensities.keys():
        score = float(word_intensities[word])
        
    vec_rep = np.array([score])
    
    return vec_rep

In [ ]:
# get_emo_int_vector('fury')

SentiWordNet


In [ ]:
def get_sentiwordnetscore(word):
    
    vec_rep = np.zeros(2)
    
    synsetlist = list(swn.senti_synsets(word))

    if synsetlist:
        vec_rep[0] = synsetlist[0].pos_score()
        vec_rep[1] = synsetlist[0].neg_score()

    return vec_rep

In [ ]:
# get_sentiwordnetscore("fury")

Sentiment Emotion Presence Lexicon


In [ ]:
sentiment_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emotion-Lexicon-v0.92/" + \
    "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

def get_affect_presence_list(emotion):
    word_list = list()
    
    with open(sentiment_emotion_lex_file_path) as sentiment_emotion_lex_file:
        for line in sentiment_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")

            if (word_array[1] == emotion and word_array[2] == '1'):
                word_list.append(word_array[0])
                
    return word_list

In [ ]:
sentiment_emotion_lex_word_list = get_affect_presence_list(emotion)

In [ ]:
def get_sentiment_emotion_feature(word):
    
    score = 0.0
    if word in sentiment_emotion_lex_word_list:
        score = 1.0
    vec_rep = np.array([score])
    
    return vec_rep

In [ ]:
# get_sentiment_emotion_feature("fury")

Hashtag Emotion Intensity


In [ ]:
hashtag_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/" + \
    "NRC-Hashtag-Emotion-Lexicon-v0.2.txt"
    
def get_hashtag_emotion_intensity(emotion):
    hashtag_intensities = dict()
    
    with open(hashtag_emotion_lex_file_path) as hashtag_emotion_lex_file:
        for line in hashtag_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if (word_array[0] == emotion):
                hashtag_intensities[word_array[1]] = float(word_array[2])
                
    return hashtag_intensities

In [ ]:
hashtag_emotion_intensities = get_hashtag_emotion_intensity(emotion)

In [ ]:
def get_hashtag_emotion_vector(word):
    
    score = 0.0
    
    if word in hashtag_emotion_intensities.keys():
        score = float(hashtag_emotion_intensities[word])
        
    vec_rep = np.array([score])
            
    return vec_rep

In [ ]:
# get_hashtag_emotion_vector("#fury")

Emoticon Sentiment Lexicon


In [ ]:
emoticon_lexicon_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-unigrams.txt"
emoticon_lexicon_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-bigrams.txt"
    
emoticon_lexicon_unigrams = dict()
emoticon_lexicon_bigrams = dict()

def get_emoticon_lexicon_unigram_dict():
    with open(emoticon_lexicon_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_unigrams

def get_emoticon_lexicon_bigram_dict():
    with open(emoticon_lexicon_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_bigrams

In [ ]:
emoticon_lexicon_unigram_dict = get_emoticon_lexicon_unigram_dict()

In [ ]:
emoticon_lexicon_bigram_dict = get_emoticon_lexicon_bigram_dict()

In [ ]:
def get_unigram_sentiment_emoticon_lexicon_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_lexicon_unigram_dict.keys():
        vec_rep = emoticon_lexicon_unigram_dict[word]
        
    return vec_rep

def get_bigram_sentiment_emoticon_lexicon_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_lexicon_bigram_dict.keys():
        vec_rep = emoticon_lexicon_bigram_dict[word]
        
    return vec_rep

In [ ]:
get_unigram_sentiment_emoticon_lexicon_vector("fury")

In [ ]:
get_bigram_sentiment_emoticon_lexicon_vector("add everyone")

Emoticon Sentiment Aff-Neg Lexicon


In [ ]:
emoticon_afflex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
    "Emoticon-AFFLEX-NEGLEX-unigrams.txt"
emoticon_afflex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
    "Emoticon-AFFLEX-NEGLEX-bigrams.txt"
    
emoticon_afflex_unigrams = dict()
emoticon_afflex_bigrams = dict()

def get_emoticon_afflex_unigram_dict():
    with open(emoticon_afflex_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_unigrams

def get_emoticon_afflex_bigram_dict():
    with open(emoticon_afflex_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_bigrams

In [ ]:
emoticon_afflex_unigram_dict = get_emoticon_afflex_unigram_dict()

In [ ]:
emoticon_afflex_bigram_dict = get_emoticon_afflex_bigram_dict()

In [ ]:
def get_unigram_sentiment_emoticon_afflex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_afflex_unigram_dict.keys():
        vec_rep = emoticon_afflex_unigram_dict[word]
        
    return vec_rep

def get_bigram_sentiment_emoticon_afflex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in emoticon_afflex_bigram_dict.keys():
        vec_rep = emoticon_afflex_bigram_dict[word]
    
    return vec_rep

In [ ]:
# get_unigram_sentiment_emoticon_afflex_vector("fury")

In [ ]:
# get_bigram_sentiment_emoticon_afflex_vector("pay vip")

Hashtag Sentiment Aff-Neg Lexicon


In [ ]:
hashtag_affneglex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
    "HS-AFFLEX-NEGLEX-unigrams.txt"
hashtag_affneglex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
    "HS-AFFLEX-NEGLEX-bigrams.txt"
    
hashtag_affneglex_unigrams = dict()
hashtag_affneglex_bigrams = dict()

def get_hashtag_affneglex_unigram_dict():
    with open(hashtag_affneglex_unigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hashtag_affneglex_unigrams

def get_hashtag_affneglex_bigram_dict():
    with open(hashtag_affneglex_bigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])

    return hashtag_affneglex_bigrams

In [ ]:
hashtag_affneglex_unigram_dict = get_hashtag_affneglex_unigram_dict()

In [ ]:
hashtag_affneglex_bigram_dict = get_hashtag_affneglex_bigram_dict()

In [ ]:
def get_unigram_sentiment_hashtag_affneglex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in hashtag_affneglex_unigram_dict.keys():
        vec_rep = hashtag_affneglex_unigram_dict[word]
        
    return vec_rep

def get_bigram_sentiment_hashtag_affneglex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in hashtag_affneglex_bigram_dict.keys():
        vec_rep = hashtag_affneglex_bigram_dict[word]
        
    return vec_rep

In [ ]:
get_unigram_sentiment_hashtag_affneglex_vector("#great")

In [ ]:
get_bigram_sentiment_hashtag_affneglex_vector("#good luck")

Hashtag Sentiment Lexicon


In [ ]:
hash_sent_lex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-unigrams.txt"
hash_sent_lex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-bigrams.txt"

def get_hash_sent_lex_unigram_dict():
    
    hash_sent_lex_unigrams = dict()
    with open(hash_sent_lex_unigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_unigrams

def get_hash_sent_lex_bigram_dict():

    hash_sent_lex_bigrams = dict()
    with open(hash_sent_lex_bigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_bigrams

In [ ]:
hash_sent_lex_unigram_dict = get_hash_sent_lex_unigram_dict()

In [ ]:
hash_sent_lex_bigram_dict = get_hash_sent_lex_bigram_dict()

In [ ]:
def get_unigram_sentiment_hash_sent_lex_vector(word):
    
    vec_rep = np.zeros(3)
    if word in hash_sent_lex_unigram_dict.keys():
        vec_rep = hash_sent_lex_unigram_dict[word]
        
    return vec_rep


def get_bigram_sentiment_hash_sent_lex_vector(word):

    vec_rep = np.zeros(3)
    if word in hash_sent_lex_bigram_dict.keys():
        vec_rep = hash_sent_lex_bigram_dict[word]
            
    return vec_rep

In [ ]:
# get_unigram_sentiment_hash_sent_lex_vector("#fabulous")

In [ ]:
# get_bigram_sentiment_hash_sent_lex_vector(". #perfection")

Depeche Mood


In [ ]:
depeche_mood_file_path = \
    wassa_home + \
    "lexicons/DepecheMood_V1.0/DepecheMood_normfreq.txt"

In [ ]:
def get_depeche_vector_dict():
    depeche_vector_dict = dict()
    with open(depeche_mood_file_path) as depeche_mood_file:
        for line in depeche_mood_file:
            word_array = line.replace("\n", "").split("\t")
            depeche_vector_dict[word_array[0].split("#")[0]] = np.array([float(val) for val in word_array[1:]])
    
    return depeche_vector_dict

In [ ]:
depeche_vector_dict = get_depeche_vector_dict()

In [ ]:
def get_depeche_mood_vector(word):
    
    vec_rep = np.zeros(8)
    if word in depeche_vector_dict.keys():
        vec_rep = np.array(depeche_vector_dict[word])

    return vec_rep

In [ ]:
# get_depeche_mood_vector("120th")

Reading & Vectorizing Data


In [ ]:
def is_active_vector_method(string):
    return int(string)

def learn_unigram_word_embedding(word):
    
    word_feature_embedding_dict = dict()
    
    '''Pre-trained Word embeddings'''
    index = 0
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model, w2v_dimensions)

    index = 1
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_1, w2v_dimensions_1)

    index = 2
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_2, w2v_dimensions_2)

    '''NRC Emotion Intensity Lexicon'''
    index = 3
    word_feature_embedding_dict[index] = get_emo_int_vector(word)

    '''WordNet'''
    index = 4
    word_feature_embedding_dict[index] = get_sentiwordnetscore(word)

    '''NRC Sentiment Lexica'''
    index = 5
    word_feature_embedding_dict[index] = get_sentiment_emotion_feature(word)

    index = 6
    word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_lexicon_vector(word)

    index = 7
    word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_afflex_vector(word)

    '''NRC Hashtag Lexica'''
    index = 8
    word_feature_embedding_dict[index] = get_hashtag_emotion_vector(word)

    index = 9
    word_feature_embedding_dict[index] = get_unigram_sentiment_hash_sent_lex_vector(word)

    index = 10
    word_feature_embedding_dict[index] = get_unigram_sentiment_hashtag_affneglex_vector(word)

    '''Additional pre-trained GloVe models'''
    index = 11
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_3, w2v_dimensions_3)

    index = 12
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_4, w2v_dimensions_4)

    index = 13
    word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_5, w2v_dimensions_5)

    '''Emoji Polarities'''
    index = 14
    word_feature_embedding_dict[index] = get_emoji_intensity(word)
    
    '''Depeche Mood'''
    index = 15
    word_feature_embedding_dict[index] = get_depeche_mood_vector(word)

    return word_feature_embedding_dict


def learn_bigram_word_embedding(word):
    
    word_feature_embedding_dict = dict()
    
    '''NRC Sentiment Lexica'''

    index = 0
    word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_lexicon_vector(word)

    index = 1
    word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_afflex_vector(word)

    '''NRC Hashtag Lexica'''
    index = 2
    word_feature_embedding_dict[index] = get_bigram_sentiment_hash_sent_lex_vector(word)

    index = 3
    word_feature_embedding_dict[index] = get_bigram_sentiment_hashtag_affneglex_vector(word)

    return word_feature_embedding_dict

In [ ]:
def get_unigram_embedding(word, word_embedding_dict, bin_string):
    
    word_feature_embedding_dict = word_embedding_dict[word]
    final_embedding = np.array([])
    
    for i in range(16):
        if is_active_vector_method(bin_string[i]):
            final_embedding = np.append(final_embedding, word_feature_embedding_dict[i])
    
    return final_embedding
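
Here bin_string is a 16-character mask over the feature blocks defined in learn_unigram_word_embedding: character i keeps (1) or drops (0) block i. A commented sketch (it assumes embedding_info has been built further below):


In [ ]:
# Hypothetical mask keeping only the GoogleNews word2vec block (index 0)
# get_unigram_embedding("glad", embedding_info[0], "1000000000000000")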

In [ ]:
unigram_feature_string = "1110000000011100"
bigram_feature_string = "1111"

In [ ]:
training_tweets = read_training_data(training_data_file_path)
dev_tweets = read_training_data(dev_set_path)

score_train = list()
tweet_train = list()
for tweet in training_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))

for tweet in dev_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))
    
print(len(score_train))
score_train = np.asarray(score_train)

In [ ]:
test_tweets = read_training_data(test_data_file_path)
verbatim_test_tweets = read_training_data_verbatim(test_data_file_path)

tweet_test = list()
y_gold = list()
for tweet in test_tweets:
    tweet_test.append(tweet.text)
    y_gold.append(float(tweet.intensity))
    
print(len(y_gold))

In [ ]:
def build_word_embeddings(tweets):
    
    max_tweet_length = -1
    word_embedding_dict = dict()

    for tweet in tweets:

        tokens = word_tokenize(tweet)
        bi_tokens = bigrams(tokens)

        if len(tokens) > max_tweet_length:
            max_tweet_length = len(tokens)

        for token in tokens:
            if token not in word_embedding_dict.keys():
                word_embedding_dict[token] = learn_unigram_word_embedding(token)

        for token in bi_tokens:
            if token not in word_embedding_dict.keys():
                word_embedding_dict[token] = learn_bigram_word_embedding(token)
                
    return word_embedding_dict, max_tweet_length

In [ ]:
all_tweets = tweet_train + tweet_test
embedding_info = build_word_embeddings(all_tweets)

In [ ]:
browser_notify("Vectorization Done")

In [ ]:
# # Save vectors
# word_embeddings_path = "/home/v2john/word-embeddings.pkl"

# with open(word_embeddings_path, 'wb') as word_embeddings_file:
#     pickle.dump(embedding_info, word_embeddings_file)

In [ ]:
# Restore vectors

word_embeddings_path = "/home/v2john/word-embeddings.pkl"

with open(word_embeddings_path, 'rb') as word_embeddings_file:
    embedding_info = pickle.load(word_embeddings_file)
    
embeddings_index = embedding_info[0]
MAX_SEQUENCE_LENGTH = embedding_info[1]
EMBEDDING_DIM = len(get_unigram_embedding("glad", embedding_info[0], unigram_feature_string))

In [ ]:
print(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

In [ ]:
def vectorize_tweets(tweets):
    
    train_vectors = list()
    for tweet in tweets:
        tokens = word_tokenize(tweet)
        
        train_vector = list()
        for token in tokens:
            train_vector.append(get_unigram_embedding(token, embedding_info[0], unigram_feature_string))
        
        train_vector = np.asarray(train_vector)
        train_vectors.append(train_vector)

    return np.asarray(train_vectors)

In [ ]:
x_train = vectorize_tweets(tweet_train)
x_test = vectorize_tweets(tweet_test)

In [ ]:
print(x_train.shape, x_test.shape)

Convolutional Neural Network Implementation in Keras


In [ ]:
x_train = sequence.pad_sequences(x_train, maxlen=embedding_info[1], padding="post", truncating="post", 
                                 dtype='float64')
x_test = sequence.pad_sequences(x_test, maxlen=embedding_info[1], padding="post", truncating="post",
                                dtype='float64')

In [ ]:
x_train.shape, x_test.shape, x_train.shape[1:]

In [ ]:
maxlen = 34
batch_size = 32
conv_kernel = 5

In [ ]:
conv_1 = Conv1D(300, conv_kernel, activation='relu', input_shape=x_train.shape[1:])
conv_2 = Conv1D(32, conv_kernel, activation='relu')
# conv_3 = Conv1D(100, conv_kernel, activation='relu')
pool_1 = MaxPooling1D()
pool_2 = MaxPooling1D()
pool_3 = MaxPooling1D()
pool_4 = MaxPooling1D()
flat_1 = Flatten()
dense_1 = Dense(30000, activation='relu')
dense_2 = Dense(1, activation='sigmoid')
drop_1 = Dropout(rate=0.5)

In [ ]:
def get_convnet_model():
    
    model = Sequential()

    model.add(conv_1)
    model.add(pool_1)
    
#     model.add(conv_2)
#     model.add(pool_2)
    
#     model.add(conv_3)
#     model.add(pool_3)
    
#     model.add(pool_4)

    model.add(flat_1)
    
    model.add(dense_1)
    model.add(drop_1)
    model.add(dense_2)

    model.compile(loss='mean_squared_error', optimizer="sgd")
    
    return model
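
An optional, commented sketch for eyeballing the layer output shapes before the long training run (note the layers above are module-level objects, so the model is meant to be built only once):


In [ ]:
# Optional: build the model and print its layer shapes
# get_convnet_model().summary()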

In [ ]:
ml_model = KerasRegressor(build_fn=get_convnet_model, epochs=100, batch_size=16, verbose=1)

ml_model.fit(x_train, score_train)
y_pred = ml_model.predict(x_test)

In [ ]:
# print(conv_1.input_shape, conv_1.output_shape)
# print(pool_1.input_shape, pool_1.output_shape)
# print(conv_2.input_shape, conv_2.output_shape)
# print(pool_2.input_shape, pool_2.output_shape)
# print(conv_3.input_shape, conv_3.output_shape)
# print(pool_3.input_shape, pool_3.output_shape)
# print(flat_1.input_shape, flat_1.output_shape)

In [ ]:
y_pred = y_pred.reshape((len(y_gold),))  # one predicted intensity per test tweet

In [ ]:
y_pred

In [ ]:
evaluate_lists(y_pred, y_gold)

In [ ]:
browser_notify("CNN Trained: " + str(evaluate_lists(y_pred, y_gold)))

In [ ]: