Define evaluation logic


In [ ]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import pandas as pd
from pandas import DataFrame
import scipy.stats as st
import time
import json
import pickle
import re
import html

In [ ]:
import keras

In [ ]:
import numpy
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adagrad
from keras.layers.recurrent import LSTM

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize

from nltk import word_tokenize
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn

In [ ]:
from IPython.display import display, HTML

def browser_alert(message):
    display(HTML('<script type="text/javascript">alert("' + message + '");</script>'))
    
def browser_notify(message):
    display(HTML('<script type="text/javascript">var notification=new Notification("Jupyter Notification",{icon:"http://blog.jupyter.org/content/images/2015/02/jupyter-sq-text.png",body:"' + message + '"});</script>'))

In [ ]:
# browser_notify("random")

In [ ]:
import numpy
import sys
import scipy.stats

def evaluate(pred,gold):
    
    with open(pred, "rb") as f:
        pred_lines = f.readlines()

    with open(gold, "rb") as f:
        gold_lines = f.readlines()
    

    if(len(pred_lines)==len(gold_lines)):       
        # align tweet ids with gold scores and predictions
        data_dic={}
        
        for line in gold_lines:
            line = line.decode()
            parts=line.split('\t')
            if len(parts)==4:   
                data_dic[int(parts[0])] = [float(parts[3])]
            else:
                raise ValueError('Format problem.')
        
        
        for line in pred_lines:
            line = line.decode()
            parts=line.split('\t')
            if len(parts)==4:  
                if int(parts[0]) in data_dic:
                    try:
                        data_dic[int(parts[0])].append(float(parts[3]))
                    except ValueError:
                        # Invalid predictions are replaced by a default value
                        data_dic[int(parts[0])].append(0.5)
                else:
                    raise ValueError('Invalid tweet id.')
            else:
                raise ValueError('Format problem.')
            
            
        # lists storing gold and prediction scores
        gold_scores=[]  
        pred_scores=[]
         
        
        # lists storing gold and prediction scores where gold score >= 0.5
        gold_scores_range_05_1=[]
        pred_scores_range_05_1=[]
         
            
        for id in data_dic:
            if(len(data_dic[id])==2):
                gold_scores.append(data_dic[id][0])
                pred_scores.append(data_dic[id][1])
                if(data_dic[id][0]>=0.5):
                    gold_scores_range_05_1.append(data_dic[id][0])
                    pred_scores_range_05_1.append(data_dic[id][1])
            else:
                raise ValueError('Repeated id in test data.')
                
      
        # return zero correlation if predictions are constant
        if numpy.std(pred_scores)==0 or numpy.std(gold_scores)==0:
            return (0,0,0,0)
        

        pears_corr=scipy.stats.pearsonr(pred_scores,gold_scores)[0]                                    
        spear_corr=scipy.stats.spearmanr(pred_scores,gold_scores)[0]   


        pears_corr_range_05_1=scipy.stats.pearsonr(pred_scores_range_05_1,gold_scores_range_05_1)[0]                                    
        spear_corr_range_05_1=scipy.stats.spearmanr(pred_scores_range_05_1,gold_scores_range_05_1)[0]           
        
      
        return (pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1)
    else:
        raise ValueError('Predictions and gold data have different number of lines.')
        
def evaluate_lists(pred, gold):
    if len(pred) == len(gold):
        gold_scores=gold
        pred_scores=pred         
        
        # lists storing gold and prediction scores where gold score >= 0.5
        gold_scores_range_05_1=[]
        pred_scores_range_05_1=[]
         
            
        for i in range(len(gold_scores)):
            if(gold_scores[i]>=0.5):
                gold_scores_range_05_1.append(gold_scores[i])
                pred_scores_range_05_1.append(pred_scores[i])
                
        # return zero correlation if predictions are constant
        if numpy.std(pred_scores)==0 or numpy.std(gold_scores)==0:
            return (0,0,0,0)
        
        pears_corr=scipy.stats.pearsonr(pred_scores,gold_scores)[0]                                    
        spear_corr=scipy.stats.spearmanr(pred_scores,gold_scores)[0]   


        pears_corr_range_05_1=scipy.stats.pearsonr(pred_scores_range_05_1,gold_scores_range_05_1)[0]                                    
        spear_corr_range_05_1=scipy.stats.spearmanr(pred_scores_range_05_1,gold_scores_range_05_1)[0]           
        
      
        # use the module-level numpy import so this cell does not depend on the later "import numpy as np"
        return numpy.array([pears_corr, spear_corr, pears_corr_range_05_1, spear_corr_range_05_1])
    else:
        raise ValueError('Predictions and gold data have different number of lines.')
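
A quick sanity check of the evaluation logic on toy, made-up scores: evaluate_lists returns Pearson and Spearman correlations over all instances, and again over the subset whose gold intensity is at least 0.5.


In [ ]:
# Hypothetical toy scores, purely to exercise evaluate_lists
toy_gold = [0.1, 0.4, 0.55, 0.7, 0.9]
toy_pred = [0.2, 0.35, 0.6, 0.65, 0.8]
print(evaluate_lists(toy_pred, toy_gold))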

Load pre-trained word vectors


In [ ]:
import gc
gc.collect()

In [ ]:
import gensim
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

In [ ]:
word_vector_path = "/home/v2john/"
wassa_home = "/home/v2john/WASSA-Task/"

Word2Vec + GloVe


In [ ]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    model = {}
    with open(gloveFile, 'r') as f:
        # enumerate keeps the line counter accurate even when a line fails to parse
        for num, line in enumerate(f, start=1):
            try:
                splitLine = line.split()
                word = splitLine[0]
                embedding = [float(val) for val in splitLine[1:]]
                model[word] = np.array(embedding)
            except Exception:
                print("Failed at line " + str(num))
    print("Done.", len(model), " words loaded!")
    return model

In [ ]:
# Google news pretrained vectors
wv_model_path = word_vector_path + "GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True, unicode_errors='ignore')

In [ ]:
# Twitter pretrained vectors
wv_model_path_1 = word_vector_path + "word2vec_twitter_model.bin"
wv_model_1 = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path_1, binary=True, unicode_errors='ignore')

In [ ]:
wv_model_path_2 = word_vector_path + "glove.twitter.27B.200d.txt"
wv_model_2 = loadGloveModel(wv_model_path_2)

In [ ]:
wv_model_path_3 = word_vector_path + "glove.6B.300d.txt"
wv_model_3 = loadGloveModel(wv_model_path_3)

In [ ]:
wv_model_path_4 = word_vector_path + "glove.42B.300d.txt"
wv_model_4 = loadGloveModel(wv_model_path_4)

In [ ]:
wv_model_path_5 = word_vector_path + "glove.840B.300d.txt"
wv_model_5 = loadGloveModel(wv_model_path_5)

In [ ]:
w2v_dimensions = len(wv_model['word'])
w2v_dimensions_1 = len(wv_model_1['word'])
w2v_dimensions_2 = len(wv_model_2['word'])
w2v_dimensions_3 = len(wv_model_3['word'])
w2v_dimensions_4 = len(wv_model_4['word'])
w2v_dimensions_5 = len(wv_model_5['word'])

print(w2v_dimensions, w2v_dimensions_1, 
      w2v_dimensions_2, w2v_dimensions_3, w2v_dimensions_4,
      w2v_dimensions_5)

In [ ]:
def get_word2vec_embedding(tweet, model, dimensions):
    
    tokens = word_tokenize(tweet)
    
    vector_list = list()
    for token in tokens:
        try:
            vector_list.append(model[token])
        except Exception as e:
            pass

    if len(vector_list) == 0:
        uni_vec_rep = np.zeros(dimensions).tolist()
    else:
        uni_vec_rep = sum(vector_list) / float(len(vector_list))

    return uni_vec_rep
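
A minimal illustration with a hypothetical toy "model" (a plain dict, which supports the same token lookup): the function averages the vectors of in-vocabulary tokens and falls back to a zero vector when nothing matches.


In [ ]:
# Toy 3-dimensional vectors; the real cells use the loaded word2vec/GloVe models
toy_model = {"happy": np.array([1.0, 0.0, 0.0]), "day": np.array([0.0, 1.0, 0.0])}
print(get_word2vec_embedding("happy day", toy_model, 3))   # average of the two vectors
print(get_word2vec_embedding("zzz", toy_model, 3))         # all tokens OOV: zero vector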

In [ ]:
# wv_model = None
# wv_model_1 = None
# wv_model_2 = None
# wv_model_3 = None
# wv_model_4 = None
# wv_model_5 = None

In [ ]:
browser_notify("Embeddings learnt")

Cleaning


In [ ]:
# Build the stopword set once; calling stopwords.words() for every token is very slow
english_stopwords = set(stopwords.words('english'))

def remove_stopwords(string):
    split_string = \
        [word for word in string.split()
         if word not in english_stopwords]
    
    return " ".join(split_string)

In [ ]:
# wnl = WordNetLemmatizer()

In [ ]:
def clean_str(string):  
    string = html.unescape(string)
    string = string.replace("\\n", " ")
    string = string.replace("_NEG", "")
    string = string.replace("_NEGFIRST", "")
    string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string) # removing any twitter handle mentions
#     string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"#", "", string)
    string = re.sub(r"\*", "", string)
    string = re.sub(r"\'s", "", string)
    string = re.sub(r"\'m", " am", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'d", " would", string)
    string = re.sub(r"\'ll", " will", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"!", " !", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\?", " ?", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())
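
A small example of the full cleaning pipeline on a made-up tweet; it strips the handle and the hash sign, expands contractions, and drops English stopwords (the NLTK stopwords corpus must be downloaded).


In [ ]:
# Made-up input, for illustration only
print(clean_str("@someone I can't believe it's #Monday!!"))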

Metadata and Class Definitions


In [ ]:
class Tweet(object):

    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        # str() guards against non-string ids and float/None intensities
        return \
            "id: " + str(self.id) + \
            ", text: " + self.text + \
            ", emotion: " + self.emotion + \
            ", intensity: " + str(self.intensity)

In [ ]:
def read_training_data(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], clean_str(array[1]), array[2], float(array[3])))
    return train_list
            
def read_training_data_verbatim(training_data_file_path):

    train_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            train_list.append(Tweet(array[0], array[1], array[2], float(array[3])))
    return train_list
    
def read_test_data(training_data_file_path):

    test_list = list()
    with open(training_data_file_path) as input_file:
        for line in input_file:
            line = line.strip()
            array = line.split('\t')
            test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
    return test_list

In [ ]:
emotion = "anger"

In [ ]:
training_data_file_path = \
    wassa_home + "dataset/" + \
    emotion + "-ratings-0to1.train.txt"
predictions_file_path = \
    wassa_home + "predictions/" + \
    emotion + "-pred.txt"
dev_set_path = \
    wassa_home + "dataset/dev-set/" + \
    emotion + "-ratings-0to1.dev.gold.txt"
test_data_file_path = \
    wassa_home + "dataset/test-set/" + \
    emotion + "-ratings-0to1.test.gold.txt"
debug_file_path = \
    wassa_home + "dataset/test-set/debug/" + \
    emotion + ".tsv"

In [ ]:
# print(training_data_file_path, predictions_file_path, dev_set_path, test_data_file_path)

Feature Extraction Snippets

Emoji Intensity


In [ ]:
with open(wassa_home + 'lexicons/emoji_map.json') as emoji_file:
    emoji_list = json.load(emoji_file)
    
emoji_dict = dict()
for emoji in emoji_list:
    emoji_dict[emoji["emoji"]] = (emoji["name"], emoji["polarity"])

In [ ]:
# print(emoji_dict["💯"])

In [ ]:
poly_emoji_intensity = PolynomialFeatures(5)
# poly_emoji_intensity = PolynomialFeatures(1)

def get_emoji_intensity(tweet):
    score = 0.0
    for emoji in emoji_dict.keys():
        count = tweet.count(emoji)
        score += count * emoji_dict[emoji][1]
        
    return normalize(poly_emoji_intensity.fit_transform(np.array([score]).reshape(1, -1))[0].reshape(1, -1))[0]

In [ ]:
# get_emoji_intensity("💯")

Emotion Intensity Lexicon


In [ ]:
affect_intensity_file_path = \
    wassa_home + \
    "lexicons/NRC-AffectIntensity-Lexicon.txt"

def get_word_affect_intensity_dict(emotion):
    word_intensities = dict()

    with open(affect_intensity_file_path) as affect_intensity_file:
        for line in affect_intensity_file:
            word_int_array = line.replace("\n", "").split("\t")

            if (word_int_array[2] == emotion):
                word_intensities[word_int_array[0]] = float(word_int_array[1])

    return word_intensities

In [ ]:
word_intensities = get_word_affect_intensity_dict(emotion)

In [ ]:
poly_emo_int = PolynomialFeatures(10)
# poly_emo_int = PolynomialFeatures(1)

def get_emo_int_vector(tweet):
    score = 0.0
    for word in word_intensities.keys():
        if word in tweet:
            score += tweet.count(word) * float(word_intensities[word])
    
    return normalize(poly_emo_int.fit_transform(np.array([score]).reshape(1, -1))[0].reshape(1, -1))[0]
#     return [score]

In [ ]:
# get_emo_int_vector("furious")

SentiWordNet


In [ ]:
poly_sentiwordnet = PolynomialFeatures(5)
# poly_sentiwordnet = PolynomialFeatures(1)

def get_sentiwordnetscore(tweet):
    
    score = np.zeros(2)
    
    for word in tweet.split():
        synsetlist = list(swn.senti_synsets(word))
        
        if synsetlist:
            score[0] += synsetlist[0].pos_score()
            score[1] += synsetlist[0].neg_score()
            
#     return score.tolist()
    return normalize(poly_sentiwordnet.fit_transform(np.array([score]).reshape(1, -1))[0].reshape(1, -1))[0]

In [ ]:
# get_sentiwordnetscore("furious")

Sentiment Emotion Presence Lexicon


In [ ]:
sentiment_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emotion-Lexicon-v0.92/" + \
    "NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

def get_affect_presence_list(emotion):
    word_list = list()
    
    with open(sentiment_emotion_lex_file_path) as sentiment_emotion_lex_file:
        for line in sentiment_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")

            if (word_array[1] == emotion and word_array[2] == '1'):
                word_list.append(word_array[0])
                
    return word_list

In [ ]:
word_list = get_affect_presence_list(emotion)

In [ ]:
def get_sentiment_emotion_feature(tweet):
    
    vector = np.zeros(1)
    for word in word_list:
        if word in tweet.split():
            vector[0] = 1.0
    
    return vector

In [ ]:
# get_sentiment_emotion_feature("furious")

Hashtag Emotion Intensity


In [ ]:
hashtag_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/" + \
    "NRC-Hashtag-Emotion-Lexicon-v0.2.txt"
    
def get_hashtag_emotion_intensity(emotion):
    hashtag_intensities = dict()
    
    with open(hashtag_emotion_lex_file_path) as hashtag_emotion_lex_file:
        for line in hashtag_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")
            
            if (word_array[0] == emotion):
                hashtag_intensities[clean_str(word_array[1])] = float(word_array[2])

    return hashtag_intensities

In [ ]:
hashtag_emotion_intensities = get_hashtag_emotion_intensity(emotion)

In [ ]:
poly_hashtag_emotion = PolynomialFeatures(10)
# poly_emo_int = PolynomialFeatures(1)

def get_hashtag_emotion_vector(tweet):
    score = 0.0
    for word in hashtag_emotion_intensities.keys():
        if word in tweet:
            score += tweet.count(word) * float(hashtag_emotion_intensities[word])
            
#     return [score]
    return normalize(poly_hashtag_emotion.fit_transform(np.array([score]).reshape(1, -1))[0].reshape(1, -1))[0]

In [ ]:
# get_hashtag_emotion_vector("furious")

Emoticon Sentiment Lexicon


In [ ]:
emoticon_lexicon_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-unigrams.txt"
emoticon_lexicon_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-bigrams.txt"
emoticon_lexicon_pairs_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-pairs.txt"
pair_split_string = "---"
    
def get_emoticon_lexicon_unigram_dict():
    emoticon_lexicon_unigrams = dict()
    with open(emoticon_lexicon_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_unigrams

def get_emoticon_lexicon_bigram_dict():
    emoticon_lexicon_bigrams = dict()
    with open(emoticon_lexicon_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_lexicon_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_lexicon_bigrams

def get_emoticon_lexicon_pairs_dict():
    emoticon_lexicon_pairs = dict()
    with open(emoticon_lexicon_pairs_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            pair = word_array[0].split(pair_split_string)
            token_1 = clean_str(pair[0])
            token_2 = clean_str(pair[1])
            if token_1 and token_2:
                token_1_dict = None
                if token_1 in emoticon_lexicon_pairs.keys():
                    token_1_dict = emoticon_lexicon_pairs[token_1]
                else:
                    token_1_dict = dict()
                    
                token_1_dict[token_2] = np.array([float(val) for val in word_array[1:]])
                emoticon_lexicon_pairs[token_1] = token_1_dict
    
    return emoticon_lexicon_pairs

In [ ]:
emoticon_lexicon_unigram_dict = get_emoticon_lexicon_unigram_dict()

In [ ]:
emoticon_lexicon_bigram_dict = get_emoticon_lexicon_bigram_dict()

In [ ]:
emoticon_lexicon_pairs_dict = get_emoticon_lexicon_pairs_dict()

In [ ]:
poly_emoticon_lexicon = PolynomialFeatures(5)
# poly_emoticon_lexicon = PolynomialFeatures(1)

def get_unigram_sentiment_emoticon_lexicon_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    for token in tokens:
        word = clean_str(token)
        if word in emoticon_lexicon_unigram_dict.keys():
            vector_list += emoticon_lexicon_unigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
        
#     return vector_list
    return normalize(poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0]


def get_bigram_sentiment_emoticon_lexicon_vector(tokens):
    bi_tokens = bigrams(tokens)
    vector_list = np.zeros(3)
    counter = 0
    for bi_token in bi_tokens:
        word = clean_str(" ".join(bi_token))
        if word in emoticon_lexicon_bigram_dict.keys():
            vector_list += emoticon_lexicon_bigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
        
#     return vector_list
    return normalize(poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0].tolist()

def get_pair_sentiment_emoticon_lexicon_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    
    for i in range(len(tokens)):
        word_1 = clean_str(tokens[i])
        if word_1 in emoticon_lexicon_pairs_dict.keys():
            token_1_dict = emoticon_lexicon_pairs_dict[word_1]
            for j in range(i, len(tokens)):
                word_2 = clean_str(tokens[j])
                if word_2 in token_1_dict.keys():
                    vector_list += token_1_dict[word_2]
                    counter += 1
                    
    if counter > 0:
        vector_list /= counter
        
#     return vector_list
    return normalize(poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0].tolist()

def get_sentiment_emoticon_lexicon_vector(tweet):
    
    tokens = word_tokenize(tweet)
    final_list = np.asarray([])
    
    # Adding unigram features
    final_list = np.append(
        final_list,
        get_unigram_sentiment_emoticon_lexicon_vector(tokens)
    )
    
    # Adding bigram features
    final_list = np.append(
        final_list,
        get_bigram_sentiment_emoticon_lexicon_vector(tokens)
    )
    
    # Adding pair features
    final_list = np.append(
        final_list,
        get_pair_sentiment_emoticon_lexicon_vector(tokens)
    )

    return final_list

In [ ]:
# get_sentiment_emoticon_lexicon_vector("furious")

Emoticon Sentiment Aff-Neg Lexicon


In [ ]:
emoticon_afflex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
    "Emoticon-AFFLEX-NEGLEX-unigrams.txt"
emoticon_afflex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
    "Emoticon-AFFLEX-NEGLEX-bigrams.txt"
    
def get_emoticon_afflex_unigram_dict():
    emoticon_afflex_unigrams = dict()
    with open(emoticon_afflex_unigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_unigrams

def get_emoticon_afflex_bigram_dict():
    emoticon_afflex_bigrams = dict()
    with open(emoticon_afflex_bigrams_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            emoticon_afflex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return emoticon_afflex_bigrams

In [ ]:
emoticon_afflex_unigram_dict = get_emoticon_afflex_unigram_dict()

In [ ]:
emoticon_afflex_bigram_dict = get_emoticon_afflex_bigram_dict()

In [ ]:
poly_emoticon_lexicon = PolynomialFeatures(5)
# poly_emoticon_lexicon = PolynomialFeatures(1)

def get_unigram_sentiment_emoticon_afflex_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    for token in tokens:
        word = clean_str(token)
        if word in emoticon_afflex_unigram_dict.keys():
            vector_list += emoticon_afflex_unigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
        
#     return vector_list
    return normalize(poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0]


def get_bigram_sentiment_emoticon_afflex_vector(tokens):
    bi_tokens = bigrams(tokens)
    vector_list = np.zeros(3)
    counter = 0
    for bi_token in bi_tokens:
        word = clean_str(" ".join(bi_token))
        if word in emoticon_afflex_bigram_dict.keys():
            vector_list += emoticon_afflex_bigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
        
#     return vector_list
    return normalize(poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0]

def get_sentiment_emoticon_afflex_vector(tweet):
    final_list = np.asarray([])
    tokens = word_tokenize(tweet)
    
    # Adding unigram features
    final_list = np.append(final_list, get_unigram_sentiment_emoticon_afflex_vector(tokens))
    
    # Adding bigram features
    final_list = np.append(final_list, get_bigram_sentiment_emoticon_afflex_vector(tokens))

    return final_list

In [ ]:
# get_sentiment_emoticon_afflex_vector("furious")

Hashtag Sentiment Aff-Neg Lexicon


In [ ]:
hashtag_affneglex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
    "HS-AFFLEX-NEGLEX-unigrams.txt"
hashtag_affneglex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
    "HS-AFFLEX-NEGLEX-bigrams.txt"
    
def get_hashtag_affneglex_unigram_dict():
    hashtag_affneglex_unigrams = dict()
    with open(hashtag_affneglex_unigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_unigrams[clean_str(word_array[0])] = np.array([float(val) for val in word_array[1:]])
    
    return hashtag_affneglex_unigrams

def get_hashtag_affneglex_bigram_dict():
    hashtag_affneglex_bigrams = dict()
    with open(hashtag_affneglex_bigrams_file_path) as hashtag_sent_lex_file:
        for line in hashtag_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            hashtag_affneglex_bigrams[clean_str(word_array[0])] = np.array([float(val) for val in word_array[1:]])

    return hashtag_affneglex_bigrams

In [ ]:
hashtag_affneglex_unigram_dict = get_hashtag_affneglex_unigram_dict()

In [ ]:
hashtag_affneglex_bigram_dict = get_hashtag_affneglex_bigram_dict()

In [ ]:
poly_hashtag_sent_affneglex = PolynomialFeatures(5)
# poly_hashtag_sent_affneglex = PolynomialFeatures(1)

def get_unigram_sentiment_hashtag_affneglex_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    for token in tokens:
        word = clean_str(token)
        if word in hashtag_affneglex_unigram_dict.keys():
            vector_list += hashtag_affneglex_unigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
        
#     return vector_list
    return normalize(poly_hashtag_sent_affneglex.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0]

def get_bigram_sentiment_hashtag_affneglex_vector(tokens):
    bi_tokens = bigrams(tokens)
    vector_list = np.zeros(3)
    counter = 0
    for bi_token in bi_tokens:
        word = clean_str(" ".join(bi_token))
        if word in hashtag_affneglex_bigram_dict.keys():
            vector_list += hashtag_affneglex_bigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
        
#     return vector_list
    return normalize(poly_hashtag_sent_affneglex.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0]

def get_sentiment_hashtag_affneglex_vector(tweet):
    final_list = np.asarray([])
    tokens = word_tokenize(tweet)
    
    # Adding unigram features
    final_list = np.append(final_list, get_unigram_sentiment_hashtag_affneglex_vector(tokens))
    # Adding bigram features
    final_list = np.append(final_list, get_bigram_sentiment_hashtag_affneglex_vector(tokens))

    return final_list

In [ ]:
# get_sentiment_hashtag_affneglex_vector("furious")

Hashtag Sentiment Lexicon


In [ ]:
hash_sent_lex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-unigrams.txt"
hash_sent_lex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-bigrams.txt"
hash_sent_lex_pairs_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-pairs.txt"
pair_split_string = "---"


def get_hash_sent_lex_unigram_dict():
    hash_sent_lex_unigrams = dict()
    with open(hash_sent_lex_unigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_unigrams

def get_hash_sent_lex_bigram_dict():
    hash_sent_lex_bigrams = dict()
    with open(hash_sent_lex_bigrams_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            if clean_str(word_array[0]):
                hash_sent_lex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
    
    return hash_sent_lex_bigrams

def get_hash_sent_lex_pairs_dict():
    hash_sent_lex_pairs = dict()
    with open(hash_sent_lex_pairs_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            pair = word_array[0].split(pair_split_string)
            token_1 = clean_str(pair[0])
            token_2 = clean_str(pair[1])
            if token_1 and token_2:
                token_1_dict = None
                if token_1 in hash_sent_lex_pairs.keys():
                    token_1_dict = hash_sent_lex_pairs[token_1]
                else:
                    token_1_dict = dict()
                    
                token_1_dict[token_2] = np.array([float(val) for val in word_array[1:]])
                hash_sent_lex_pairs[token_1] = token_1_dict
    
    return hash_sent_lex_pairs

In [ ]:
hash_sent_lex_unigram_dict = get_hash_sent_lex_unigram_dict()

In [ ]:
hash_sent_lex_bigram_dict = get_hash_sent_lex_bigram_dict()

In [ ]:
hash_sent_lex_pairs_dict = get_hash_sent_lex_pairs_dict()

In [ ]:
poly_hash_sent_lex = PolynomialFeatures(5)
# poly_hash_sent_lex = PolynomialFeatures(1)

def get_unigram_sentiment_hash_sent_lex_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    
    for token in tokens:
        word = clean_str(token)
        if word in hash_sent_lex_unigram_dict.keys():
            vector_list += hash_sent_lex_unigram_dict[word]
            counter += 1

    if counter > 0:
        vector_list /= counter
    
#     return vector_list
    return normalize(poly_hash_sent_lex.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0]
    
def get_bigram_sentiment_hash_sent_lex_vector(tokens):
    bi_tokens = bigrams(tokens)
    vector_list = np.zeros(3)
    counter = 0
    for bi_token in bi_tokens:
        word = clean_str(" ".join(bi_token))
        if word in hash_sent_lex_bigram_dict.keys():
            vector_list += hash_sent_lex_bigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
    
#     return vector_list
    return normalize(poly_hash_sent_lex.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0]

def get_pair_sentiment_hash_sent_lex_vector(tokens):
    vector_list = np.zeros(3)
    counter = 0
    
    for i in range(len(tokens)):
        word_1 = clean_str(tokens[i])
        if word_1 in hash_sent_lex_pairs_dict.keys():
            token_1_dict = hash_sent_lex_pairs_dict[word_1]
            for j in range(i, len(tokens)):
                word_2 = clean_str(tokens[j])
                if word_2 in token_1_dict.keys():
                    vector_list += token_1_dict[word_2]
                    counter += 1
    if counter > 0:
        vector_list /= counter
        
#     return vector_list
    return normalize(poly_hash_sent_lex.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0]
    
def get_sentiment_hash_sent_lex_vector(tweet):
    final_list = np.asarray([])
    
    tokens = word_tokenize(tweet)
    
    # Adding unigram features
    final_list = np.append(final_list, get_unigram_sentiment_hash_sent_lex_vector(tokens))
    # Adding bigram features
    final_list = np.append(final_list, get_bigram_sentiment_hash_sent_lex_vector(tokens))
    # Adding pair features
    final_list = np.append(final_list, get_pair_sentiment_hash_sent_lex_vector(tokens))

    return final_list

In [ ]:
# get_sentiment_hash_sent_lex_vector("furious")

Depeche Mood


In [ ]:
depeche_mood_file_path = \
    wassa_home + \
    "lexicons/DepecheMood_V1.0/DepecheMood_normfreq.txt"

In [ ]:
def get_depeche_vector_dict():
    depeche_vector_dict = dict()
    with open(depeche_mood_file_path) as depeche_mood_file:
        for line in depeche_mood_file:
            word_array = line.replace("\n", "").split("\t")
            depeche_vector_dict[word_array[0].split("#")[0]] = np.array([float(val) for val in word_array[1:]])
    
    return depeche_vector_dict

In [ ]:
depeche_vector_dict = get_depeche_vector_dict()

In [ ]:
# print(len(depeche_vector_dict["0"]))

In [ ]:
poly_depm = PolynomialFeatures(5)

def get_depeche_mood_vector(tweet):
    vector_list = np.zeros(8)
    tokens = word_tokenize(tweet)
    counter = 0
    
    for token in tokens:
        if token in depeche_vector_dict.keys():
            vector_list += np.array(depeche_vector_dict[token])
            counter += 1
    
    if counter > 0:
        vector_list /= counter
        
    return normalize(poly_depm.fit_transform(vector_list.reshape(1, -1))[0].reshape(1, -1))[0]

In [ ]:
# get_depeche_mood_vector("rom sentences with happy sad anxious")

Reading & Vectorizing Data


In [ ]:
def is_active_vector_method(string):
    return int(string)


def vectorize_tweets(tweet_list, bin_string, vector_dict):

    frames = list()

    '''Pre-trained Word embeddings'''
    index = 0
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = \
                DataFrame(list(map(lambda x: get_word2vec_embedding(x, wv_model, w2v_dimensions), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 1
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = \
                DataFrame(list(map(lambda x: get_word2vec_embedding(x, wv_model_1, w2v_dimensions_1), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 2
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = \
                DataFrame(list(map(lambda x: get_word2vec_embedding(x, wv_model_2, w2v_dimensions_2), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    '''NRC Emotion Intensity Lexicon'''
    index = 3
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_emo_int_vector(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    '''WordNet'''
    index = 4
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_sentiwordnetscore(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    '''NRC Sentiment Lexica'''
    index = 5
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_sentiment_emotion_feature(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 6
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_sentiment_emoticon_lexicon_vector(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 7
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_sentiment_emoticon_afflex_vector(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    '''NRC Hashtag Lexica'''
    index = 8
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_hashtag_emotion_vector(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 9
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_sentiment_hash_sent_lex_vector(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 10
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_sentiment_hashtag_affneglex_vector(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 11
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = \
                DataFrame(list(map(lambda x: get_word2vec_embedding(x, wv_model_3, w2v_dimensions_3), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 12
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = \
                DataFrame(list(map(lambda x: get_word2vec_embedding(x, wv_model_4, w2v_dimensions_4), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 13
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = \
                DataFrame(list(map(lambda x: get_word2vec_embedding(x, wv_model_5, w2v_dimensions_5), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    index = 14
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_emoji_intensity(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])
        
    index = 15
    if is_active_vector_method(bin_string[index]):
        if index not in vector_dict.keys():
            tmp_vector = DataFrame(list(map(lambda x: get_depeche_mood_vector(x), tweet_list)))
            vector_dict[index] = tmp_vector
        frames.append(vector_dict[index])

    vectors = pd.concat(frames, axis=1)

    return vectors.values.tolist()
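
Note on caching: vectorize_tweets memoizes each per-feature DataFrame in vector_dict, keyed by feature index, so later calls with a different bin_string reuse anything already computed. This assumes the dict always corresponds to the same tweet list, and the dict itself must exist before the first call: either initialize it as an empty dict (as in the commented cell below) or restore it from the pickled vectors further down.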

In [ ]:
# _ = train_vector_dict.pop(3)
# _ = train_vector_dict.pop(4)
# _ = train_vector_dict.pop(5)
# _ = train_vector_dict.pop(6)
# _ = train_vector_dict.pop(7)
# _ = train_vector_dict.pop(8)
# _ = train_vector_dict.pop(9)
# _ = train_vector_dict.pop(10)
# _ = train_vector_dict.pop(14)

# _ = test_vector_dict.pop(3)
# _ = test_vector_dict.pop(4)
# _ = test_vector_dict.pop(5)
# _ = test_vector_dict.pop(6)
# _ = test_vector_dict.pop(7)
# _ = test_vector_dict.pop(8)
# _ = test_vector_dict.pop(9)
# _ = test_vector_dict.pop(10)
# _ = test_vector_dict.pop(14)

In [ ]:
# train_vector_dict = dict()
# test_vector_dict = dict()

In [ ]:
feature_string = "1111001001001100"
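
Each character of feature_string is a binary switch for the feature with the same index in vectorize_tweets (0-15). Here, "1111001001001100" activates features 0-3, 6, 9, 12, and 13: the three pre-trained embedding models listed first, NRC-AffectIntensity, NRC-Emoticon-Lexicon, NRC-Hashtag-Sentiment-Lexicon, and the 42B/840B GloVe vectors.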

In [ ]:
training_tweets = read_training_data(training_data_file_path)
dev_tweets = read_training_data(dev_set_path)

score_train = list()
tweet_train = list()
for tweet in training_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))

for tweet in dev_tweets:
    tweet_train.append(tweet.text)
    score_train.append(float(tweet.intensity))
    
print(len(score_train))

In [ ]:
x_train = vectorize_tweets(tweet_train, feature_string, train_vector_dict)

In [ ]:
print(len(x_train))
dimension = len(x_train[0])
print(dimension)

In [ ]:
# print(x_train[0])

In [ ]:
test_tweets = read_training_data(test_data_file_path)
verbatim_test_tweets = read_training_data_verbatim(test_data_file_path)

tweet_test = list()
y_gold = list()
for tweet in test_tweets:
    tweet_test.append(tweet.text)
    y_gold.append(float(tweet.intensity))

In [ ]:
x_test = vectorize_tweets(tweet_test, feature_string, test_vector_dict)

In [ ]:
print(len(x_test))
print(len(x_test[0]))

In [ ]:
browser_notify("Vectorization Done")

In [ ]:
train_vectors_path = "/home/v2john/" + emotion + "_train_vectors"
test_vectors_path = "/home/v2john/" + emotion + "_test_vectors"

In [ ]:
# # Save vectors

# with open(train_vectors_path, 'wb') as train_vectors_file:
#     pickle.dump(train_vector_dict, train_vectors_file)

# with open(test_vectors_path, 'wb') as test_vectors_file:
#     pickle.dump(test_vector_dict, test_vectors_file)

In [ ]:
# Restore vectors

with open(train_vectors_path, 'rb') as train_vectors_file:
    train_vector_dict = pickle.load(train_vectors_file)

with open(test_vectors_path, 'rb') as test_vectors_file:
    test_vector_dict = pickle.load(test_vectors_file)

Model Training and Testing


In [ ]:
def print_predictions(y_pred, y_gold):
    with open(debug_file_path, 'w') as debug_file:
        debug_file.write("VerbatimTweet" + "\t" + "CleanedTweet" + "\t" + 
                         "Predicted" + "\t" + "Actual" + "\n")
        for i in range(len(verbatim_test_tweets)):
            debug_file.write(verbatim_test_tweets[i].text + "\t" + tweet_test[i] + 
                             "\t" + str(y_pred[i]) + "\t" + str(y_gold[i]) + "\n")
    browser_notify("Wrote debug tweets")

In [ ]:
# ml_model = XGBRegressor(seed=0)

# # specify parameters and distributions to sample from
# param_dist = {
#     "max_depth": range(3, 11),
#     "n_estimators": range(100, 10000)
# }

# x_train = np.array(x_train)
# score_train = np.array(score_train)

# # run randomized search
# random_search = RandomizedSearchCV(ml_model, param_distributions=param_dist, n_iter=50)
# random_search.fit(x_train, score_train)

In [ ]:
# browser_notify("Random search complete")

In [ ]:
# print(random_search.best_params_)

In [ ]:
# ml_model = random_search.best_estimator_
# ml_model = ensemble.GradientBoostingRegressor(n_estimators=10000)
ml_model = XGBRegressor(max_depth=3, n_estimators=30000, seed=0)

x_train = np.array(x_train)
score_train = np.array(score_train)
ml_model.fit(x_train, score_train)

y_pred = ml_model.predict(x_test)

score = evaluate_lists(y_pred, y_gold)

In [ ]:
print("### " + emotion + ", feature-string: " + feature_string)
print("| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |")
print("| --- | --- | --- | --- |")
print("| " + str(score[0]) + " | " + str(score[1]) + " | " + \
      str(score[2]) + " | " + str(score[3]) + " |")

In [ ]:
# print_predictions(y_pred, y_gold)

In [ ]:
browser_notify("Training complete")

In [ ]:
# with open(predictions_file_path, 'w') as predictions_file:
#     for i in range(len(y_pred)):
#         predictions_file.write(
#             str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
#             test_tweets[i].emotion + "\t" + str(y_pred[i]) + "\n"
#         )

Simple Neural Network Implementation in Keras


In [ ]:
# define base model
input_size, dim_size = np.array(x_train).shape
print(input_size, dim_size)
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=0.5)

def baseline_model():
    # create model
    model = Sequential()
    
    model.add(Dense(10000, activation='relu', kernel_initializer='glorot_uniform', input_dim=dim_size))
    model.add(Dropout(rate=0.25))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='mean_squared_error', optimizer="adam")
    return model
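
The single sigmoid output keeps predictions inside the [0, 1] range of the gold intensities, and the network minimizes mean squared error with Adam; the SGD instance defined above is an alternative optimizer that is not passed to compile here.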

In [ ]:
ml_model = KerasRegressor(build_fn=baseline_model, epochs=1000, batch_size=128, verbose=1)
x_train = np.array(x_train)
x_test = np.array(x_test)

ml_model.fit(x_train, score_train)

In [ ]:
y_pred = ml_model.predict(np.array(x_test))

In [ ]:
# print(y_pred)

In [ ]:
score = evaluate_lists(y_pred, y_gold)

print("### " + emotion + ", feature_string: " + feature_string)
print("| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |")
print("| --- | --- | --- | --- |")
print("| " + str(score[0]) + " | " + str(score[1]) + " | " + \
      str(score[2]) + " | " + str(score[3]) + " |")

In [ ]:
browser_notify("Neural Net Training Complete")

Partitioned Neural Network


In [ ]:
sub1_feature_string = "0001111111100011"
x_train_subset_1 = vectorize_tweets(tweet_train, sub1_feature_string, train_vector_dict)
x_test_subset_1 = vectorize_tweets(tweet_test, sub1_feature_string, test_vector_dict)
print(len(x_train_subset_1[0]))

In [ ]:
# print(x_train_subset_1[0])

In [ ]:
sub2_feature_string = "1110000000011100"
x_train_subset_2 = vectorize_tweets(tweet_train, sub2_feature_string, train_vector_dict)
x_test_subset_2 = vectorize_tweets(tweet_test, sub2_feature_string, test_vector_dict)
print(len(x_train_subset_2[0]))

In [ ]:
def lex_model():
    # create model
    model = Sequential()
    model.add(Dense(1000, activation='tanh', kernel_initializer='random_uniform', 
                    input_dim=len(x_train_subset_1[0])))
    # input_dim is only needed on the first layer
    model.add(Dense(500, activation='sigmoid', kernel_initializer='random_uniform'))
    return model

In [ ]:
def embed_model():
    # create model    
    model = Sequential()
    # this sub-network consumes the word-embedding subset, so size it on subset_2
    model.add(Dense(10000, activation='relu', kernel_initializer='random_uniform', 
                    input_dim=len(x_train_subset_2[0])))
    model.add(Dense(500, activation='relu', kernel_initializer='random_uniform'))
    return model
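
Note that neither sub-network is compiled or trained: calling predict() on a freshly built model applies a fixed, randomly initialized projection to its feature subset. The cells below therefore build each model once, so the train and test splits pass through the same random weights.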

In [ ]:
x_train_subset_1 = np.array(x_train_subset_1)
x_test_subset_1 = np.array(x_test_subset_1)

# Build the sub-network once so train and test share the same random projection
lex_net = lex_model()
lex_embeddings_train = lex_net.predict(x_train_subset_1)
lex_embeddings_test = lex_net.predict(x_test_subset_1)

In [ ]:
x_train_subset_2 = np.array(x_train_subset_2)
x_test_subset_2 = np.array(x_test_subset_2)

# The embedding sub-network takes subset_2 (the word-embedding features), and
# train/test must pass through the same randomly initialized model
embed_net = embed_model()
word_embeddings_train = embed_net.predict(x_train_subset_2)
word_embeddings_test = embed_net.predict(x_test_subset_2)

In [ ]:
print(lex_embeddings_train.shape, word_embeddings_train.shape)

In [ ]:
x_train_final = list()
x_test_final = list()

for i in range(len(x_train)):
    x_train_final.append(np.append(lex_embeddings_train[i], word_embeddings_train[i]))

for i in range(len(x_test)):
    x_test_final.append(np.append(lex_embeddings_test[i], word_embeddings_test[i]))

print(len(x_train_final[0]))

Feature Definitions


In [ ]:
feature_index_mapping = \
    {
        0: "Word2Vec [Google News]",
        1: "Word2Vec [Twitter]",
        2: "GloVe-Twitter",
        3: "NRC-AffectIntensity",
        4: "SentiWordNet",
        5: "NRC-Emotion-Lexicon",
        6: "NRC-Emoticon-Lexicon",
        7: "NRC-Emoticon-AffLexNegLex",
        8: "NRC-Hashtag-Emotion",
        9: "NRC-Hashtag-Sentiment-Lexicon",
        10: "NRC-Hashtag-Sentiment-AffLexNegLex",
        11: "GloVe [6B.300d]",
        12: "GloVe [42B.300d]",
        13: "GloVe [840B.300d]",
        14: "Emoji-Intensity",
        15: "DepecheMood",
    }


def get_features_from_identifier(bin_string):
    features = list()
    for i in range(len(bin_string)):
        if int(bin_string[i]):
            features.append(feature_index_mapping[i])

    return features

In [ ]:
print(get_features_from_identifier(feature_string))