Define evaluation logic

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import pandas as pd
from pandas import DataFrame
import scipy.stats as st
import time
import json
import pickle
import re
import html

import keras

import numpy
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adagrad
from keras.layers.recurrent import LSTM

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize

from nltk import word_tokenize
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn

from IPython.display import display, HTML

def browser_alert(message):
    """Pop up a JavaScript ``alert`` dialog in the notebook front end.

    The message is serialised with ``json.dumps`` so that quotes, backslashes
    and newlines cannot break out of the generated JavaScript string literal.

    :param message: text to show; converted with ``str`` first.
    """
    # "</" is escaped so the message can never close the <script> element early.
    js_literal = json.dumps(str(message)).replace("</", "<\\/")
    display(HTML('<script type="text/javascript">alert(' + js_literal + ');</script>'))
def browser_notify(message):
    """Raise a browser ``Notification`` titled "Jupyter Notification".

    The message is serialised with ``json.dumps`` so that quotes, backslashes
    and newlines cannot break out of the generated JavaScript string literal.

    :param message: notification body; converted with ``str`` first.
    """
    # "</" is escaped so the message can never close the <script> element early.
    js_literal = json.dumps(str(message)).replace("</", "<\\/")
    display(HTML(
        '<script type="text/javascript">var notification=new Notification('
        '"Jupyter Notification",{icon:"",body:' + js_literal + '});</script>'
    ))

# browser_notify("random")

import numpy
import sys
import scipy.stats

def evaluate(pred, gold):
    """Score a prediction file against a gold file.

    NOTE(review): this function was reconstructed from a damaged copy in which
    many statements were missing; the surviving comments, conditions and error
    messages were kept. Confirm against the original evaluation script.

    Both files hold one tab-separated record per line with exactly four
    fields; field 0 is an integer tweet id and field 3 is the score.

    :param pred: path to the predictions file.
    :param gold: path to the gold-standard file.
    :returns: ``(pearson, spearman, pearson_05_1, spearman_05_1)`` where the
        last two are computed only over tweets whose gold score is >= 0.5;
        ``(0, 0, 0, 0)`` when either score list is constant.
    :raises ValueError: on differing line counts, malformed lines, unknown
        tweet ids, or ids that do not end up with exactly one gold and one
        predicted score.
    """
    with open(pred, "rb") as pred_file:
        pred_lines = pred_file.readlines()
    with open(gold, "rb") as gold_file:
        gold_lines = gold_file.readlines()

    if len(pred_lines) != len(gold_lines):
        raise ValueError('Predictions and gold data have different number of lines.')

    # align tweets ids with gold scores and predictions
    data_dic = {}
    for line in gold_lines:
        parts = line.decode().split('\t')
        if len(parts) != 4:
            raise ValueError('Format problem.')
        data_dic[int(parts[0])] = [float(parts[3])]

    for line in pred_lines:
        parts = line.decode().split('\t')
        if len(parts) != 4:
            raise ValueError('Format problem.')
        tweet_id = int(parts[0])
        if tweet_id not in data_dic:
            raise ValueError('Invalid tweet id.')
        try:
            predicted = float(parts[3])
        except ValueError:
            # Invalid predictions are replaced by a default value.
            # NOTE(review): 0.5 is assumed; the original value was not
            # visible in the damaged source.
            predicted = 0.5
        data_dic[tweet_id].append(predicted)

    # lists storing gold and prediction scores
    gold_scores = []
    pred_scores = []
    # lists storing gold and prediction scores where gold score >= 0.5
    gold_scores_range_05_1 = []
    pred_scores_range_05_1 = []
    for tweet_id, scores in data_dic.items():
        if len(scores) != 2:
            raise ValueError('Repeated id in test data.')
        gold_score, pred_score = scores
        gold_scores.append(gold_score)
        pred_scores.append(pred_score)
        if gold_score >= 0.5:
            gold_scores_range_05_1.append(gold_score)
            pred_scores_range_05_1.append(pred_score)

    # return zero correlation if predictions are constant
    if numpy.std(pred_scores) == 0 or numpy.std(gold_scores) == 0:
        return (0, 0, 0, 0)

    pears_corr = scipy.stats.pearsonr(pred_scores, gold_scores)[0]
    spear_corr = scipy.stats.spearmanr(pred_scores, gold_scores)[0]
    pears_corr_range_05_1 = scipy.stats.pearsonr(pred_scores_range_05_1, gold_scores_range_05_1)[0]
    spear_corr_range_05_1 = scipy.stats.spearmanr(pred_scores_range_05_1, gold_scores_range_05_1)[0]

    return (pears_corr, spear_corr, pears_corr_range_05_1, spear_corr_range_05_1)
def evaluate_lists(pred, gold):
    """Score in-memory predictions against gold scores.

    NOTE(review): reconstructed from a damaged copy in which the loop body,
    the correlation computations and the ``else`` were missing; confirm
    against the original notebook.

    :param pred: sequence of predicted scores.
    :param gold: sequence of gold scores, aligned with ``pred`` by position.
    :returns: numpy array ``[pearson, spearman, pearson_05_1, spearman_05_1]``
        where the last two use only positions whose gold score is >= 0.5;
        the tuple ``(0, 0, 0, 0)`` when either input is constant.
    :raises ValueError: if the two sequences differ in length.
    """
    if len(pred) != len(gold):
        raise ValueError('Predictions and gold data have different number of lines.')

    gold_scores = gold
    pred_scores = pred

    # lists storing gold and prediction scores where gold score >= 0.5
    gold_scores_range_05_1 = []
    pred_scores_range_05_1 = []
    for gold_score, pred_score in zip(gold_scores, pred_scores):
        if gold_score >= 0.5:
            gold_scores_range_05_1.append(gold_score)
            pred_scores_range_05_1.append(pred_score)

    # return zero correlation if predictions are constant
    if numpy.std(pred_scores) == 0 or numpy.std(gold_scores) == 0:
        return (0, 0, 0, 0)

    pears_corr = scipy.stats.pearsonr(pred_scores, gold_scores)[0]
    spear_corr = scipy.stats.spearmanr(pred_scores, gold_scores)[0]
    pears_corr_range_05_1 = scipy.stats.pearsonr(pred_scores_range_05_1, gold_scores_range_05_1)[0]
    spear_corr_range_05_1 = scipy.stats.spearmanr(pred_scores_range_05_1, gold_scores_range_05_1)[0]

    return numpy.array([pears_corr, spear_corr, pears_corr_range_05_1, spear_corr_range_05_1])

Load pre-trained word vectors

import gc

import gensim
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Directory holding the pre-trained embedding files loaded below.
word_vector_path = "/home/v2john/"
# Project root; dataset, prediction and lexicon paths are built from it.
wassa_home = "/home/v2john/WASSA-Task/"

Word2Vec + GloVe

def loadGloveModel(gloveFile):
    """Load a whitespace-separated GloVe text file into a dict.

    Each line is expected to be ``<word> <float> <float> ...``. Lines that
    cannot be parsed (blank, or with non-numeric components) are reported on
    stdout with their 1-based line number and skipped.

    :param gloveFile: path to the GloVe ``.txt`` file.
    :returns: dict mapping each word to a 1-D ``numpy`` array.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the file even if reading fails part-way.
    with open(gloveFile, 'r', encoding="utf-8") as glove_file:
        # enumerate keeps the reported line number right after earlier failures.
        for num, line in enumerate(glove_file, start=1):
            try:
                splitLine = line.split()
                word = splitLine[0]
                model[word] = np.array([float(val) for val in splitLine[1:]])
            except (IndexError, ValueError):
                print("Failed at line " + str(num))
    print("Done.",len(model)," words loaded!")
    return model

# Google news pretrained vectors (binary word2vec format, loaded via gensim)
wv_model_path = word_vector_path + "GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True, unicode_errors='ignore')

# Twitter pretrained vectors (binary word2vec format, loaded via gensim)
wv_model_path_1 = word_vector_path + "word2vec_twitter_model.bin"
wv_model_1 = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path_1, binary=True, unicode_errors='ignore')

# GloVe text files, each parsed into a plain dict by loadGloveModel
wv_model_path_2 = word_vector_path + "glove.twitter.27B.200d.txt"
wv_model_2 = loadGloveModel(wv_model_path_2)

wv_model_path_3 = word_vector_path + "glove.6B.300d.txt"
wv_model_3 = loadGloveModel(wv_model_path_3)

wv_model_path_4 = word_vector_path + "glove.42B.300d.txt"
wv_model_4 = loadGloveModel(wv_model_path_4)

wv_model_path_5 = word_vector_path + "glove.840B.300d.txt"
wv_model_5 = loadGloveModel(wv_model_path_5)

# Embedding width of each loaded model, measured on the probe token 'word'
# (raises KeyError if a model's vocabulary lacks that token).
w2v_dimensions = len(wv_model['word'])
w2v_dimensions_1 = len(wv_model_1['word'])
w2v_dimensions_2 = len(wv_model_2['word'])
w2v_dimensions_3 = len(wv_model_3['word'])
w2v_dimensions_4 = len(wv_model_4['word'])
w2v_dimensions_5 = len(wv_model_5['word'])

# The original call was left unclosed; the sixth dimension and the closing
# parenthesis are restored here.
print(w2v_dimensions, w2v_dimensions_1,
      w2v_dimensions_2, w2v_dimensions_3, w2v_dimensions_4,
      w2v_dimensions_5)

def get_word2vec_embedding(tweet, model, dimensions):
    """Average the embeddings of a tweet's tokens.

    NOTE(review): the ``try`` body and the ``else`` were missing from the
    damaged source and have been reconstructed; confirm against the original.

    :param tweet: tweet text; tokenised with ``word_tokenize``.
    :param model: mapping supporting ``model[token]`` and raising ``KeyError``
        for unknown tokens (a gensim ``KeyedVectors`` or a plain dict).
    :param dimensions: vector width, used only when no token is known.
    :returns: the mean vector of all known tokens, or a list of
        ``dimensions`` zeros when no token is in the model.
    """
    vector_list = []
    for token in word_tokenize(tweet):
        try:
            vector_list.append(model[token])
        except KeyError:
            # Out-of-vocabulary tokens simply do not contribute to the mean.
            continue

    if not vector_list:
        return np.zeros(dimensions).tolist()
    return sum(vector_list) / float(len(vector_list))

# wv_model = None
# wv_model_1 = None
# wv_model_2 = None
# wv_model_3 = None
# wv_model_4 = None
# wv_model_5 = None

# Signal in the browser that the (slow) embedding loading above has finished.
browser_notify("Embeddings learnt")


def remove_stopwords(string):
    """Drop English stopwords from a whitespace-separated string.

    :param string: input text; split on whitespace, matched case-sensitively.
    :returns: the remaining words joined by single spaces.
    """
    # Build the lookup once per call; the original re-read the stopword list
    # and scanned it linearly for every single word.
    english_stopwords = frozenset(stopwords.words('english'))
    return " ".join(
        word for word in string.split() if word not in english_stopwords
    )

# wnl = WordNetLemmatizer()

def clean_str(string):
    """Normalise tweet or lexicon text before feature lookup.

    Unescapes HTML entities, strips negation markers, @-mentions and some
    punctuation, expands common English contractions, collapses whitespace,
    lower-cases the result and removes stopwords via ``remove_stopwords``.

    :param string: raw text.
    :returns: cleaned, lower-cased, stopword-free text.
    """
    string = html.unescape(string)
    string = string.replace("\\n", " ")  # literal backslash-n sequences
    # Strip the longer marker first: removing "_NEG" first turned "_NEGFIRST"
    # into "FIRST" and left that residue in the text.
    string = string.replace("_NEGFIRST", "")
    string = string.replace("_NEG", "")
    string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string) # removing any twitter handle mentions
    string = re.sub(r"#", "", string)
    string = re.sub(r"\*", "", string)
    # Contractions: possessive/'s dropped, the rest expanded.
    string = re.sub(r"\'s", "", string)
    string = re.sub(r"\'m", " am", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'d", " would", string)
    string = re.sub(r"\'ll", " will", string)
    # Punctuation: commas and parentheses removed, ! and ? split off as tokens.
    string = re.sub(r",", "", string)
    string = re.sub(r"!", " !", string)
    string = re.sub(r"\(", "", string)
    string = re.sub(r"\)", "", string)
    string = re.sub(r"\?", " ?", string)
    string = re.sub(r"\s{2,}", " ", string)

    return remove_stopwords(string.strip().lower())

Metadata and Class Definitions

class Tweet(object):
    """One tweet record: id, text, emotion label and intensity score."""

    def __init__(self, id, text, emotion, intensity):
        """Store the four fields unchanged.

        :param id: tweet identifier as read from the data file.
        :param text: tweet text.
        :param emotion: emotion label.
        :param intensity: intensity score, or ``None`` for unlabeled data.
        """
        # The damaged source had lost the "self.id" target of this assignment.
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity

    def __repr__(self):
        # format() renders a float or None intensity without the TypeError
        # that plain string concatenation raised.
        return "id: {}, text: {}, emotion: {}, intensity: {}".format(
            self.id, self.text, self.emotion, self.intensity
        )

def read_training_data(training_data_file_path):
    """Read labeled tweets, cleaning the text column with ``clean_str``.

    Each line is stripped and split on tabs into id, text, emotion and a
    float intensity.

    :param training_data_file_path: path to the tab-separated training file.
    :returns: list of ``Tweet`` objects in file order.
    """
    with open(training_data_file_path) as input_file:
        rows = (raw_line.strip().split('\t') for raw_line in input_file)
        return [
            Tweet(row[0], clean_str(row[1]), row[2], float(row[3]))
            for row in rows
        ]
def read_training_data_verbatim(training_data_file_path):
    """Read labeled tweets, keeping the text column exactly as stored.

    :param training_data_file_path: path to the tab-separated training file
        (id, text, emotion, intensity per line).
    :returns: list of ``Tweet`` objects in file order.
    """
    tweets = []
    with open(training_data_file_path) as input_file:
        for raw_line in input_file:
            tweet_id, text, label, score = raw_line.strip().split('\t')[:4]
            tweets.append(Tweet(tweet_id, text, label, float(score)))
    return tweets
def read_test_data(training_data_file_path):
    """Read unlabeled tweets; intensity is set to ``None`` on every record.

    :param training_data_file_path: path to the tab-separated test file; the
        first three columns are id, text and emotion.
    :returns: list of ``Tweet`` objects with cleaned text, in file order.
    """
    with open(training_data_file_path) as input_file:
        records = [raw_line.strip().split('\t') for raw_line in input_file]
        return [
            Tweet(fields[0], clean_str(fields[1]), fields[2], None)
            for fields in records
        ]

# Emotion processed in this run; it selects data files and lexicon rows.
emotion = "anger"

# Labeled training data for the selected emotion.
training_data_file_path = \
    wassa_home + "dataset/" + \
    emotion + "-ratings-0to1.train.txt"
# Output file for predictions.
predictions_file_path = \
    wassa_home + "predictions/" + \
    emotion + "-pred.txt"
# NOTE(review): the suffix literal is empty here and in test_data_file_path;
# the file-name suffix may have been lost — confirm.
dev_set_path = \
    wassa_home + "dataset/dev-set/" + \
    emotion + ""
test_data_file_path = \
    wassa_home + "dataset/test-set/" + \
    emotion + ""
# Per-emotion TSV under the test-set debug directory.
debug_file_path = \
    wassa_home + "dataset/test-set/debug/" + \
    emotion + ".tsv"

# print(training_data_file_path, predictions_file_path, dev_set_path, test_data_file_path)

Feature Extraction Snippets

Emoji Intensity

# Load the emoji lexicon: a JSON list of objects with "emoji", "name" and
# "polarity" keys.
with open(wassa_home + 'lexicons/emoji_map.json') as emoji_file:
    emoji_list = json.load(emoji_file)
# Index by the emoji character itself -> (name, polarity).
emoji_dict = dict()
for emoji in emoji_list:
    emoji_dict[emoji["emoji"]] = (emoji["name"], emoji["polarity"])

# print(emoji_dict["💯"])

# Degree-5 polynomial expansion applied to the emoji score.
poly_emoji_intensity = PolynomialFeatures(5)
# poly_emoji_intensity = PolynomialFeatures(1)

def get_emoji_intensity(tweet):
    """Build the emoji-polarity feature vector for one tweet.

    Sums ``occurrences * polarity`` over every emoji in ``emoji_dict``, then
    expands that scalar with ``poly_emoji_intensity`` and normalises it.

    :param tweet: tweet text.
    :returns: 1-D array of normalised polynomial features.
    """
    total = 0.0
    for symbol, (_name, polarity) in emoji_dict.items():
        total += tweet.count(symbol) * polarity
    as_row = np.array([total]).reshape(1, -1)
    expanded = poly_emoji_intensity.fit_transform(as_row)[0]
    return normalize(expanded.reshape(1, -1))[0]

# get_emoji_intensity("💯")

In [ ]:
# Path of the word-level affect intensity lexicon opened by
# get_word_affect_intensity_dict.
# NOTE(review): the expression ends in a dangling "+ \" — the path literal
# that followed appears to be missing from this copy and must be restored.
affect_intensity_file_path = \
    wassa_home + \

def get_word_affect_intensity_dict(emotion):
    """Load word intensities for one emotion from the affect lexicon.

    Each line is tab-separated: word, score, emotion. Only rows whose third
    column equals ``emotion`` are kept.

    :param emotion: emotion label to filter on.
    :returns: dict mapping word to float intensity.
    """
    selected = {}
    with open(affect_intensity_file_path) as lexicon:
        for raw_line in lexicon:
            fields = raw_line.replace("\n", "").split("\t")
            if fields[2] != emotion:
                continue
            selected[fields[0]] = float(fields[1])
    return selected

# Word -> intensity table for the emotion selected above.
word_intensities = get_word_affect_intensity_dict(emotion)

# Degree-10 polynomial expansion applied to the affect-intensity score.
poly_emo_int = PolynomialFeatures(10)
# poly_emo_int = PolynomialFeatures(1)

def get_emo_int_vector(tweet):
    """Build the affect-intensity feature vector for one tweet.

    Every lexicon word found in ``tweet`` as a substring adds
    ``occurrences * intensity`` to a scalar score, which is then expanded
    with ``poly_emo_int`` and normalised.

    :param tweet: tweet text.
    :returns: 1-D array of normalised polynomial features.
    """
    total = 0.0
    for term, weight in word_intensities.items():
        if term not in tweet:
            continue
        total += tweet.count(term) * float(weight)
    as_row = np.array([total]).reshape(1, -1)
    expanded = poly_emo_int.fit_transform(as_row)[0]
    return normalize(expanded.reshape(1, -1))[0]
#     return [score]

# get_emo_int_vector("furious")


# Degree-5 polynomial expansion applied to the SentiWordNet (pos, neg) scores.
poly_sentiwordnet = PolynomialFeatures(5)
# poly_sentiwordnet = PolynomialFeatures(1)

def get_sentiwordnetscore(tweet):
    """Build the SentiWordNet feature vector for one tweet.

    For each whitespace token, the positive and negative scores of its first
    sentiment synset (if any) are accumulated; the two totals are expanded
    with ``poly_sentiwordnet`` and normalised.

    :param tweet: tweet text.
    :returns: 1-D array of normalised polynomial features.
    """
    totals = np.zeros(2)
    for token in tweet.split():
        synsets = list(swn.senti_synsets(token))
        if not synsets:
            continue
        first_sense = synsets[0]
        totals[0] += first_sense.pos_score()
        totals[1] += first_sense.neg_score()
    as_row = np.array([totals]).reshape(1, -1)
    expanded = poly_sentiwordnet.fit_transform(as_row)[0]
    return normalize(expanded.reshape(1, -1))[0]

# get_sentiwordnetscore("furious")

In [ ]:
# Path of the emotion lexicon opened by get_affect_presence_list.
# NOTE(review): the expression ends in a dangling "+ \" — the file-name
# literal that followed the directory appears to be missing from this copy.
sentiment_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emotion-Lexicon-v0.92/" + \

def get_affect_presence_list(emotion):
    """List the lexicon words flagged as associated with ``emotion``.

    Each line is tab-separated: word, emotion, flag. A word is kept when the
    second column equals ``emotion`` and the third is the string ``'1'``.

    NOTE(review): the body of the ``if`` was missing from the damaged source;
    appending the first column is the reconstruction — confirm.

    :param emotion: emotion label to filter on.
    :returns: list of matching words in file order.
    """
    word_list = []
    with open(sentiment_emotion_lex_file_path) as sentiment_emotion_lex_file:
        for line in sentiment_emotion_lex_file:
            word_array = line.replace("\n", "").split("\t")

            if word_array[1] == emotion and word_array[2] == '1':
                word_list.append(word_array[0])
    return word_list

# Words the emotion lexicon associates with the emotion selected above.
word_list = get_affect_presence_list(emotion)

def get_sentiment_emotion_feature(tweet):
    """Flag whether the tweet contains any word from ``word_list``.

    :param tweet: tweet text; matched on whitespace-separated tokens.
    :returns: length-1 array holding ``1.0`` if any lexicon word occurs as a
        token, otherwise ``0.0``.
    """
    # Tokenise once into a set; the original re-split the tweet and scanned
    # the resulting list for every lexicon word.
    tokens = set(tweet.split())
    vector = np.zeros(1)
    if any(word in tokens for word in word_list):
        vector[0] = 1.0
    return vector

# get_sentiment_emotion_feature("furious")

In [ ]:
# Path of the hashtag emotion lexicon opened by get_hashtag_emotion_intensity.
# NOTE(review): the expression ends in a dangling "+ \" — the file-name
# literal that followed the directory appears to be missing from this copy.
hashtag_emotion_lex_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/" + \
def get_hashtag_emotion_intensity(emotion):
    """Load hashtag-lexicon intensities for one emotion.

    Each line is tab-separated: emotion, term, score. Terms are normalised
    with ``clean_str`` before being used as keys.

    :param emotion: emotion label to filter on (first column).
    :returns: dict mapping cleaned term to float score.
    """
    intensities = {}
    with open(hashtag_emotion_lex_file_path) as lexicon:
        for raw_line in lexicon:
            fields = raw_line.replace("\n", "").split("\t")
            if fields[0] != emotion:
                continue
            intensities[clean_str(fields[1])] = float(fields[2])
    return intensities

# Cleaned term -> score table for the emotion selected above.
hashtag_emotion_intensities = get_hashtag_emotion_intensity(emotion)

# Degree-10 polynomial expansion applied to the hashtag-emotion score.
poly_hashtag_emotion = PolynomialFeatures(10)
# poly_hashtag_emotion = PolynomialFeatures(1)

def get_hashtag_emotion_vector(tweet):
    """Build the hashtag-emotion feature vector for one tweet.

    Every lexicon term found in ``tweet`` as a substring adds
    ``occurrences * score`` to a scalar, which is then expanded with
    ``poly_hashtag_emotion`` and normalised.

    :param tweet: tweet text.
    :returns: 1-D array of normalised polynomial features.
    """
    total = 0.0
    for term, weight in hashtag_emotion_intensities.items():
        if term not in tweet:
            continue
        total += tweet.count(term) * float(weight)
    as_row = np.array([total]).reshape(1, -1)
    expanded = poly_hashtag_emotion.fit_transform(as_row)[0]
    return normalize(expanded.reshape(1, -1))[0]

# get_hashtag_emotion_vector("furious")

In [ ]:
# Paths of the emoticon lexicon files (unigrams, bigrams, pairs).
# NOTE(review): each expression ends in a dangling "+ \" — the path literals
# that followed appear to be missing from this copy and must be restored.
emoticon_lexicon_unigrams_file_path = \
    wassa_home + \
emoticon_lexicon_bigrams_file_path = \
    wassa_home + \
emoticon_lexicon_pairs_file_path = \
    wassa_home + \
pair_split_string = "---"
def get_emoticon_lexicon_unigram_dict():
    """Load the emoticon unigram lexicon.

    Each line is tab-separated: the term followed by numeric columns.

    :returns: dict mapping term to an array of its numeric columns.
    """
    with open(emoticon_lexicon_unigrams_file_path) as lexicon:
        rows = (raw_line.replace("\n", "").split("\t") for raw_line in lexicon)
        return {
            row[0]: np.array([float(cell) for cell in row[1:]])
            for row in rows
        }

def get_emoticon_lexicon_bigram_dict():
    """Load the emoticon bigram lexicon.

    Each line is tab-separated: the bigram followed by numeric columns.

    :returns: dict mapping bigram to an array of its numeric columns.
    """
    table = {}
    with open(emoticon_lexicon_bigrams_file_path) as lexicon:
        for raw_line in lexicon:
            key, *cells = raw_line.replace("\n", "").split("\t")
            table[key] = np.array([float(cell) for cell in cells])
    return table

def get_emoticon_lexicon_pairs_dict():
    """Load the emoticon pair lexicon into a two-level dict.

    Each line is tab-separated: a ``first---second`` pair followed by numeric
    columns. Both halves are normalised with ``clean_str``; pairs where
    either half cleans to an empty string are skipped.

    :returns: ``{first: {second: array_of_scores}}``.
    """
    emoticon_lexicon_pairs = {}
    with open(emoticon_lexicon_pairs_file_path) as emoticon_lexicon_file:
        for line in emoticon_lexicon_file:
            word_array = line.replace("\n", "").split("\t")
            pair = word_array[0].split(pair_split_string)
            token_1 = clean_str(pair[0])
            token_2 = clean_str(pair[1])
            if token_1 and token_2:
                # setdefault restores the lost "else" branch: reuse the inner
                # dict when it exists, create it otherwise. Without it the
                # code crashed on a new key and wiped existing entries.
                scores = np.array([float(val) for val in word_array[1:]])
                emoticon_lexicon_pairs.setdefault(token_1, {})[token_2] = scores
    return emoticon_lexicon_pairs

# Load the three emoticon lexicon tables once at import time.
emoticon_lexicon_unigram_dict = get_emoticon_lexicon_unigram_dict()

emoticon_lexicon_bigram_dict = get_emoticon_lexicon_bigram_dict()

emoticon_lexicon_pairs_dict = get_emoticon_lexicon_pairs_dict()

# Degree-5 polynomial expansion applied to the averaged emoticon scores.
poly_emoticon_lexicon = PolynomialFeatures(5)
# poly_emoticon_lexicon = PolynomialFeatures(1)

def get_unigram_sentiment_emoticon_lexicon_vector(tokens):
    """Average the emoticon unigram-lexicon scores of the given tokens.

    Tokens are normalised with ``clean_str`` before lookup; the mean of all
    matched entries (zeros if none match) is expanded with
    ``poly_emoticon_lexicon`` and normalised.

    :param tokens: iterable of token strings.
    :returns: 1-D array of normalised polynomial features.
    """
    total = np.zeros(3)  # assumes three numeric columns per entry — confirm
    hits = 0
    for raw_token in tokens:
        entry = emoticon_lexicon_unigram_dict.get(clean_str(raw_token))
        if entry is None:
            continue
        total += entry
        hits += 1
    if hits > 0:
        total /= hits
    expanded = poly_emoticon_lexicon.fit_transform(total.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0]

def get_bigram_sentiment_emoticon_lexicon_vector(tokens):
    """Average the emoticon bigram-lexicon scores of adjacent token pairs.

    Each bigram is joined with a space and normalised with ``clean_str``
    before lookup; the mean of all matched entries (zeros if none match) is
    polynomially expanded and normalised.

    :param tokens: sequence of token strings.
    :returns: list of normalised polynomial features.
    """
    vector_list = np.zeros(3)  # assumes three numeric columns per entry — confirm
    counter = 0
    for bi_token in bigrams(tokens):
        word = clean_str(" ".join(bi_token))
        if word in emoticon_lexicon_bigram_dict:
            vector_list += emoticon_lexicon_bigram_dict[word]
            counter += 1
    if counter > 0:
        vector_list /= counter
    # Use this lexicon's own transformer, as the unigram variant does; the
    # original borrowed poly_emoji_intensity (same degree, so same output).
    expanded = poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0].tolist()

def get_pair_sentiment_emoticon_lexicon_vector(tokens):
    """Average the emoticon pair-lexicon scores over ordered token pairs.

    For every position ``i`` whose cleaned token is a first-half key, each
    cleaned token at position ``j >= i`` (including ``i`` itself) is looked
    up as the second half. The mean of all matched entries (zeros if none
    match) is polynomially expanded and normalised.

    :param tokens: sequence of token strings.
    :returns: list of normalised polynomial features.
    """
    # Clean every token once instead of once per (i, j) combination.
    cleaned = [clean_str(token) for token in tokens]
    vector_list = np.zeros(3)  # assumes three numeric columns per entry — confirm
    counter = 0
    for i, word_1 in enumerate(cleaned):
        token_1_dict = emoticon_lexicon_pairs_dict.get(word_1)
        if token_1_dict is None:
            continue
        for word_2 in cleaned[i:]:
            if word_2 in token_1_dict:
                vector_list += token_1_dict[word_2]
                counter += 1
    if counter > 0:
        vector_list /= counter
    # Use this lexicon's own transformer, as the unigram variant does; the
    # original borrowed poly_emoji_intensity (same degree, so same output).
    expanded = poly_emoticon_lexicon.fit_transform(vector_list.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0].tolist()

def get_sentiment_emoticon_lexicon_vector(tweet):
    """Concatenate the emoticon-lexicon feature groups for one tweet.

    NOTE(review): the three ``np.append(`` calls were truncated in the
    damaged source. They are reconstructed as unigram, bigram and pair
    features, mirroring ``get_sentiment_hash_sent_lex_vector`` — confirm
    against the original notebook.

    :param tweet: tweet text; tokenised with ``word_tokenize``.
    :returns: 1-D array holding the feature groups back to back.
    """
    tokens = word_tokenize(tweet)
    final_list = np.asarray([])
    # Adding unigram features
    final_list = np.append(
        final_list, get_unigram_sentiment_emoticon_lexicon_vector(tokens))
    # Adding bigram features
    final_list = np.append(
        final_list, get_bigram_sentiment_emoticon_lexicon_vector(tokens))
    # Adding pair features
    final_list = np.append(
        final_list, get_pair_sentiment_emoticon_lexicon_vector(tokens))

    return final_list

# get_sentiment_emoticon_lexicon_vector("furious")

In [ ]:
# Paths of the emoticon AffLexNegLex files (unigrams, bigrams).
# NOTE(review): each expression ends in a dangling "+ \" — the file-name
# literals that followed the directory appear to be missing from this copy.
emoticon_afflex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
emoticon_afflex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
def get_emoticon_afflex_unigram_dict():
    """Load the emoticon AffLexNegLex unigram lexicon.

    Each line is tab-separated: the term followed by numeric columns.

    :returns: dict mapping term to an array of its numeric columns.
    """
    with open(emoticon_afflex_unigrams_file_path) as lexicon:
        rows = (raw_line.replace("\n", "").split("\t") for raw_line in lexicon)
        return {
            row[0]: np.array([float(cell) for cell in row[1:]])
            for row in rows
        }

def get_emoticon_afflex_bigram_dict():
    """Load the emoticon AffLexNegLex bigram lexicon.

    Each line is tab-separated: the bigram followed by numeric columns.

    :returns: dict mapping bigram to an array of its numeric columns.
    """
    table = {}
    with open(emoticon_afflex_bigrams_file_path) as lexicon:
        for raw_line in lexicon:
            key, *cells = raw_line.replace("\n", "").split("\t")
            table[key] = np.array([float(cell) for cell in cells])
    return table

# Load both emoticon AffLexNegLex tables once at import time.
emoticon_afflex_unigram_dict = get_emoticon_afflex_unigram_dict()

emoticon_afflex_bigram_dict = get_emoticon_afflex_bigram_dict()

# Degree-5 polynomial expansion for the averaged AffLexNegLex scores.
# Rebinds a module-level name that is also assigned earlier in this file.
poly_emoticon_lexicon = PolynomialFeatures(5)
# poly_emoticon_lexicon = PolynomialFeatures(1)

def get_unigram_sentiment_emoticon_afflex_vector(tokens):
    """Average the emoticon AffLexNegLex unigram scores of the given tokens.

    Tokens are normalised with ``clean_str`` before lookup; the mean of all
    matched entries (zeros if none match) is expanded with
    ``poly_emoticon_lexicon`` and normalised.

    :param tokens: iterable of token strings.
    :returns: 1-D array of normalised polynomial features.
    """
    total = np.zeros(3)  # assumes three numeric columns per entry — confirm
    hits = 0
    for raw_token in tokens:
        entry = emoticon_afflex_unigram_dict.get(clean_str(raw_token))
        if entry is None:
            continue
        total += entry
        hits += 1
    if hits > 0:
        total /= hits
    expanded = poly_emoticon_lexicon.fit_transform(total.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0]

def get_bigram_sentiment_emoticon_afflex_vector(tokens):
    """Average the emoticon AffLexNegLex bigram scores of adjacent token pairs.

    Each bigram is joined with a space and normalised with ``clean_str``
    before lookup; the mean of all matched entries (zeros if none match) is
    expanded with ``poly_emoticon_lexicon`` and normalised.

    :param tokens: sequence of token strings.
    :returns: 1-D array of normalised polynomial features.
    """
    total = np.zeros(3)  # assumes three numeric columns per entry — confirm
    hits = 0
    for token_pair in bigrams(tokens):
        entry = emoticon_afflex_bigram_dict.get(clean_str(" ".join(token_pair)))
        if entry is None:
            continue
        total += entry
        hits += 1
    if hits > 0:
        total /= hits
    expanded = poly_emoticon_lexicon.fit_transform(total.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0]

def get_sentiment_emoticon_afflex_vector(tweet):
    """Concatenate the emoticon AffLexNegLex feature groups for one tweet.

    :param tweet: tweet text; tokenised with ``word_tokenize``.
    :returns: 1-D array with the unigram features followed by the bigram
        features.
    """
    combined = np.asarray([])
    tokens = word_tokenize(tweet)
    # Unigram features first, then bigram features.
    for extractor in (
        get_unigram_sentiment_emoticon_afflex_vector,
        get_bigram_sentiment_emoticon_afflex_vector,
    ):
        combined = np.append(combined, extractor(tokens))
    return combined

# get_sentiment_emoticon_afflex_vector("furious")

In [ ]:
# Paths of the hashtag-sentiment AffLexNegLex files (unigrams, bigrams).
# NOTE(review): each expression ends in a dangling "+ \" — the file-name
# literals that followed the directory appear to be missing from this copy.
hashtag_affneglex_unigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
hashtag_affneglex_bigrams_file_path = \
    wassa_home + \
    "lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
def get_hashtag_affneglex_unigram_dict():
    """Load the hashtag-sentiment AffLexNegLex unigram lexicon.

    Each line is tab-separated: the term followed by numeric columns. Terms
    are normalised with ``clean_str`` before being used as keys.

    :returns: dict mapping cleaned term to an array of its numeric columns.
    """
    with open(hashtag_affneglex_unigrams_file_path) as lexicon:
        rows = (raw_line.replace("\n", "").split("\t") for raw_line in lexicon)
        return {
            clean_str(row[0]): np.array([float(cell) for cell in row[1:]])
            for row in rows
        }

def get_hashtag_affneglex_bigram_dict():
    """Load the hashtag-sentiment AffLexNegLex bigram lexicon.

    Each line is tab-separated: the bigram followed by numeric columns.
    Bigrams are normalised with ``clean_str`` before being used as keys.

    :returns: dict mapping cleaned bigram to an array of its numeric columns.
    """
    table = {}
    with open(hashtag_affneglex_bigrams_file_path) as lexicon:
        for raw_line in lexicon:
            key, *cells = raw_line.replace("\n", "").split("\t")
            table[clean_str(key)] = np.array([float(cell) for cell in cells])
    return table

# Load both hashtag-sentiment AffLexNegLex tables once at import time.
hashtag_affneglex_unigram_dict = get_hashtag_affneglex_unigram_dict()

hashtag_affneglex_bigram_dict = get_hashtag_affneglex_bigram_dict()

# Degree-5 polynomial expansion for the averaged hashtag AffLexNegLex scores.
poly_hashtag_sent_affneglex = PolynomialFeatures(5)
# poly_hashtag_sent_affneglex = PolynomialFeatures(1)

def get_unigram_sentiment_hashtag_affneglex_vector(tokens):
    """Average the hashtag AffLexNegLex unigram scores of the given tokens.

    Tokens are normalised with ``clean_str`` before lookup; the mean of all
    matched entries (zeros if none match) is expanded with
    ``poly_hashtag_sent_affneglex`` and normalised.

    :param tokens: iterable of token strings.
    :returns: 1-D array of normalised polynomial features.
    """
    total = np.zeros(3)  # assumes three numeric columns per entry — confirm
    hits = 0
    for raw_token in tokens:
        entry = hashtag_affneglex_unigram_dict.get(clean_str(raw_token))
        if entry is None:
            continue
        total += entry
        hits += 1
    if hits > 0:
        total /= hits
    expanded = poly_hashtag_sent_affneglex.fit_transform(total.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0]

def get_bigram_sentiment_hashtag_affneglex_vector(tokens):
    """Average the hashtag AffLexNegLex bigram scores of adjacent token pairs.

    Each bigram is joined with a space and normalised with ``clean_str``
    before lookup; the mean of all matched entries (zeros if none match) is
    expanded with ``poly_hashtag_sent_affneglex`` and normalised.

    :param tokens: sequence of token strings.
    :returns: 1-D array of normalised polynomial features.
    """
    total = np.zeros(3)  # assumes three numeric columns per entry — confirm
    hits = 0
    for token_pair in bigrams(tokens):
        entry = hashtag_affneglex_bigram_dict.get(clean_str(" ".join(token_pair)))
        if entry is None:
            continue
        total += entry
        hits += 1
    if hits > 0:
        total /= hits
    expanded = poly_hashtag_sent_affneglex.fit_transform(total.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0]

def get_sentiment_hashtag_affneglex_vector(tweet):
    """Concatenate the hashtag AffLexNegLex feature groups for one tweet.

    :param tweet: tweet text; tokenised with ``word_tokenize``.
    :returns: 1-D array with the unigram features followed by the bigram
        features.
    """
    combined = np.asarray([])
    tokens = word_tokenize(tweet)
    # Unigram features first, then bigram features.
    for extractor in (
        get_unigram_sentiment_hashtag_affneglex_vector,
        get_bigram_sentiment_hashtag_affneglex_vector,
    ):
        combined = np.append(combined, extractor(tokens))
    return combined

# get_sentiment_hashtag_affneglex_vector("furious")

In [ ]:
# Paths of the hashtag sentiment lexicon files (unigrams, bigrams, pairs).
# NOTE(review): each expression ends in a dangling "+ \" — the path literals
# that followed appear to be missing from this copy and must be restored.
hash_sent_lex_unigrams_file_path = \
    wassa_home + \
hash_sent_lex_bigrams_file_path = \
    wassa_home + \
hash_sent_lex_pairs_file_path = \
    wassa_home + \
pair_split_string = "---"

def get_hash_sent_lex_unigram_dict():
    """Load the hashtag sentiment unigram lexicon.

    Each line is tab-separated: the term followed by numeric columns. Rows
    whose term cleans to an empty string under ``clean_str`` are skipped;
    kept rows are keyed by the raw, uncleaned term.

    :returns: dict mapping raw term to an array of its numeric columns.
    """
    table = {}
    with open(hash_sent_lex_unigrams_file_path) as lexicon:
        for raw_line in lexicon:
            fields = raw_line.replace("\n", "").split("\t")
            if not clean_str(fields[0]):
                continue
            table[fields[0]] = np.array([float(cell) for cell in fields[1:]])
    return table

def get_hash_sent_lex_bigram_dict():
    """Load the hashtag sentiment bigram lexicon.

    Each line is tab-separated: the bigram followed by numeric columns. Rows
    whose bigram cleans to an empty string under ``clean_str`` are skipped;
    kept rows are keyed by the raw, uncleaned bigram.

    :returns: dict mapping raw bigram to an array of its numeric columns.
    """
    with open(hash_sent_lex_bigrams_file_path) as lexicon:
        rows = (raw_line.replace("\n", "").split("\t") for raw_line in lexicon)
        return {
            row[0]: np.array([float(cell) for cell in row[1:]])
            for row in rows
            if clean_str(row[0])
        }

def get_hash_sent_lex_pairs_dict():
    """Load the hashtag sentiment pair lexicon into a two-level dict.

    Each line is tab-separated: a ``first---second`` pair followed by numeric
    columns. Both halves are normalised with ``clean_str``; pairs where
    either half cleans to an empty string are skipped.

    :returns: ``{first: {second: array_of_scores}}``.
    """
    hash_sent_lex_pairs = {}
    with open(hash_sent_lex_pairs_file_path) as hash_sent_lex_file:
        for line in hash_sent_lex_file:
            word_array = line.replace("\n", "").split("\t")
            pair = word_array[0].split(pair_split_string)
            token_1 = clean_str(pair[0])
            token_2 = clean_str(pair[1])
            if token_1 and token_2:
                # setdefault restores the lost "else" branch: reuse the inner
                # dict when it exists, create it otherwise. Without it the
                # code crashed on a new key and wiped existing entries.
                scores = np.array([float(val) for val in word_array[1:]])
                hash_sent_lex_pairs.setdefault(token_1, {})[token_2] = scores
    return hash_sent_lex_pairs

# Load the three hashtag sentiment lexicon tables once at import time.
hash_sent_lex_unigram_dict = get_hash_sent_lex_unigram_dict()

hash_sent_lex_bigram_dict = get_hash_sent_lex_bigram_dict()

hash_sent_lex_pairs_dict = get_hash_sent_lex_pairs_dict()

# Degree-5 polynomial expansion for the averaged hashtag sentiment scores.
poly_hash_sent_lex = PolynomialFeatures(5)
# poly_hash_sent_lex = PolynomialFeatures(1)

def get_unigram_sentiment_hash_sent_lex_vector(tokens):
    """Average the hashtag sentiment unigram scores of the given tokens.

    Tokens are normalised with ``clean_str`` before lookup; the mean of all
    matched entries (zeros if none match) is expanded with
    ``poly_hash_sent_lex`` and normalised.

    :param tokens: iterable of token strings.
    :returns: 1-D array of normalised polynomial features.
    """
    total = np.zeros(3)  # assumes three numeric columns per entry — confirm
    hits = 0
    for raw_token in tokens:
        entry = hash_sent_lex_unigram_dict.get(clean_str(raw_token))
        if entry is None:
            continue
        total += entry
        hits += 1
    if hits > 0:
        total /= hits
    expanded = poly_hash_sent_lex.fit_transform(total.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0]
def get_bigram_sentiment_hash_sent_lex_vector(tokens):
    """Average the hashtag sentiment bigram scores of adjacent token pairs.

    Each bigram is joined with a space and normalised with ``clean_str``
    before lookup; the mean of all matched entries (zeros if none match) is
    expanded with ``poly_hash_sent_lex`` and normalised.

    :param tokens: sequence of token strings.
    :returns: 1-D array of normalised polynomial features.
    """
    total = np.zeros(3)  # assumes three numeric columns per entry — confirm
    hits = 0
    for token_pair in bigrams(tokens):
        entry = hash_sent_lex_bigram_dict.get(clean_str(" ".join(token_pair)))
        if entry is None:
            continue
        total += entry
        hits += 1
    if hits > 0:
        total /= hits
    expanded = poly_hash_sent_lex.fit_transform(total.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0]

def get_pair_sentiment_hash_sent_lex_vector(tokens):
    """Average the hashtag sentiment pair scores over ordered token pairs.

    For every position ``i`` whose cleaned token is a first-half key, each
    cleaned token at position ``j >= i`` (including ``i`` itself) is looked
    up as the second half. The mean of all matched entries (zeros if none
    match) is expanded with ``poly_hash_sent_lex`` and normalised.

    :param tokens: sequence of token strings.
    :returns: 1-D array of normalised polynomial features.
    """
    # Clean every token once instead of once per (i, j) combination.
    cleaned = [clean_str(token) for token in tokens]
    vector_list = np.zeros(3)  # assumes three numeric columns per entry — confirm
    counter = 0
    for i, word_1 in enumerate(cleaned):
        token_1_dict = hash_sent_lex_pairs_dict.get(word_1)
        if token_1_dict is None:
            continue
        for word_2 in cleaned[i:]:
            if word_2 in token_1_dict:
                vector_list += token_1_dict[word_2]
                counter += 1
    if counter > 0:
        vector_list /= counter
    expanded = poly_hash_sent_lex.fit_transform(vector_list.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0]
def get_sentiment_hash_sent_lex_vector(tweet):
    """Concatenate the hashtag sentiment lexicon feature groups for one tweet.

    :param tweet: tweet text; tokenised with ``word_tokenize``.
    :returns: 1-D array with unigram, bigram and pair features back to back.
    """
    combined = np.asarray([])
    tokens = word_tokenize(tweet)
    # Order matters: unigram, then bigram, then pair features.
    for extractor in (
        get_unigram_sentiment_hash_sent_lex_vector,
        get_bigram_sentiment_hash_sent_lex_vector,
        get_pair_sentiment_hash_sent_lex_vector,
    ):
        combined = np.append(combined, extractor(tokens))
    return combined

# get_sentiment_hash_sent_lex_vector("furious")

In [ ]:
# Path of the DepecheMood lexicon opened by get_depeche_vector_dict.
# NOTE(review): the expression ends in a dangling "+ \" — the path literal
# that followed appears to be missing from this copy and must be restored.
depeche_mood_file_path = \
    wassa_home + \

def get_depeche_vector_dict():
    """Load the DepecheMood lexicon.

    Each line is tab-separated: a key followed by numeric columns. The key is
    truncated at its first ``#`` before being stored, so later rows sharing
    the same prefix overwrite earlier ones.

    :returns: dict mapping truncated key to an array of its numeric columns.
    """
    table = {}
    with open(depeche_mood_file_path) as lexicon:
        for raw_line in lexicon:
            tagged_key, *cells = raw_line.replace("\n", "").split("\t")
            lemma = tagged_key.split("#")[0]
            table[lemma] = np.array([float(cell) for cell in cells])
    return table

# Load the DepecheMood table once at import time.
depeche_vector_dict = get_depeche_vector_dict()

# print(len(depeche_vector_dict["0"]))

# Degree-5 polynomial expansion for the averaged DepecheMood scores.
poly_depm = PolynomialFeatures(5)

def get_depeche_mood_vector(tweet):
    """Build the DepecheMood feature vector for one tweet.

    The tweet is tokenised with ``word_tokenize``; the mean of the lexicon
    vectors of all tokens found in ``depeche_vector_dict`` (zeros if none
    are found) is expanded with ``poly_depm`` and normalised.

    :param tweet: tweet text.
    :returns: 1-D array of normalised polynomial features.
    """
    total = np.zeros(8)  # assumes eight numeric columns per entry — confirm
    hits = 0
    for token in word_tokenize(tweet):
        entry = depeche_vector_dict.get(token)
        if entry is None:
            continue
        total += np.array(entry)
        hits += 1
    if hits > 0:
        total /= hits
    expanded = poly_depm.fit_transform(total.reshape(1, -1))[0]
    return normalize(expanded.reshape(1, -1))[0]

# get_depeche_mood_vector("rom sentences with happy sad anxious")

def is_active_vector_method(string):
    """Convert one character of a feature-selection string to an int flag.

    :param string: text accepted by ``int``, e.g. ``"0"`` or ``"1"``.
    :returns: the integer value; non-zero is treated as active by callers.
    """
    flag = int(string)
    return flag

def _feature_extractors():
    """Return the per-tweet feature extractors, ordered by feature index.

    Position ``i`` of the returned list belongs to character ``i`` of the
    feature bit-string. Every entry is a lambda so that the models and
    helper functions it names are only looked up when that feature is
    actually computed; a feature that is switched off or already cached
    never touches them.
    """
    return [
        # 0-2: pre-trained word embeddings
        lambda x: get_word2vec_embedding(x, wv_model, w2v_dimensions),
        lambda x: get_word2vec_embedding(x, wv_model_1, w2v_dimensions_1),
        lambda x: get_word2vec_embedding(x, wv_model_2, w2v_dimensions_2),
        # 3-4: NRC Emotion Intensity Lexicon, SentiWordNet score
        lambda x: get_emo_int_vector(x),
        lambda x: get_sentiwordnetscore(x),
        # 5-7: NRC Sentiment Lexica
        lambda x: get_sentiment_emotion_feature(x),
        lambda x: get_sentiment_emoticon_lexicon_vector(x),
        lambda x: get_sentiment_emoticon_afflex_vector(x),
        # 8-10: NRC Hashtag Lexica
        lambda x: get_hashtag_emotion_vector(x),
        lambda x: get_sentiment_hash_sent_lex_vector(x),
        lambda x: get_sentiment_hashtag_affneglex_vector(x),
        # 11-13: further pre-trained word embeddings
        lambda x: get_word2vec_embedding(x, wv_model_3, w2v_dimensions_3),
        lambda x: get_word2vec_embedding(x, wv_model_4, w2v_dimensions_4),
        lambda x: get_word2vec_embedding(x, wv_model_5, w2v_dimensions_5),
        # 14: emoji intensity
        lambda x: get_emoji_intensity(x),
        # 15: DepecheMood
        lambda x: get_depeche_mood_vector(x),
    ]


def vectorize_tweets(tweet_list, bin_string, vector_dict):
    """Turn tweets into feature rows using the feature groups that are on.

    Args:
        tweet_list: the tweet texts; each one is handed to the extractors.
        bin_string: one character per feature index (16 are read); a
            character whose integer value is non-zero switches that feature
            group on.
        vector_dict: cache mapping feature index -> DataFrame with one row
            per tweet. Missing active features are computed and stored here
            (the dict is mutated); features already present are reused
            as-is. The cache is keyed by feature index only, so one cache
            must only ever be used with one tweet list.

    Returns:
        A list with one row (a list of values) per tweet, made of the
        columns of all active feature groups in index order.

    Raises:
        ValueError: if no feature group is switched on.
        IndexError: if ``bin_string`` has fewer characters than there are
            feature groups.
    """
    frames = []
    for index, extract in enumerate(_feature_extractors()):
        if not is_active_vector_method(bin_string[index]):
            continue
        if index not in vector_dict:
            vector_dict[index] = DataFrame(
                [extract(tweet) for tweet in tweet_list])
        # Every active feature contributes its frame, whether it was just
        # computed or came from the cache. Without this step nothing would
        # ever reach pd.concat.
        frames.append(vector_dict[index])

    if not frames:
        raise ValueError(
            "No feature group is active in bin_string: " + str(bin_string))

    return pd.concat(frames, axis=1).values.tolist()

# _ = train_vector_dict.pop(3)
# _ = train_vector_dict.pop(4)
# _ = train_vector_dict.pop(5)
# _ = train_vector_dict.pop(6)
# _ = train_vector_dict.pop(7)
# _ = train_vector_dict.pop(8)
# _ = train_vector_dict.pop(9)
# _ = train_vector_dict.pop(10)
# _ = train_vector_dict.pop(14)

# _ = test_vector_dict.pop(3)
# _ = test_vector_dict.pop(4)
# _ = test_vector_dict.pop(5)
# _ = test_vector_dict.pop(6)
# _ = test_vector_dict.pop(7)
# _ = test_vector_dict.pop(8)
# _ = test_vector_dict.pop(9)
# _ = test_vector_dict.pop(10)
# _ = test_vector_dict.pop(14)

# train_vector_dict = dict()
# test_vector_dict = dict()

# One character per feature index, in the order used by vectorize_tweets():
# '1' switches the feature group on, '0' switches it off.
feature_string = "1111001001001100"

# Training and development tweets are read with the same reader.
training_tweets = read_training_data(training_data_file_path)
dev_tweets = read_training_data(dev_set_path)

score_train = list()
tweet_train = list()
# NOTE(review): the bodies of the next two loops are missing, so this does not
# parse as it stands. They presumably fill tweet_train and score_train from
# the training and development tweets -- restore them.
for tweet in training_tweets:

for tweet in dev_tweets:

# NOTE(review): train_vector_dict is not assigned anywhere above this line in
# this section (the only visible assignment is the pickle restore further
# down) -- confirm it exists before this call runs.
x_train = vectorize_tweets(tweet_train, feature_string, train_vector_dict)

# Number of values in one feature row.
dimension = len(x_train[0])

# print(x_train[0])

# The test file is read twice: once with the regular reader and once verbatim;
# print_predictions() writes the verbatim text next to the entries of tweet_test.
test_tweets = read_training_data(test_data_file_path)
verbatim_test_tweets = read_training_data_verbatim(test_data_file_path)

tweet_test = list()
y_gold = list()
# NOTE(review): this loop body is missing as well; it presumably fills
# tweet_test and y_gold from test_tweets -- restore it.
for tweet in test_tweets:

x_test = vectorize_tweets(tweet_test, feature_string, test_vector_dict)

browser_notify("Vectorization Done")

# Per-emotion files holding the pickled feature caches (the vector_dict
# arguments passed to vectorize_tweets).
train_vectors_path = "/home/v2john/" + emotion + "_train_vectors"
test_vectors_path = "/home/v2john/" + emotion + "_test_vectors"

# # Save vectors

# with open(train_vectors_path, 'wb') as train_vectors_file:
#     pickle.dump(train_vector_dict, train_vectors_file)

# with open(test_vectors_path, 'wb') as test_vectors_file:
#     pickle.dump(test_vector_dict, test_vectors_file)

# Restore vectors
# NOTE(review): pickle.load executes whatever the file encodes; only load
# cache files that were written by the save code above on a trusted machine.

with open(train_vectors_path, 'rb') as train_vectors_file:
    train_vector_dict = pickle.load(train_vectors_file)

with open(test_vectors_path, 'rb') as test_vectors_file:
    test_vector_dict = pickle.load(test_vectors_file)

def print_predictions(y_pred, y_gold):
    """Write a tab-separated debug file comparing predictions to gold scores.

    The file at ``debug_file_path`` gets a header line followed by one line
    per entry of ``verbatim_test_tweets``: the verbatim text, the matching
    entry of ``tweet_test``, the predicted value and the gold value. A
    browser notification is sent once the file has been written.

    Args:
        y_pred: predicted scores, indexable in test-tweet order.
        y_gold: gold scores, indexable in test-tweet order.
    """
    header = "\t".join(["VerbatimTweet", "CleanedTweet", "Predicted", "Actual"]) + "\n"
    with open(debug_file_path, 'w') as debug_file:
        debug_file.write(header)
        for row, verbatim_tweet in enumerate(verbatim_test_tweets):
            columns = [
                verbatim_tweet.text,
                tweet_test[row],
                str(y_pred[row]),
                str(y_gold[row]),
            ]
            debug_file.write("\t".join(columns) + "\n")
    browser_notify("Wrote debug tweets")

# ml_model = XGBRegressor(seed=0)

# # specify parameters and distributions to sample from
# param_dist = {
#     "max_depth": range(3, 11),
#     "n_estimators": range(100, 10000)
# }

# x_train = np.array(x_train)
# score_train = np.array(score_train)

# # run randomized search
# random_search = RandomizedSearchCV(ml_model, param_distributions=param_dist, n_iter=50)
# random_search.fit(x_train, score_train)

# browser_notify("Random search complete")

# print(random_search.best_params_)

# ml_model = random_search.best_estimator_
# ml_model = ensemble.GradientBoostingRegressor(n_estimators=10000)
# Gradient-boosted regressor with fixed hyper-parameters (the randomized
# search above is commented out).
ml_model = XGBRegressor(max_depth=3, n_estimators=30000, seed=0)

x_train = np.array(x_train)
score_train = np.array(score_train)

# NOTE(review): in the damaged source this call was fused into the previous
# line (only its ", score_train)" tail survived), which left the file
# unparseable and the model unfitted before predict(). Restored as the fit on
# the training features and scores -- confirm against the original notebook.
ml_model.fit(x_train, score_train)

y_pred = ml_model.predict(x_test)

# evaluate_lists returns the four correlation scores printed below.
score = evaluate_lists(y_pred, y_gold)

# Results as a Markdown table.
print("### " + emotion + ", feature-string: " + feature_string)
print("| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |")
print("| --- | --- | --- | --- |")
print("| " + str(score[0]) + " | " + str(score[1]) + " | " +
      str(score[2]) + " | " + str(score[3]) + " |")

# print_predictions(y_pred, y_gold)

browser_notify("Training complete")

# with open(predictions_file_path, 'w') as predictions_file:
#     for i in range(len(y_pred)):
#         predictions_file.write(
#             str(test_tweets[i].id) + "\t" + test_tweets[i].text + "\t" +
#             test_tweets[i].emotion + "\t" + str(y_pred[i]) + "\n"
#         )

# define base model
# Rows and columns of the training matrix; baseline_model() reads dim_size as
# the input dimension of its first layer.
input_size, dim_size = np.array(x_train).shape
print(input_size, dim_size)
# NOTE(review): sgd is not referenced again in this section -- baseline_model()
# compiles with optimizer="adam". Confirm whether it is still needed.
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=0.5)

def baseline_model():
    """Build and compile the baseline feed-forward regression network.

    The network has one hidden ReLU layer of 10000 units that takes
    ``dim_size`` inputs, followed by a single sigmoid output unit. It is
    compiled with mean-squared-error loss and the "adam" optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden_layer = Dense(10000, activation='relu',
                         kernel_initializer='glorot_uniform', input_dim=dim_size)
    output_layer = Dense(1, activation='sigmoid')
    # Handing the layers to the constructor adds them in the listed order.
    model = Sequential([hidden_layer, output_layer])
    model.compile(loss='mean_squared_error', optimizer="adam")
    return model

# scikit-learn style wrapper around baseline_model().
ml_model = KerasRegressor(build_fn=baseline_model, epochs=1000, batch_size=128, verbose=1)
x_train = np.array(x_train)
x_test = np.array(x_test)

# NOTE(review): in the damaged source this call was fused into the previous
# line (only its ", score_train)" tail survived), which left the file
# unparseable and the network untrained before predict(). Restored as the fit
# on the training features and scores -- confirm against the original
# notebook. The scores are converted here so the call does not depend on an
# earlier cell having done it.
ml_model.fit(x_train, np.array(score_train))

y_pred = ml_model.predict(x_test)

# print(y_pred)

# evaluate_lists returns the four correlation scores printed below.
score = evaluate_lists(y_pred, y_gold)

# Results as a Markdown table.
print("### " + emotion + ", feature_string: " + feature_string)
print("| pears-corr | spear-corr | pears-corr-range-05-1 | spear-corr-range-05-1 |")
print("| --- | --- | --- | --- |")
print("| " + str(score[0]) + " | " + str(score[1]) + " | " +
      str(score[2]) + " | " + str(score[3]) + " |")

browser_notify("Neural Net Training Complete")

# Subset 1 switches on feature indices 3-10, 14 and 15 of vectorize_tweets():
# every group that is not a get_word2vec_embedding call.
sub1_feature_string = "0001111111100011"
x_train_subset_1 = vectorize_tweets(tweet_train, sub1_feature_string, train_vector_dict)
x_test_subset_1 = vectorize_tweets(tweet_test, sub1_feature_string, test_vector_dict)

# print(x_train_subset_1[0])

# Subset 2 is the complement: indices 0-2 and 11-13, the six
# get_word2vec_embedding groups.
sub2_feature_string = "1110000000011100"
x_train_subset_2 = vectorize_tweets(tweet_train, sub2_feature_string, train_vector_dict)
x_test_subset_2 = vectorize_tweets(tweet_test, sub2_feature_string, test_vector_dict)

def lex_model():
    # Builds a Sequential network from a 1000-unit tanh layer and a 500-unit
    # sigmoid layer, both with 'random_uniform' initial weights, and returns it.
    # NOTE(review): both Dense(...) calls below stop after a trailing comma --
    # their remaining arguments and closing parentheses are missing, so this
    # function does not parse as it stands. Restore the lost continuation
    # lines (presumably including the input dimension of the first layer).
    # create model
    model = Sequential()
    model.add(Dense(1000, activation='tanh', kernel_initializer='random_uniform', 
    model.add(Dense(500, activation='sigmoid', kernel_initializer='random_uniform', 
    return model

def embed_model():
    # Builds a Sequential network from a 10000-unit and a 500-unit ReLU layer,
    # both with 'random_uniform' initial weights, and returns it.
    # NOTE(review): both Dense(...) calls below stop after a trailing comma --
    # their remaining arguments and closing parentheses are missing, so this
    # function does not parse as it stands. Restore the lost continuation
    # lines (presumably including the input dimension of the first layer).
    # create model    
    model = Sequential()
    model.add(Dense(10000, activation='relu', kernel_initializer='random_uniform', 
    model.add(Dense(500, activation='relu', kernel_initializer='random_uniform', 
    return model

x_train_subset_1 = np.array(x_train_subset_1)
x_test_subset_1 = np.array(x_test_subset_1)

# Build the lexicon network once and use that same instance for both splits.
# Calling lex_model() separately for train and test would push the two splits
# through two independently initialised networks, so their outputs would not
# live in the same feature space.
lex_net = lex_model()
lex_embeddings_train = lex_net.predict(x_train_subset_1)
lex_embeddings_test = lex_net.predict(x_test_subset_1)

x_train_subset_2 = np.array(x_train_subset_2)
x_test_subset_2 = np.array(x_test_subset_2)

# The embedding network gets the word-embedding features (subset 2).
# NOTE(review): the original passed subset 1 here although subset 2 is
# prepared directly above and used nowhere else -- confirm that subset 2 is
# the intended input of embed_model().
embed_net = embed_model()
word_embeddings_train = embed_net.predict(x_train_subset_2)
word_embeddings_test = embed_net.predict(x_test_subset_2)

print(lex_embeddings_train.shape, word_embeddings_train.shape)

# Final feature row per tweet: lexicon-network output followed by
# embedding-network output.
x_train_final = [
    np.append(lex_row, word_row)
    for lex_row, word_row in zip(lex_embeddings_train, word_embeddings_train)
]
x_test_final = [
    np.append(lex_row, word_row)
    for lex_row, word_row in zip(lex_embeddings_test, word_embeddings_test)
]


# Display name of each feature group, keyed by its position in the feature
# bit-string.
# NOTE(review): the braces of this literal were missing in the damaged source
# and only indices 0-10 survive, while vectorize_tweets() reads 16 positions --
# add the names for indices 11-15 once they are known.
feature_index_mapping = {
    0: "Word2Vec [Google News]",
    1: "Word2Vec [Twitter]",
    2: "GloVe-Twitter",
    3: "NRC-AffectIntensity",
    4: "Wordnet-Affect",
    5: "NRC-Emotion-Lexicon",
    6: "NRC-Emoticon-Lexicon",
    7: "NRC-Emoticon-AffLexNegLex",
    8: "NRC-Hashtag-Emotion",
    9: "NRC-Hashtag-Sentiment-Lexicon",
    10: "NRC-Hashtag-Sentiment-AffLexNegLex",
}


def get_features_from_identifier(bin_string):
    """Return the names of the feature groups switched on in ``bin_string``.

    NOTE(review): the body of the ``if`` was missing in the damaged source;
    appending the mapped name is the reconstruction -- confirm.

    Args:
        bin_string: feature bit-string; position ``i`` is on when its
            character has a non-zero integer value.

    Returns:
        List of names from ``feature_index_mapping``, in index order.

    Raises:
        KeyError: if a switched-on position has no entry in
            ``feature_index_mapping``.
        ValueError: if a character is not an integer literal.
    """
    return [
        feature_index_mapping[index]
        for index, flag in enumerate(bin_string)
        if int(flag)
    ]

