In [ ]:
import time
import json
import pickle
import re
import html
import sys
import gc
import gensim
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBRegressor
import pandas as pd
from pandas import DataFrame
import scipy.stats as st
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Embedding, Input
from keras import optimizers
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adagrad
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.recurrent import LSTM
from nltk import word_tokenize
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from IPython.display import display, HTML
In [ ]:
def browser_alert(message):
display(HTML('<script type="text/javascript">alert("' + message + '");</script>'))
def browser_notify(message):
display(HTML('<script type="text/javascript">var notification=new Notification("Jupyter Notification",{icon:"http://blog.jupyter.org/content/images/2015/02/jupyter-sq-text.png",body:"' + message + '"});</script>'))
In [ ]:
word_vector_path = "/home/v2john/"
wassa_home = "/home/v2john/WASSA-Task/"
In [ ]:
def evaluate(pred,gold):
f=open(pred, "rb")
pred_lines=f.readlines()
f.close()
f=open(gold, "rb")
gold_lines=f.readlines()
f.close()
if(len(pred_lines)==len(gold_lines)):
# align tweets ids with gold scores and predictions
data_dic={}
for line in gold_lines:
line = line.decode()
parts=line.split('\t')
if len(parts)==4:
data_dic[int(parts[0])]=[float(line.split('\t')[3])]
else:
raise ValueError('Format problem.')
for line in pred_lines:
line = line.decode()
parts=line.split('\t')
if len(parts)==4:
if int(parts[0]) in data_dic:
try:
data_dic[int(parts[0])].append(float(line.split('\t')[3]))
except ValueError:
# Invalid predictions are replaced by a default value
data_dic[int(parts[0])].append(0.5)
else:
raise ValueError('Invalid tweet id.')
else:
raise ValueError('Format problem.')
# lists storing gold and prediction scores
gold_scores=[]
pred_scores=[]
# lists storing gold and prediction scores where gold score >= 0.5
gold_scores_range_05_1=[]
pred_scores_range_05_1=[]
for id in data_dic:
if(len(data_dic[id])==2):
gold_scores.append(data_dic[id][0])
pred_scores.append(data_dic[id][1])
if(data_dic[id][0]>=0.5):
gold_scores_range_05_1.append(data_dic[id][0])
pred_scores_range_05_1.append(data_dic[id][1])
else:
raise ValueError('Repeated id in test data.')
# return zero correlation if predictions are constant
if np.std(pred_scores)==0 or np.std(gold_scores)==0:
return (0,0,0,0)
pears_corr=st.pearsonr(pred_scores,gold_scores)[0]
spear_corr=st.spearmanr(pred_scores,gold_scores)[0]
pears_corr_range_05_1=st.pearsonr(pred_scores_range_05_1,gold_scores_range_05_1)[0]
spear_corr_range_05_1=st.spearmanr(pred_scores_range_05_1,gold_scores_range_05_1)[0]
return (pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1)
else:
raise ValueError('Predictions and gold data have different number of lines.')
def evaluate_lists(pred, gold):
if len(pred) == len(gold):
gold_scores=gold
pred_scores=pred
# lists storing gold and prediction scores where gold score >= 0.5
gold_scores_range_05_1=[]
pred_scores_range_05_1=[]
for i in range(len(gold_scores)):
if(gold_scores[i]>=0.5):
gold_scores_range_05_1.append(gold_scores[i])
pred_scores_range_05_1.append(pred_scores[i])
# return zero correlation if predictions are constant
if np.std(pred_scores)==0 or np.std(gold_scores)==0:
return (0,0,0,0)
pears_corr=st.pearsonr(pred_scores,gold_scores)[0]
spear_corr=st.spearmanr(pred_scores,gold_scores)[0]
pears_corr_range_05_1=st.pearsonr(pred_scores_range_05_1,gold_scores_range_05_1)[0]
spear_corr_range_05_1=st.spearmanr(pred_scores_range_05_1,gold_scores_range_05_1)[0]
return np.array([pears_corr,spear_corr,pears_corr_range_05_1,spear_corr_range_05_1])
else:
raise ValueError('Predictions and gold data have different number of lines.')
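In [ ]:
# Minimal sanity check for evaluate_lists with made-up scores (illustrative values only, not
# from the dataset). It returns [pearson, spearman, pearson_gold>=0.5, spearman_gold>=0.5].
evaluate_lists([0.1, 0.4, 0.8, 0.9], [0.2, 0.3, 0.7, 0.95])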
In [ ]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    model = {}
    # use a context manager so the file is closed, and enumerate so parse failures
    # report the actual line number
    with open(gloveFile, 'r', encoding='utf8') as f:
        for num, line in enumerate(f, start=1):
            try:
                splitLine = line.split()
                word = splitLine[0]
                embedding = [float(val) for val in splitLine[1:]]
                model[word] = np.array(embedding)
            except Exception:
                print("Failed at line " + str(num))
    print("Done.", len(model), " words loaded!")
    return model
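In [ ]:
# Each line of a GloVe text file is "<word> <v1> <v2> ... <vN>", so loadGloveModel maps every
# word to an np.array holding its N-dimensional vector. Illustrative check once a model is
# loaded in the cells below:
# wv_model_2["happy"].shape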
In [ ]:
# Google news pretrained vectors
wv_model_path = word_vector_path + "GoogleNews-vectors-negative300.bin.gz"
wv_model = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path, binary=True, unicode_errors='ignore')
In [ ]:
# Twitter pretrained vectors
wv_model_path_1 = word_vector_path + "word2vec_twitter_model.bin"
wv_model_1 = gensim.models.KeyedVectors.load_word2vec_format(wv_model_path_1, binary=True, unicode_errors='ignore')
In [ ]:
wv_model_path_2 = word_vector_path + "glove.twitter.27B.200d.txt"
wv_model_2 = loadGloveModel(wv_model_path_2)
In [ ]:
wv_model_path_3 = word_vector_path + "glove.6B.300d.txt"
wv_model_3 = loadGloveModel(wv_model_path_3)
In [ ]:
wv_model_path_4 = word_vector_path + "glove.42B.300d.txt"
wv_model_4 = loadGloveModel(wv_model_path_4)
In [ ]:
wv_model_path_5 = word_vector_path + "glove.840B.300d.txt"
wv_model_5 = loadGloveModel(wv_model_path_5)
In [ ]:
w2v_dimensions = len(wv_model['word'])
w2v_dimensions_1 = len(wv_model_1['word'])
w2v_dimensions_2 = len(wv_model_2['word'])
w2v_dimensions_3 = len(wv_model_3['word'])
w2v_dimensions_4 = len(wv_model_4['word'])
w2v_dimensions_5 = len(wv_model_5['word'])
print(w2v_dimensions, w2v_dimensions_1,
w2v_dimensions_2, w2v_dimensions_3, w2v_dimensions_4,
w2v_dimensions_5)
In [ ]:
def get_word2vec_embedding(word, model, dimensions):
vec_rep = np.zeros(dimensions)
if word in model:
vec_rep = model[word]
return vec_rep
In [ ]:
# get_word2vec_embedding("charm", wv_model_2, 200)
In [ ]:
# cache the stopword list once as a set; clean_str is applied to every lexicon entry and tweet,
# so re-reading the NLTK corpus on each call would be very slow
english_stopwords = set(stopwords.words('english'))
def remove_stopwords(string):
    split_string = \
        [word for word in string.split()
         if word not in english_stopwords]
    return " ".join(split_string)
In [ ]:
def clean_str(string):
string = html.unescape(string)
string = string.replace("\\n", " ")
string = string.replace("_NEG", "")
string = string.replace("_NEGFIRST", "")
string = re.sub(r"@[A-Za-z0-9_s(),!?\'\`]+", "", string) # removing any twitter handle mentions
# string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
# string = re.sub(r"#", "", string)
string = re.sub(r"\*", "", string)
string = re.sub(r"\'s", "", string)
string = re.sub(r"\'m", " am", string)
string = re.sub(r"\'ve", " have", string)
string = re.sub(r"n\'t", " not", string)
string = re.sub(r"\'re", " are", string)
string = re.sub(r"\'d", " would", string)
string = re.sub(r"\'ll", " will", string)
string = re.sub(r",", "", string)
string = re.sub(r"!", " !", string)
string = re.sub(r"\(", "", string)
string = re.sub(r"\)", "", string)
string = re.sub(r"\?", " ?", string)
string = re.sub(r"\s{2,}", " ", string)
return remove_stopwords(string.strip().lower())
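In [ ]:
# Illustrative run of the cleaning pipeline on a made-up tweet (not from the dataset):
# HTML entities are unescaped, handle mentions are dropped, a few contractions are expanded,
# the text is lowercased, and stopwords are removed.
clean_str("@friend I can't believe how furious this makes me!!")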
In [ ]:
class Tweet(object):
    def __init__(self, id, text, emotion, intensity):
        self.id = id
        self.text = text
        self.emotion = emotion
        self.intensity = intensity
    def __repr__(self):
        # intensity is a float (or None for test tweets), so cast to str before concatenating
        return \
            "id: " + str(self.id) + \
            ", text: " + self.text + \
            ", emotion: " + self.emotion + \
            ", intensity: " + str(self.intensity)
In [ ]:
def read_training_data(training_data_file_path):
train_list = list()
with open(training_data_file_path) as input_file:
for line in input_file:
line = line.strip()
array = line.split('\t')
train_list.append(Tweet(array[0], clean_str(array[1]), array[2], float(array[3])))
return train_list
def read_training_data_verbatim(training_data_file_path):
train_list = list()
with open(training_data_file_path) as input_file:
for line in input_file:
line = line.strip()
array = line.split('\t')
train_list.append(Tweet(array[0], array[1], array[2], float(array[3])))
return train_list
def read_test_data(training_data_file_path):
test_list = list()
with open(training_data_file_path) as input_file:
for line in input_file:
line = line.strip()
array = line.split('\t')
test_list.append(Tweet(array[0], clean_str(array[1]), array[2], None))
return test_list
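In [ ]:
# Each dataset line is expected to be tab-separated as: id, tweet text, emotion, intensity.
# A quick illustration with a made-up record (not from the corpus):
sample_fields = "10001\tJust lost my keys again #furious\tanger\t0.833".split('\t')
Tweet(sample_fields[0], clean_str(sample_fields[1]), sample_fields[2], float(sample_fields[3]))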
In [ ]:
emotion = "anger"
In [ ]:
training_data_file_path = \
wassa_home + "dataset/" + \
emotion + "-ratings-0to1.train.txt"
predictions_file_path = \
wassa_home + "predictions/" + \
emotion + "-pred.txt"
dev_set_path = \
wassa_home + "dataset/dev-set/" + \
emotion + "-ratings-0to1.dev.gold.txt"
test_data_file_path = \
wassa_home + "dataset/test-set/" + \
emotion + "-ratings-0to1.test.gold.txt"
debug_file_path = \
wassa_home + "dataset/test-set/debug/" + \
emotion + ".tsv"
In [ ]:
with open(wassa_home + 'lexicons/emoji_map.json') as emoji_file:
emoji_list = json.load(emoji_file)
emoji_dict = dict()
for emoji in emoji_list:
emoji_dict[emoji["emoji"]] = (emoji["name"], emoji["polarity"])
In [ ]:
def get_emoji_intensity(word):
score = 0.0
if word in emoji_dict.keys():
score = float(emoji_dict[word][1])
vec_rep = np.array([score])
return vec_rep
In [ ]:
# get_emoji_intensity("💯")
In [ ]:
affect_intensity_file_path = \
wassa_home + \
"lexicons/NRC-AffectIntensity-Lexicon.txt"
def get_word_affect_intensity_dict(emotion):
word_intensities = dict()
with open(affect_intensity_file_path) as affect_intensity_file:
for line in affect_intensity_file:
word_int_array = line.replace("\n", "").split("\t")
if (word_int_array[2] == emotion):
word_intensities[word_int_array[0]] = float(word_int_array[1])
return word_intensities
In [ ]:
word_intensities = get_word_affect_intensity_dict(emotion)
In [ ]:
def get_emo_int_vector(word):
score = 0.0
if word in word_intensities.keys():
score = float(word_intensities[word])
vec_rep = np.array([score])
return vec_rep
In [ ]:
# get_emo_int_vector('fury')
In [ ]:
def get_sentiwordnetscore(word):
vec_rep = np.zeros(2)
synsetlist = list(swn.senti_synsets(word))
if synsetlist:
vec_rep[0] = synsetlist[0].pos_score()
vec_rep[1] = synsetlist[0].neg_score()
return vec_rep
In [ ]:
# get_sentiwordnetscore("fury")
In [ ]:
sentiment_emotion_lex_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emotion-Lexicon-v0.92/" + \
"NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
def get_affect_presence_list(emotion):
word_list = list()
with open(sentiment_emotion_lex_file_path) as sentiment_emotion_lex_file:
for line in sentiment_emotion_lex_file:
word_array = line.replace("\n", "").split("\t")
if (word_array[1] == emotion and word_array[2] == '1'):
word_list.append(word_array[0])
return word_list
In [ ]:
sentiment_emotion_lex_word_list = get_affect_presence_list(emotion)
In [ ]:
def get_sentiment_emotion_feature(word):
score = 0.0
if word in sentiment_emotion_lex_word_list:
score = 1.0
vec_rep = np.array([score])
return vec_rep
In [ ]:
# get_sentiment_emotion_feature("fury")
In [ ]:
hashtag_emotion_lex_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2/" + \
"NRC-Hashtag-Emotion-Lexicon-v0.2.txt"
def get_hashtag_emotion_intensity(emotion):
hastag_intensities = dict()
with open(hashtag_emotion_lex_file_path) as hashtag_emotion_lex_file:
for line in hashtag_emotion_lex_file:
word_array = line.replace("\n", "").split("\t")
if (word_array[0] == emotion):
hastag_intensities[word_array[1]] = float(word_array[2])
return hastag_intensities
In [ ]:
hashtag_emotion_intensities = get_hashtag_emotion_intensity(emotion)
In [ ]:
def get_hashtag_emotion_vector(word):
score = 0.0
if word in hashtag_emotion_intensities.keys():
score = float(hashtag_emotion_intensities[word])
vec_rep = np.array([score])
return vec_rep
In [ ]:
# get_hashtag_emotion_vector("#fury")
In [ ]:
emoticon_lexicon_unigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-unigrams.txt"
emoticon_lexicon_bigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-Lexicon-v1.0/Emoticon-bigrams.txt"
emoticon_lexicon_unigrams = dict()
emoticon_lexicon_bigrams = dict()
def get_emoticon_lexicon_unigram_dict():
with open(emoticon_lexicon_unigrams_file_path) as emoticon_lexicon_file:
for line in emoticon_lexicon_file:
word_array = line.replace("\n", "").split("\t")
emoticon_lexicon_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return emoticon_lexicon_unigrams
def get_emoticon_lexicon_bigram_dict():
with open(emoticon_lexicon_bigrams_file_path) as emoticon_lexicon_file:
for line in emoticon_lexicon_file:
word_array = line.replace("\n", "").split("\t")
emoticon_lexicon_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return emoticon_lexicon_bigrams
In [ ]:
emoticon_lexicon_unigram_dict = get_emoticon_lexicon_unigram_dict()
In [ ]:
emoticon_lexicon_bigram_dict = get_emoticon_lexicon_bigram_dict()
In [ ]:
def get_unigram_sentiment_emoticon_lexicon_vector(word):
vec_rep = np.zeros(3)
if word in emoticon_lexicon_unigram_dict.keys():
vec_rep = emoticon_lexicon_unigram_dict[word]
return vec_rep
def get_bigram_sentiment_emoticon_lexicon_vector(word):
vec_rep = np.zeros(3)
if word in emoticon_lexicon_bigram_dict.keys():
vec_rep = emoticon_lexicon_bigram_dict[word]
return vec_rep
In [ ]:
get_unigram_sentiment_emoticon_lexicon_vector("fury")
In [ ]:
get_bigram_sentiment_emoticon_lexicon_vector("add everyone")
In [ ]:
emoticon_afflex_unigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
"Emoticon-AFFLEX-NEGLEX-unigrams.txt"
emoticon_afflex_bigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Emoticon-AffLexNegLex-v1.0/" + \
"Emoticon-AFFLEX-NEGLEX-bigrams.txt"
emoticon_afflex_unigrams = dict()
emoticon_afflex_bigrams = dict()
def get_emoticon_afflex_unigram_dict():
with open(emoticon_afflex_unigrams_file_path) as emoticon_lexicon_file:
for line in emoticon_lexicon_file:
word_array = line.replace("\n", "").split("\t")
emoticon_afflex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return emoticon_afflex_unigrams
def get_emoticon_afflex_bigram_dict():
with open(emoticon_afflex_bigrams_file_path) as emoticon_lexicon_file:
for line in emoticon_lexicon_file:
word_array = line.replace("\n", "").split("\t")
emoticon_afflex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return emoticon_afflex_bigrams
In [ ]:
emoticon_afflex_unigram_dict = get_emoticon_afflex_unigram_dict()
In [ ]:
emoticon_afflex_bigram_dict = get_emoticon_afflex_bigram_dict()
In [ ]:
def get_unigram_sentiment_emoticon_afflex_vector(word):
vec_rep = np.zeros(3)
if word in emoticon_afflex_unigram_dict.keys():
vec_rep = emoticon_afflex_unigram_dict[word]
return vec_rep
def get_bigram_sentiment_emoticon_afflex_vector(word):
vec_rep = np.zeros(3)
if word in emoticon_afflex_bigram_dict.keys():
vec_rep = emoticon_afflex_bigram_dict[word]
return vec_rep
In [ ]:
# get_unigram_sentiment_emoticon_afflex_vector("fury")
In [ ]:
# get_bigram_sentiment_emoticon_afflex_vector("pay vip")
In [ ]:
hashtag_affneglex_unigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
"HS-AFFLEX-NEGLEX-unigrams.txt"
hashtag_affneglex_bigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-AffLexNegLex-v1.0/" + \
"HS-AFFLEX-NEGLEX-bigrams.txt"
hashtag_affneglex_unigrams = dict()
hashtag_affneglex_bigrams = dict()
def get_hashtag_affneglex_unigram_dict():
with open(hashtag_affneglex_unigrams_file_path) as hashtag_sent_lex_file:
for line in hashtag_sent_lex_file:
word_array = line.replace("\n", "").split("\t")
hashtag_affneglex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return hashtag_affneglex_unigrams
def get_hashtag_affneglex_bigram_dict():
with open(hashtag_affneglex_bigrams_file_path) as hashtag_sent_lex_file:
for line in hashtag_sent_lex_file:
word_array = line.replace("\n", "").split("\t")
hashtag_affneglex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return hashtag_affneglex_bigrams
In [ ]:
hashtag_affneglex_unigram_dict = get_hashtag_affneglex_unigram_dict()
In [ ]:
hashtag_affneglex_bigram_dict = get_hashtag_affneglex_bigram_dict()
In [ ]:
def get_unigram_sentiment_hashtag_affneglex_vector(word):
vec_rep = np.zeros(3)
if word in hashtag_affneglex_unigram_dict.keys():
vec_rep = hashtag_affneglex_unigram_dict[word]
return vec_rep
def get_bigram_sentiment_hashtag_affneglex_vector(word):
vec_rep = np.zeros(3)
if word in hashtag_affneglex_bigram_dict.keys():
vec_rep = hashtag_affneglex_bigram_dict[word]
return vec_rep
In [ ]:
get_unigram_sentiment_hashtag_affneglex_vector("#great")
In [ ]:
get_bigram_sentiment_hashtag_affneglex_vector("#good luck")
In [ ]:
hash_sent_lex_unigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-unigrams.txt"
hash_sent_lex_bigrams_file_path = \
wassa_home + \
"lexicons/NRC-Sentiment-Emotion-Lexicons/Lexicons/NRC-Hashtag-Sentiment-Lexicon-v1.0/HS-bigrams.txt"
def get_hash_sent_lex_unigram_dict():
hash_sent_lex_unigrams = dict()
with open(hash_sent_lex_unigrams_file_path) as hash_sent_lex_file:
for line in hash_sent_lex_file:
word_array = line.replace("\n", "").split("\t")
if clean_str(word_array[0]):
hash_sent_lex_unigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return hash_sent_lex_unigrams
def get_hash_sent_lex_bigram_dict():
hash_sent_lex_bigrams = dict()
with open(hash_sent_lex_bigrams_file_path) as hash_sent_lex_file:
for line in hash_sent_lex_file:
word_array = line.replace("\n", "").split("\t")
if clean_str(word_array[0]):
hash_sent_lex_bigrams[word_array[0]] = np.array([float(val) for val in word_array[1:]])
return hash_sent_lex_bigrams
In [ ]:
hash_sent_lex_unigram_dict = get_hash_sent_lex_unigram_dict()
In [ ]:
hash_sent_lex_bigram_dict = get_hash_sent_lex_bigram_dict()
In [ ]:
def get_unigram_sentiment_hash_sent_lex_vector(word):
vec_rep = np.zeros(3)
if word in hash_sent_lex_unigram_dict.keys():
vec_rep = hash_sent_lex_unigram_dict[word]
return vec_rep
def get_bigram_sentiment_hash_sent_lex_vector(word):
vec_rep = np.zeros(3)
if word in hash_sent_lex_bigram_dict.keys():
vec_rep = hash_sent_lex_bigram_dict[word]
return vec_rep
In [ ]:
# get_unigram_sentiment_hash_sent_lex_vector("#fabulous")
In [ ]:
# get_bigram_sentiment_hash_sent_lex_vector(". #perfection")
In [ ]:
depeche_mood_file_path = \
wassa_home + \
"lexicons/DepecheMood_V1.0/DepecheMood_normfreq.txt"
In [ ]:
def get_depeche_vector_dict():
depeche_vector_dict = dict()
with open(depeche_mood_file_path) as depeche_mood_file:
for line in depeche_mood_file:
word_array = line.replace("\n", "").split("\t")
depeche_vector_dict[word_array[0].split("#")[0]] = np.array([float(val) for val in word_array[1:]])
return depeche_vector_dict
In [ ]:
depeche_vector_dict = get_depeche_vector_dict()
In [ ]:
def get_depeche_mood_vector(word):
vec_rep = np.zeros(8)
if word in depeche_vector_dict.keys():
vec_rep = np.array(depeche_vector_dict[word])
return vec_rep
In [ ]:
# get_depeche_mood_vector("120th")
In [ ]:
def is_active_vector_method(string):
return int(string)
def learn_unigram_word_embedding(word):
word_feature_embedding_dict = dict()
'''Pre-trained Word embeddings'''
index = 0
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model, w2v_dimensions)
index = 1
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_1, w2v_dimensions_1)
index = 2
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_2, w2v_dimensions_2)
'''NRC Emotion Intensity Lexicon'''
index = 3
word_feature_embedding_dict[index] = get_emo_int_vector(word)
'''WordNet'''
index = 4
word_feature_embedding_dict[index] = get_sentiwordnetscore(word)
'''NRC Sentiment Lexica'''
index = 5
word_feature_embedding_dict[index] = get_sentiment_emotion_feature(word)
index = 6
word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_lexicon_vector(word)
index = 7
word_feature_embedding_dict[index] = get_unigram_sentiment_emoticon_afflex_vector(word)
'''NRC Hashtag Lexica'''
index = 8
word_feature_embedding_dict[index] = get_hashtag_emotion_vector(word)
index = 9
word_feature_embedding_dict[index] = get_unigram_sentiment_hash_sent_lex_vector(word)
index = 10
word_feature_embedding_dict[index] = get_unigram_sentiment_hashtag_affneglex_vector(word)
'''Additional pre-trained GloVe models'''
index = 11
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_3, w2v_dimensions_3)
index = 12
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_4, w2v_dimensions_4)
index = 13
word_feature_embedding_dict[index] = get_word2vec_embedding(word, wv_model_5, w2v_dimensions_5)
'''Emoji Polarities'''
index = 14
word_feature_embedding_dict[index] = get_emoji_intensity(word)
'''Depeche Mood'''
index = 15
word_feature_embedding_dict[index] = get_depeche_mood_vector(word)
return word_feature_embedding_dict
def learn_bigram_word_embedding(word):
word_feature_embedding_dict = dict()
'''NRC Sentiment Lexica'''
index = 0
word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_lexicon_vector(word)
index = 1
word_feature_embedding_dict[index] = get_bigram_sentiment_emoticon_afflex_vector(word)
'''NRC Hashtag Lexica'''
index = 2
word_feature_embedding_dict[index] = get_bigram_sentiment_hash_sent_lex_vector(word)
index = 3
word_feature_embedding_dict[index] = get_bigram_sentiment_hashtag_affneglex_vector(word)
return word_feature_embedding_dict
In [ ]:
def get_unigram_embedding(word, word_embedding_dict, bin_string):
word_feature_embedding_dict = word_embedding_dict[word]
final_embedding = np.array([])
for i in range(16):
if is_active_vector_method(bin_string[i]):
final_embedding = np.append(final_embedding, word_feature_embedding_dict[i])
return final_embedding
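In [ ]:
# unigram_feature_string (defined below) is a 16-character on/off mask over the feature blocks
# built in learn_unigram_word_embedding (indices 0-15); get_unigram_embedding concatenates only
# the blocks whose flag is "1". Illustrative check (assumes the lexica and word-vector models
# above are loaded):
# sample_dict = {"fury": learn_unigram_word_embedding("fury")}
# get_unigram_embedding("fury", sample_dict, "0001100000000010").shape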
In [ ]:
unigram_feature_string = "1110000000011100"
bigram_feature_string = "1111"
In [ ]:
training_tweets = read_training_data(training_data_file_path)
dev_tweets = read_training_data(dev_set_path)
score_train = list()
tweet_train = list()
for tweet in training_tweets:
tweet_train.append(tweet.text)
score_train.append(float(tweet.intensity))
for tweet in dev_tweets:
tweet_train.append(tweet.text)
score_train.append(float(tweet.intensity))
print(len(score_train))
score_train = np.asarray(score_train)
In [ ]:
test_tweets = read_training_data(test_data_file_path)
verbatim_test_tweets = read_training_data_verbatim(test_data_file_path)
tweet_test = list()
y_gold = list()
for tweet in test_tweets:
tweet_test.append(tweet.text)
y_gold.append(float(tweet.intensity))
print(len(y_gold))
In [ ]:
def build_word_embeddings(tweets):
    max_tweet_length = -1
    word_embedding_dict = dict()
    for tweet in tweets:
        tokens = word_tokenize(tweet)
        bi_tokens = bigrams(tokens)
        if len(tokens) > max_tweet_length:
            max_tweet_length = len(tokens)
        for token in tokens:
            if token not in word_embedding_dict.keys():
                word_embedding_dict[token] = learn_unigram_word_embedding(token)
        for bi_token in bi_tokens:
            # join the token pair so it matches the space-separated keys of the bigram lexica
            bigram_str = " ".join(bi_token)
            if bigram_str not in word_embedding_dict.keys():
                word_embedding_dict[bigram_str] = learn_bigram_word_embedding(bigram_str)
    return word_embedding_dict, max_tweet_length
In [ ]:
all_tweets = tweet_train + tweet_test
embedding_info = build_word_embeddings(all_tweets)
In [ ]:
browser_notify("Vectorization Done")
In [ ]:
# # Save vectors
# word_embeddings_path = "/home/v2john/word-embeddings.pkl"
# with open(word_embeddings_path, 'wb') as word_embeddings_file:
# pickle.dump(embedding_info, word_embeddings_file)
In [ ]:
# Restore vectors
word_embeddings_path = "/home/v2john/word-embeddings.pkl"
with open(word_embeddings_path, 'rb') as word_embeddings_file:
embedding_info = pickle.load(word_embeddings_file)
embeddings_index = embedding_info[0]
MAX_SEQUENCE_LENGTH = embedding_info[1]
EMBEDDING_DIM = len(get_unigram_embedding("glad", embedding_info[0], unigram_feature_string))
In [ ]:
print(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
In [ ]:
def vectorize_tweets(tweets):
train_vectors = list()
for tweet in tweets:
tokens = word_tokenize(tweet)
train_vector = list()
for token in tokens:
train_vector.append(get_unigram_embedding(token, embedding_info[0], unigram_feature_string))
train_vector = np.asarray(train_vector)
train_vectors.append(train_vector)
return np.asarray(train_vectors)
In [ ]:
x_train = vectorize_tweets(tweet_train)
x_test = vectorize_tweets(tweet_test)
In [ ]:
print(x_train.shape, x_test.shape)
In [ ]:
x_train = sequence.pad_sequences(x_train, maxlen=embedding_info[1], padding="post", truncating="post",
dtype='float64')
x_test = sequence.pad_sequences(x_test, maxlen=embedding_info[1], padding="post", truncating="post",
dtype='float64')
In [ ]:
x_train.shape, x_test.shape, x_train.shape[1:]
In [ ]:
maxlen = 34
batch_size = 32
conv_kernel = 5
In [ ]:
conv_1 = Conv1D(300, conv_kernel, activation='relu', input_shape=x_train.shape[1:])
conv_2 = Conv1D(32, conv_kernel, activation='relu')
# conv_3 = Conv1D(100, conv_kernel, activation='relu')
pool_1 = MaxPooling1D()
pool_2 = MaxPooling1D()
pool_3 = MaxPooling1D()
pool_4 = MaxPooling1D()
flat_1 = Flatten()
dense_1 = Dense(30000, activation='relu')
dense_2 = Dense(1, activation='sigmoid')
drop_1 = Dropout(rate=0.5)
In [ ]:
def get_convnet_model():
model = Sequential()
model.add(conv_1)
model.add(pool_1)
# model.add(conv_2)
# model.add(pool_2)
# model.add(conv_3)
# model.add(pool_3)
# model.add(pool_4)
model.add(flat_1)
model.add(dense_1)
model.add(drop_1)
model.add(dense_2)
model.compile(loss='mean_squared_error', optimizer="sgd")
return model
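In [ ]:
# Rough shape flow of the network above: each tweet enters as a
# (MAX_SEQUENCE_LENGTH, EMBEDDING_DIM) matrix, then Conv1D(300, 5) -> MaxPooling1D -> Flatten
# -> Dense(30000, relu) -> Dropout(0.5) -> Dense(1, sigmoid), trained with mean squared error
# against the 0-1 intensity scores.
# get_convnet_model().summary()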
In [ ]:
ml_model = KerasRegressor(build_fn=get_convnet_model, epochs=100, batch_size=16, verbose=1)
ml_model.fit(x_train, score_train)
y_pred = ml_model.predict(x_test)
In [ ]:
# print(conv_1.input_shape, conv_1.output_shape)
# print(pool_1.input_shape, pool_1.output_shape)
# print(conv_2.input_shape, conv_2.output_shape)
# print(pool_2.input_shape, pool_2.output_shape)
# print(conv_3.input_shape, conv_3.output_shape)
# print(pool_3.input_shape, pool_3.output_shape)
# print(flat_1.input_shape, flat_1.output_shape)
In [ ]:
y_pred = y_pred.reshape((len(y_gold),))  # flatten predictions to one score per test tweet
In [ ]:
y_pred
In [ ]:
evaluate_lists(y_pred, y_gold)
In [ ]:
browser_notify("CNN Trained: " + str(evaluate_lists(y_pred, y_gold)))
In [ ]: