In [ ]:
import spacy
from nltk.corpus import wordnet as wn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from random import shuffle

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

sns.set_style("darkgrid")

In [ ]:
# Load the large English model for its 300-d word vectors only; the pipeline
# components are disabled since we never parse/tag/NER — just vector lookup.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner'])

In [ ]:
# Collect every purely-alphabetic lemma name from the WordNet verb, noun
# and adjective synsets (multi-word / hyphenated lemmas are excluded by
# the isalpha() filter).
all_words = set()
for pos in ('v', 'n', 'a'):
    for synset in wn.all_synsets(pos):
        all_words.update(name for name in synset.lemma_names() if name.isalpha())

In [ ]:
# Map each WordNet word to its spaCy vector, keeping only words the model
# actually has a vector for (out-of-vocabulary words are dropped).
word_vectors = {}
for word in all_words:
    token = nlp(word.lower())[0]
    if token.has_vector:
        word_vectors[word] = token.vector

In [ ]:
# One row per word with a vector.  The dimensionality is derived from the
# stored vectors instead of hardcoding 300 (the en_core_web_lg size), so
# the notebook keeps working if a model with a different vector size is
# loaded; the 300 fallback preserves the original behavior for an empty dict.
vector_dim = next(iter(word_vectors.values())).shape[0] if word_vectors else 300
wordvec_matrix = np.zeros((len(word_vectors), vector_dim))

In [ ]:
# Assign each word a row of the matrix in random order, recording the
# word -> row mapping in word_index.
# NOTE(review): shuffle() is unseeded, so row order differs across runs —
# confirm whether reproducibility matters here.
word_index = {}
shuffled_pairs = list(word_vectors.items())
shuffle(shuffled_pairs)
for row, (word, vec) in enumerate(shuffled_pairs):
    word_index[word] = row
    wordvec_matrix[row, :] = vec

In [ ]:
# L2-normalize each row so that dot products between rows are cosine
# similarities.  NOTE(review): assumes no stored vector has zero norm
# (the has_vector filter above should guarantee this) — a zero row would
# divide by zero and produce NaNs.
normalized_wordvec_matrix = wordvec_matrix / np.linalg.norm(wordvec_matrix, axis=1, keepdims=True)

In [ ]:
# Pairwise cosine similarities among a sample of words; since row order was
# shuffled above, the first SAMPLE_SIZE rows are a random sample.
SAMPLE_SIZE = 1000
sample = normalized_wordvec_matrix[:SAMPLE_SIZE, :]
pairwise = np.dot(sample, sample.T)
# Take the strict lower triangle directly.  The previous
# np.tril(...)/np.nonzero(...) approach also silently discarded any word
# pair whose similarity happened to be exactly 0.
rows, cols = np.tril_indices(pairwise.shape[0], k=-1)
word_cosine_sim = pairwise[rows, cols]

In [ ]:
# Baseline distribution: cosine similarity of randomly chosen word pairs.
# `normed` was removed from plt.hist in Matplotlib 3.1; `density=True` is
# the drop-in replacement.  plt.legend() was also missing, so the 'random'
# label never rendered.
plt.hist(word_cosine_sim, bins=100, density=True, alpha=0.5, label='random')
plt.title("dist. of cosine similarity of randomly chosen words")
plt.legend()
plt.show()

In [ ]:
# Synonym-pair dataset; from usage below it has columns word1, word2,
# synonym (0/1 label) and split ('train'/'test').  Path is relative to the
# notebook's directory.
data = pd.read_csv('../datasets/synonym_dataset.csv.gz')

In [ ]:
def pair_has_wordvec(row):
    """Return True iff both words of the pair have a stored word vector.

    `row` is a DataFrame row with 'word1' and 'word2' entries; membership
    is checked against the module-level `word_vectors` dict.
    """
    return all(row[col] in word_vectors for col in ('word1', 'word2'))

In [ ]:
# Flag rows where both words have a vector; axis=1 applies per row.
data['has_word_vec'] = data.apply(pair_has_wordvec, axis=1)

In [ ]:
# Keep only pairs where both words have vectors.  .copy() makes this an
# independent frame rather than a view, so the later column assignment
# (data['cosine'] = ...) doesn't raise pandas' SettingWithCopyWarning.
data = data.loc[data.has_word_vec].copy()

In [ ]:
def cosine_similarity(row, vectors=None):
    """Cosine similarity between the vectors of row['word1'] and row['word2'].

    Parameters
    ----------
    row : mapping with 'word1' and 'word2' keys (e.g. a DataFrame row).
    vectors : optional mapping word -> vector; defaults to the module-level
        `word_vectors` dict built earlier in the notebook.

    Returns
    -------
    float in [-1, 1].
    """
    if vectors is None:
        vectors = word_vectors
    v1 = np.asarray(vectors[row['word1']])
    v2 = np.asarray(vectors[row['word2']])
    # Bug fix: the denominator previously used norm(v1) twice instead of
    # norm(v1) * norm(v2), so results were not true cosine similarities
    # whenever the two vectors had different magnitudes.
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [ ]:
# Per-row cosine similarity between the two words' vectors.
data['cosine'] = data.apply(cosine_similarity, axis=1)

In [ ]:
# Compare cosine-similarity distributions for synonym pairs, non-synonym
# pairs and random word pairs, per data split.  Left panel: raw counts
# (group sizes differ); right panel: density-normalized so the three
# distributions are directly comparable.
for lbl in ['train', 'test']:
    df = data.loc[data.split == lbl]
    plt.figure(figsize=(16, 8))
    # The two panels were copy-pasted code differing only in normalization;
    # loop over the (subplot, density) pairs instead.  `normed` was removed
    # from plt.hist in Matplotlib 3.1; `density` is the replacement.
    for subplot_pos, density in ((121, False), (122, True)):
        plt.subplot(subplot_pos)
        plt.hist(df.loc[df.synonym == 1, 'cosine'], bins=100,
                 color='steelblue', density=density, label='synonym')
        plt.hist(df.loc[df.synonym == 0, 'cosine'], bins=100,
                 color='seagreen', density=density, alpha=0.5,
                 label='not synonym')
        plt.hist(word_cosine_sim, bins=100, color='red', density=density, alpha=0.5, label='random')
        if density:
            plt.title("normalized distribution of cosine similarity of word vectors({} set)".format(lbl), fontsize=14)
        else:
            plt.title("count dist of cosine similarity of word vectors ({} set)".format(lbl), fontsize=14)
        plt.legend()
        plt.xlabel("cosine similarity")
    plt.show()

In [ ]:
# Work on a copy of the training split so columns added during the
# threshold sweep below don't write back into `data`.
df = data.loc[data.split=='train'].copy()

In [ ]:
# Threshold-free ranking quality of raw cosine similarity as a synonym score.
print(roc_auc_score(df['synonym'], df['cosine']))

In [ ]:
# Sweep classification thresholds over cosine similarity and record
# accuracy / precision / recall / F1 at each threshold.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
span_space = np.linspace(0.05, 0.95, 100)
for threshold in span_space:
    # Compute predictions as a Series instead of repeatedly mutating df in
    # place: the old version wrote a 'pred_syn' column into df on every
    # iteration, leaving stale state behind and making the cell
    # non-idempotent across re-runs.
    preds = (df['cosine'] > threshold).astype(int)
    accuracy_scores.append(accuracy_score(df['synonym'], preds))
    precision_scores.append(precision_score(df['synonym'], preds))
    recall_scores.append(recall_score(df['synonym'], preds))
    f1_scores.append(f1_score(df['synonym'], preds))

In [ ]:
# Accuracy as a function of the decision threshold.  seaborn >= 0.12 no
# longer accepts positional data arguments; pass x/y as keywords.
sns.regplot(x=span_space, y=np.array(accuracy_scores), fit_reg=False)
plt.xlabel("classification threshold")
plt.ylabel("accuracy")
plt.show()

In [ ]:
# Precision-recall trade-off across the swept thresholds.  seaborn >= 0.12
# no longer accepts positional data arguments; pass x/y as keywords.
sns.regplot(x=np.array(recall_scores), y=np.array(precision_scores), fit_reg=False)
plt.xlabel("recall")
plt.ylabel("precision")
plt.show()

In [ ]:
# F1 as a function of the decision threshold.  seaborn >= 0.12 no longer
# accepts positional data arguments; pass x/y as keywords.
sns.regplot(x=np.array(span_space), y=np.array(f1_scores), fit_reg=False)
plt.xlabel("threshold")
plt.ylabel("f1")
plt.show()

In [ ]:
# Select the threshold that maximizes F1 on the training split.
best_index = np.argmax(f1_scores)
best_f1_threshold = span_space[best_index]
best_f1_val = np.max(f1_scores)
print("best threshold: {} best f1 score: {} ".format(best_f1_threshold, best_f1_val))

In [ ]:
# Apply the training-derived threshold to the held-out test split and
# report F1 there.
df_test = data.loc[data.split=='test'].copy()
df_test['pred_syn'] = (df_test['cosine'] > best_f1_threshold).astype(int)
print("f1 score in test set using best threshold: {}".format(f1_score(df_test.synonym, df_test.pred_syn)))