In [ ]:
# external imports
import numpy as np
import pandas as pd
import seaborn as sb
import gensim as gs
import pylab as pl
import matplotlib.pyplot as plt
import csv
import scipy
import sklearn.preprocessing
# internal imports
import helpers as HP
# Constants: PS! put in your own paths to the files
# Paths to the raw GloVe download and the gensim-converted vector files.
# NB: adjust these to match where the files live on your machine.
GLOVE_FOLDER = 'glove.twitter.27B'
GS_FOLDER = 'gensim_glove_twitter_27B/'
# One converted vector file per embedding dimensionality.
GS_25DIM = f"{GS_FOLDER}gensim_glove_25dim.txt"
GS_50DIM = f"{GS_FOLDER}gensim_glove_50dim.txt"
GS_100DIM = f"{GS_FOLDER}gensim_glove_100dim.txt"
GS_200DIM = f"{GS_FOLDER}gensim_glove_200dim.txt"
One can use gensim's word2vec functions to check similarity, among other useful operations: https://radimrehurek.com/gensim/models/word2vec.html
In [ ]:
# spits out a .txt-file with the vectors in gensim format
#HP.create_gensim_wv_from_glove(GLOVE_FOLDER)
# afterwards you can delete the original GloVe files
In [ ]:
# Load the gensim-format .txt file into a word2vec-style KeyedVectors object
# so we can run similarity queries on it.
# Choose the file matching the embedding dimensionality you want (25/50/100/200).
# NOTE(review): this reads the full vector file from disk — slow for 200 dims.
global_vectors = HP.load_gensim_global_vectors(GS_200DIM)
In [ ]:
# Sanity check: list the words whose embeddings are nearest to "racism".
global_vectors.similar_by_word("racism")
In [ ]:
# Classic word2vec analogy demo (king - man + woman ≈ queen) using the
# multiplicative-combination (3CosMul) objective; on this Twitter-trained
# dataset it does not actually return "queen". #blameTwitterDataset
global_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
In [ ]:
# Auto-generate a list of the 12 words most related to "racism" from the
# embedding space; used below to build the "generated" topic vector.
related_words = HP.generate_related_words("racism", global_vectors, topn=12)
print(related_words)
# Hand-picked alternative seed list; used below to build the "custom" topic vector.
custom_related = ['racism', 'bigotry', 'apartheid', 'black', 'nigger', 'race']
In [ ]:
# Build two topic vectors for "racism": one from the auto-generated related
# words and one from the hand-picked custom list. create_topic also returns
# the indices of the dimensions with the lowest spread (std) across the
# seed words, used later for restricted-dimension comparisons.
racism_topic_vector, racism_std_dims = HP.create_topic(related_words, global_vectors)
# Fixed label: this line reports the GENERATED vector, not the custom one.
print("top similarity for generated racismvector:", global_vectors.similar_by_vector(racism_topic_vector))
c_racism_topic_vector, c_racism_std_dims = HP.create_topic(custom_related, global_vectors)
print("top similarity for custom racismvector:", global_vectors.similar_by_vector(c_racism_topic_vector))
In [ ]:
###### GENERATED #####
## Comparing "nigger" and "car" against the generated topic vector,
## standard cosine over ALL dimensions (std_dims=False).
word_vector = global_vectors['nigger']
similarity_score = HP.calculate_topic_similarity(word_vector, racism_topic_vector, global_vectors, std_dims=False)
print("niggers relation to racism", similarity_score)
word_vector = global_vectors['car']
similarity_score = HP.calculate_topic_similarity(word_vector, racism_topic_vector, global_vectors, std_dims=False)
print("car relation to racism", similarity_score)
## Same comparison, but restricted to the 80% most stable (lowest-std)
## dimensions of the generated topic.
word_vector = global_vectors['nigger']
similarity_score = HP.calculate_topic_similarity(word_vector, racism_topic_vector, global_vectors, std_dims=racism_std_dims, perc_dim_to_compare=0.8)
print("\nniggers relation to racism", similarity_score)
word_vector = global_vectors['car']
similarity_score = HP.calculate_topic_similarity(word_vector, racism_topic_vector, global_vectors, std_dims=racism_std_dims, perc_dim_to_compare=0.8)
print("car relation to racism", similarity_score)
In [ ]:
###### CUSTOM #####
## Comparing "nigger" and "car" against the CUSTOM topic vector,
## standard cosine over ALL dimensions (std_dims=False).
word_vector = global_vectors['nigger']
similarity_score = HP.calculate_topic_similarity(word_vector, c_racism_topic_vector, global_vectors, std_dims=False)
print("niggers relation to racism", similarity_score)
word_vector = global_vectors['car']
similarity_score = HP.calculate_topic_similarity(word_vector, c_racism_topic_vector, global_vectors, std_dims=False)
print("car relation to racism", similarity_score)
## Same comparison, restricted to the 40% most stable dimensions.
## Bug fix: these calls previously passed racism_std_dims (the GENERATED
## topic's dims) while comparing against the custom topic vector; they
## must use c_racism_std_dims, which was computed but never used.
word_vector = global_vectors['nigger']
similarity_score = HP.calculate_topic_similarity(word_vector, c_racism_topic_vector, global_vectors, std_dims=c_racism_std_dims, perc_dim_to_compare=0.4)
print("\nniggers relation to racism", similarity_score)
word_vector = global_vectors['car']
similarity_score = HP.calculate_topic_similarity(word_vector, c_racism_topic_vector, global_vectors, std_dims=c_racism_std_dims, perc_dim_to_compare=0.4)
print("car relation to racism", similarity_score)
The results point in the right direction, but is grouping by cosine similarity alone good enough? We should try to assess whether this is a sound method.
In [ ]:
# Bug fix: the helper lives in the helpers module (imported as HP), like every
# other call in this file — the bare name would raise NameError.
HP.analyze_related_words(related_words, global_vectors)
"""
Must add labels manually when adding to blog
"""
In [ ]: