In [1]:
from sklearn.datasets import fetch_20newsgroups
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
import numpy as np
download('punkt')      # tokenizer models used by word_tokenize
download('stopwords')  # English stop word list
stop_words = stopwords.words('english')
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]
    return doc
# Fetch the 20 Newsgroups dataset
# ng20 = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
ng20 = fetch_20newsgroups()  # nothing removed
texts, y = ng20.data, ng20.target
corpus = [preprocess(text) for text in texts]
# print (corpus[1])
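# Quick sanity check (illustrative addition, not part of the original pipeline):
# lowercasing, stop word removal and the isalpha filter leave only content words.
print(preprocess("The quick brown fox jumps over the lazy dog."))
# -> ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']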
In [2]:
# Centroid of the word vectors (Cosine Similarity)
from gensim.models.keyedvectors import KeyedVectors
word2vec_model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
word2vec_model.init_sims(replace=True)  # L2-normalize the vectors in place to save memory
print("finish loading word2vec")
In [3]:
# Doc filter vocab
def filter_docs(corpus, texts, labels, condition_on_doc):
    """
    Filter corpus, texts and labels given the function condition_on_doc,
    which takes a doc. The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)
    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus)
                 if condition_on_doc(doc)]
    labels = [label for (label, doc) in zip(labels, corpus) if condition_on_doc(doc)]
    corpus = [doc for doc in corpus if condition_on_doc(doc)]
    # Note: vocabulary-based filtering is applied through condition_on_doc (see below),
    # so corpus, texts and labels always stay aligned.
    print("{} total docs".format(number_of_docs))
    print("{} docs removed".format(number_of_docs - len(corpus)))
    return (corpus, texts, labels)
corpus, texts, y = filter_docs(corpus, texts, y, lambda doc: (len(doc) != 0))
In [4]:
snippets = []
snippets_labels = []
snippets_file = "data/data-web-snippets/train.txt"
with open(snippets_file, 'r') as f:
    for line in f:
        # each line is a snippet: a bag of words separated by spaces and
        # the category
        line = line.split()
        category = line[-1]
        doc = line[:-1]
        snippets.append(doc)
        snippets_labels.append(category)
snippets, _, snippets_labels = filter_docs(snippets, None, snippets_labels, lambda doc: (len(doc) != 0))
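# Quick peek at the parsed data (illustrative): first snippet tokens and its label.
print(snippets[0][:10], snippets_labels[0])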
In [5]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)
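# Minimal sketch of the centroid idea (assumes both words are in the vocabulary):
# a document vector is the element-wise mean of its word vectors, so it lives in
# the same 300-dimensional space as the words themselves.
v = document_vector(word2vec_model, ['computer', 'keyboard'])
print(v.shape)  # (300,)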
In [6]:
def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec vocabulary"""
    return any(word in word2vec_model.vocab for word in doc)
In [7]:
# Clean data with respect to Word2Vec model vocab.
corpus, texts, y = filter_docs(corpus, texts, y, lambda doc: has_vector_representation(word2vec_model, doc))
snippets, _, snippets_labels = filter_docs(snippets, None, snippets_labels, lambda doc: has_vector_representation(word2vec_model, doc))
In [8]:
# LSI
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity
sims = {'ng20': {}, 'snippets': {}}  # one similarity matrix per dataset and method
In [9]:
# NG20 - LSI
dictionary = corpora.Dictionary(corpus)
corpus_gensim = [dictionary.doc2bow(doc) for doc in corpus]
tfidf = TfidfModel(corpus_gensim)
corpus_tfidf = tfidf[corpus_gensim]
lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
sims['ng20']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]] for i in range(len(corpus))])
print('finished calculating LSI')
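# Optional inspection (sketch): each document is now represented by up to 200
# topic weights; show_topics lists the strongest terms per latent dimension.
print(lsi[corpus_tfidf[0]][:5])       # first five (topic_id, weight) pairs for doc 0
print(lsi.show_topics(num_topics=3))  # top terms of the first three topics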
In [10]:
# Snippets - LSI
dictionary_snippets = corpora.Dictionary(snippets)
corpus_gensim_snippets = [dictionary_snippets.doc2bow(doc) for doc in snippets]
tfidf_snippets = TfidfModel(corpus_gensim_snippets)
corpus_tfidf_snippets = tfidf_snippets[corpus_gensim_snippets]
lsi_snippets = LsiModel(corpus_tfidf_snippets, id2word=dictionary_snippets, num_topics=200)
lsi_index_snippets = MatrixSimilarity(lsi_snippets[corpus_tfidf_snippets])
sims['snippets']['LSI'] = np.array([lsi_index_snippets[lsi_snippets[corpus_tfidf_snippets[i]]] for i in range(len(snippets))])
print('finished calculating LSI for snippets')
In [11]:
# Centroid of the word vectors (Cosine Similarity)
from sklearn.metrics.pairwise import cosine_similarity
# ng20 centroid matrix
sims['ng20']['centroid'] = cosine_similarity(np.array([document_vector(word2vec_model, doc) for doc in corpus]))
sims['snippets']['centroid'] = cosine_similarity(np.array([document_vector(word2vec_model, doc) for doc in snippets]))
print('finished calculating cosine similarities')
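# Each matrix in sims is square (documents x documents) with values in [-1, 1];
# for example (sketch), the centroid similarity between the first two ng20 docs:
print(sims['ng20']['centroid'][0, 1])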
In [12]:
def most_similar(i, X_sims, topn=None):
    """return the indices of the topn most similar documents to document i,
    given the similarity matrix X_sims"""
    r = np.argsort(X_sims[i])[::-1]
    if topn is None:
        return r
    else:
        return r[:topn]
#LSI
print(most_similar(0, sims['ng20']['LSI'], 20))
print(most_similar(0, sims['snippets']['LSI'], 20))
#Centroid
print(most_similar(0, sims['ng20']['centroid'], 20))
print(most_similar(0, sims['snippets']['centroid'], 20))
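# Optional follow-up (sketch, relies on y staying aligned with corpus after filtering):
# map the most similar indices back to newsgroup names to eyeball the results.
top = most_similar(0, sims['ng20']['centroid'], 5)
print([ng20.target_names[y[i]] for i in top])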
In [13]:
# WMD
from gensim.similarities import WmdSimilarity
wmd_similarity_top20 = WmdSimilarity(corpus, word2vec_model, num_best=20)
most_similars_wmd_ng20_top20 = wmd_similarity_top20[corpus[0]]
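# With num_best set, the query returns (document index, similarity) pairs,
# most similar first; e.g. (sketch):
for idx, score in most_similars_wmd_ng20_top20[:5]:
    print(idx, round(score, 3))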
In [14]:
wmd_similarity_snippets = WmdSimilarity(snippets, word2vec_model, num_best=20)
most_similars_snippets = wmd_similarity_snippets[snippets[0]]
In [15]:
most_similars_snippets
Out[15]: