In [1]:
from sklearn.datasets import fetch_20newsgroups

from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
import numpy as np


download('punkt')
download('stopwords') #download 

stop_words = stopwords.words('english')

def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]
    return doc

# Fetch ng20 dataset
# ng20 = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'qoutes'))
ng20 = fetch_20newsgroups() #nothing removed

texts, y = ng20.data, ng20.target

corpus = [preprocess(text) for text in texts]


# print (corpus[1])


[nltk_data] Downloading package punkt to /home/sonic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sonic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

In [2]:
# Centroid of the word vectors (Cosine Similarity)
from gensim.models import Word2Vec

from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
word2vec_model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
word2vec_model.init_sims(replace=True) 
print("finish loading word2vec")


finish loading word2vec

In [3]:
# Doc filter vocab
def filter_docs(corpus, texts, labels, condition_on_doc):
    """
    Filter corpus, texts and labels given the function condition_on_doc which takes
    a doc.
    The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)
    

    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus)
                 if condition_on_doc(doc)]

    labels = [i for (i, doc) in zip(labels, corpus) if condition_on_doc(doc)]
    corpus = [doc for doc in corpus if condition_on_doc(doc)]
    corpus = [doc for doc in corpus if len([word for word in doc if word in word2vec_model.vocab]) != 0]
#     corpus = [doc for doc in corpus if len(doc) != 0]

    print("{} total docs".format(number_of_docs))
    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts, labels)

corpus, texts, y = filter_docs(corpus, texts, y, lambda doc: (len(doc) != 0))


11314 total docs
0 docs removed

In [4]:
snippets = []
snippets_labels = []
snippets_file = "data/data-web-snippets/train.txt"
with open(snippets_file, 'r') as f:
    for line in f:
        # each line is a snippet: a bag of words separated by spaces and
        # the category
        line = line.split()
        category = line[-1]
        doc = line[:-1]
        snippets.append(doc)
        snippets_labels.append(category)

snippets, _, snippets_labels = filter_docs(snippets, None, snippets_labels, lambda doc: (len(doc) != 0))


10060 total docs
4 docs removed

In [5]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)

In [ ]:


In [6]:
def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.vocab for word in doc)

In [7]:
# Clean data with respect to Word2Vec model vocab.
corpus, texts, y = filter_docs(corpus, texts, y, lambda doc:has_vector_representation(word2vec_model,doc))
snippets, _, snippets_labels = filter_docs(snippets,None, snippets_labels, lambda docs:has_vector_representation(word2vec_model, doc))


11314 total docs
0 docs removed
10056 total docs
0 docs removed

In [8]:
# LSI
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

sims={'ng20':{}, 'snippets':{}}

In [9]:
# NG20 - LSI
dictionary=corpora.Dictionary(corpus)
corpus_gensim=[dictionary.doc2bow(doc) for doc in corpus]
tfidf=TfidfModel(corpus_gensim)
corpus_tfidf=tfidf[corpus_gensim]
lsi=LsiModel(corpus_tfidf, id2word=dictionary,num_topics=200)
lsi_index=MatrixSimilarity(lsi[corpus_tfidf])

sims['ng20']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]] for i in range(len(corpus))])
print ('finish calculating LSI')


finish calculating LSI

In [10]:
# Snippets - LSI
dictionary_snippets = corpora.Dictionary(snippets)
corpus_gensim_snippets = [dictionary_snippets.doc2bow(doc) for doc in snippets]
tfidf_snippets = TfidfModel(corpus_gensim_snippets)
corpus_tfidf_snippets=tfidf_snippets[corpus_gensim_snippets]
lsi_snippets = LsiModel(corpus_tfidf_snippets, id2word=dictionary_snippets, num_topics=200)
lsi_index_snippets = MatrixSimilarity(lsi_snippets[corpus_tfidf_snippets])

sims['snippets']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]] for i in range(len(snippets))])
print ('finish calculating LSI')


finish calculating LSI

In [11]:
# Centroid of the word vectors (Cosine Similarity)
from sklearn.metrics.pairwise import cosine_similarity

# ng20 centroid matrix
sims['ng20']['centroid']=cosine_similarity(np.array([document_vector(word2vec_model,doc) for doc in corpus]))


sims['snippets']['centroid'] = cosine_similarity(np.array([document_vector(word2vec_model, doc) for doc in snippets]))

print ('finish calculating cosin')


finish calculating cosin

In [12]:
def most_similar(i, X_sims, topn=None):
    """return the indices of the topn most similar documents with document i
    given the similarity matrix X_sims"""

    r = np.argsort(X_sims[i])[::-1]
    if r is None:
        return r
    else:
        return r[:topn]

#LSI
print(most_similar(0, sims['ng20']['LSI'], 20))
print(most_similar(0, sims['snippets']['LSI'], 20))

#Centroid
print(most_similar(0, sims['ng20']['centroid'], 20))
print(most_similar(0, sims['snippets']['centroid'], 20))


[    0   958  8266  7993  1224  8013   596  5553   659  7861  8364  8372
  1082  8555 10024  3819  9018  4985  4627  7878]
[    0   958  8266  7993  1224  8013   596  5553   659  7861  8364  8372
  1082  8555 10024  3819  9018  4985  4627  7878]
[    0   958  2554  3112  7861 11225  3819  6418  5167  3424   730 11169
  8153  1126  3311  4600  9580   659  8405  9456]
[   0   13   15  973  378   17    2 6658 6829 6833 6307   16  974    8 5535
   19   14    5  965   12]

In [13]:
# WMD
from gensim.similarities import WmdSimilarity

wmd_similarity_top20 = WmdSimilarity(corpus, word2vec_model, num_best=20)
most_similars_wmd_ng20_top20 = wmd_similarity_top20[corpus[0]]

In [14]:
wmd_similarity_snippets = WmdSimilarity(snippets, word2vec_model, num_best=20)
most_similars_snippets = wmd_similarity_snippets[snippets[0]]

In [15]:
most_similars_snippets


Out[15]:
[(0, 1.0),
 (13, 0.60329492929644768),
 (973, 0.5286272334018054),
 (2, 0.52601643645020979),
 (378, 0.52323760224834504),
 (16, 0.5177545982856413),
 (7509, 0.51254049902118537),
 (12, 0.51098133962722514),
 (6828, 0.50705399681485708),
 (19, 0.50653771271015224),
 (17, 0.50652784909122306),
 (974, 0.50597894422453937),
 (7, 0.50416865321814652),
 (15, 0.5041521835206122),
 (712, 0.50320841421252649),
 (6663, 0.50302378894798072),
 (56, 0.50239767268269631),
 (6829, 0.5021651306706193),
 (5, 0.501937836425843),
 (2169, 0.50181756077408601)]

In [ ]: