In [ ]:
%reset -f
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pprint import pprint
Using a classic example, which is also used in the tutorial on the Gensim website, the cells below show how the words of a set of documents are mapped to integer IDs through a dictionary, and how a bag-of-words corpus is then computed from the number of occurrences of each word in each document.
In [ ]:
from gensim import corpora, models, similarities
documents = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey"]
In [ ]:
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]
pprint(texts)
In [ ]:
dictionary = corpora.Dictionary(texts)
print(dictionary)
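As a side note, the one-off word removal above is done by hand with a defaultdict; the Dictionary class also offers filter_extremes, which filters by document frequency rather than by total count. A roughly equivalent sketch for this corpus (not used in the rest of the notebook):
In [ ]:
# Alternative filtering directly on the dictionary (sketch): drop tokens that
# appear in fewer than 2 documents; document frequency only approximates the
# raw-count filtering done manually above.
alt_dictionary = corpora.Dictionary(
    [[word for word in document.lower().split() if word not in stoplist]
     for document in documents])
alt_dictionary.filter_extremes(no_below=2, no_above=1.0)
print(alt_dictionary)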
In [ ]:
pprint(dictionary.token2id)
In [ ]:
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
print(vec_bow)
In [ ]:
bow_corpus = [dictionary.doc2bow(text) for text in texts]
pprint(bow_corpus)
In [ ]:
from gensim import corpora, models, similarities
tfidf = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]
for doc in tfidf_corpus:
    pprint(doc)
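For reference, gensim's TfidfModel by default weights each term by its raw count times log2(N/df) and then L2-normalizes each document vector; the sketch below reproduces the weights of the first document by hand under that assumption (defaults may differ across gensim versions):
In [ ]:
import math
# Hand-computed tf-idf for the first document, assuming gensim's defaults:
# weight = count * log2(num_docs / doc_freq), followed by L2 normalization.
num_docs = len(bow_corpus)
first_doc = bow_corpus[0]
raw = [(term_id, count * math.log2(num_docs / dictionary.dfs[term_id]))
       for term_id, count in first_doc]
norm = math.sqrt(sum(w * w for _, w in raw))
pprint([(term_id, w / norm) for term_id, w in raw])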
Continuing with the same example, the topics computed by LSI can be inspected for a number of "concepts" equal to 2. The previously computed bag-of-words corpus is used to build the LSI model, and the same bag-of-words corpus is then passed through the model to obtain a new set of vectors, the corpus in the LSI space, which is plotted below in 2 dimensions since 2 topics or concepts were requested.
In [ ]:
lsi = models.LsiModel(bow_corpus, id2word=dictionary, num_topics=2)
#lsi = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=2)
pprint(lsi.print_topics())
lsi_corpus = lsi[bow_corpus]
print('==========')
for doc in lsi_corpus:
    pprint(doc)
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
# Build a dense (num_docs x 2) matrix from the sparse LSI vectors
from gensim import matutils
matrix = matutils.corpus2dense(lsi_corpus, num_terms=2).T
print('LSI matrix shape:', matrix.shape)
km = KMeans(n_clusters=2, init='k-means++', max_iter=100, n_init=4, verbose=False, random_state=10)
km.fit(matrix)
pprint(km.labels_)
plt.title('Documents in the LSI space')
plt.xlabel('Dimension / Topic 1')
plt.ylabel('Dimension / Topic 2')
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], marker='x', s=169, linewidths=3, color='g', zorder=10)
plt.scatter(matrix[:,0], matrix[:,1])
unique_labels = set(km.labels_)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
xy = matrix[km.labels_==0]
plt.scatter(xy[:,0], xy[:,1], color='r')
silhouette_coefficient = metrics.silhouette_score(matrix, km.labels_, sample_size=1000)
print('Silhouette Coefficient:', silhouette_coefficient)
For LDA, in the same way, we obtain a final estimate of whether a document is related to some terms or to others, that is, to some topics or to others. The topics computed by the model are shown below, and the figure produced with the pyLDAvis library shows the weight of each term within each topic.
In [ ]:
lda = models.ldamodel.LdaModel(bow_corpus, num_topics=2, id2word = dictionary, passes=20)
pprint(lda.print_topics())
lda_corpus = lda[bow_corpus]
print('==========')
for doc in lda_corpus:
    pprint(doc)
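Note that lda[bow] silently drops topics whose probability falls below a small threshold, so some documents may show a single pair above. If the full distribution is needed, get_document_topics can be asked to keep every topic; a minimal check for the first document:
In [ ]:
# Full topic distribution for the first document, including near-zero topics
pprint(lda.get_document_topics(bow_corpus[0], minimum_probability=0.0))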
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
# lda[bow] omits topics below a probability threshold, so build the dense
# (num_docs x 2) matrix explicitly instead of unpacking pairs by hand
from gensim import matutils
matrix = matutils.corpus2dense(lda_corpus, num_terms=2).T
print('LDA matrix shape:', matrix.shape)
km = KMeans(n_clusters=2, init='k-means++', max_iter=100, n_init=4, verbose=False, random_state=10)
km.fit(matrix)
pprint(km.labels_)
plt.title('Documents in the LDA space')
plt.xlabel('Dimension / Topic 1')
plt.ylabel('Dimension / Topic 2')
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], marker='x', s=169, linewidths=3, color='g', zorder=10)
plt.scatter(matrix[:,0], matrix[:,1])
unique_labels = set(km.labels_)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
xy = matrix[km.labels_==0]
plt.scatter(xy[:,0], xy[:,1], color='r')
silhouette_coefficient = metrics.silhouette_score(matrix, km.labels_, sample_size=1000)
print('Silhouette Coefficient:', silhouette_coefficient)
In [ ]:
# Requires pyLDAvis and scikit-bio==0.2.3, e.g. $ ~/anaconda3/bin/pip install scikit-bio==0.2.3 pyLDAvis
import pyLDAvis
from pyLDAvis import gensim as gensimvis
pyLDAvis.enable_notebook()
gensimvis.prepare(lda, bow_corpus, dictionary)
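The interactive visualization can also be written out as a standalone page; a small sketch, assuming the hypothetical output file 'lda_vis.html':
In [ ]:
# Save the pyLDAvis visualization to an HTML file (hypothetical filename)
vis_data = gensimvis.prepare(lda, bow_corpus, dictionary)
pyLDAvis.save_html(vis_data, 'lda_vis.html')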
In [ ]:
hdp = models.hdpmodel.HdpModel(bow_corpus, id2word=dictionary)
pprint(hdp.print_topics(topics=-1))
hdp_corpus = hdp[bow_corpus]
print('==========')
for doc in hdp_corpus:
    pprint(doc)
For the new document "Human computer interaction", used as the query, each model returns the documents it has computed as most similar. Note that LSI makes a sharper split between the 5 documents it considers most similar to the query and the remaining 4. For LDA, on the other hand, the gap between the 7 documents it considers most similar to the query and the rest is not as large.
In [ ]:
vec_tfidf = tfidf[vec_bow]
pprint(vec_tfidf)
index_tfidf = similarities.MatrixSimilarity(tfidf_corpus)
sims = index_tfidf[vec_tfidf]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print('==========')
for sim in sims:
    print(sim[1], '\t', documents[sim[0]])
In [ ]:
index_tfidf.num_best = 2
sims = index_tfidf[vec_tfidf]
pprint(sims)
In [ ]:
vec_lsi = lsi[vec_bow]
pprint(vec_lsi)
index_lsi = similarities.MatrixSimilarity(lsi_corpus)
sims = index_lsi[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print('==========')
for sim in sims:
    print(sim[1], '\t', documents[sim[0]])
In [ ]:
vec_lda = lda[vec_bow]
pprint(vec_lda)
index_lda = similarities.MatrixSimilarity(lda_corpus)
sims = index_lda[vec_lda]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print('==========')
for sim in sims:
    print(sim[1], '\t', documents[sim[0]])
The class similarities.MatrixSimilarity is only appropriate when the whole set of vectors fits into memory. For example, a corpus of one million documents would require 2GB of RAM in a 256-dimensional LSI space, when used with this class.
Without 2GB of free RAM, you would need to use the similarities.Similarity class. This class operates in fixed memory, by splitting the index across multiple files on disk, called shards. It uses similarities.MatrixSimilarity and similarities.SparseMatrixSimilarity internally, so it is still fast, although slightly more complex.
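A minimal sketch of that disk-backed index, assuming the shards can be written next to the notebook under the hypothetical prefix './lsi_index' and reusing the 2-dimensional LSI vectors and query from above:
In [ ]:
# Shard-based index that keeps only part of the data in memory at a time
index_disk = similarities.Similarity('./lsi_index', lsi_corpus, num_features=2)
sims_disk = index_disk[vec_lsi]
print(list(enumerate(sims_disk)))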
The HDP model is a new addition to gensim, and still rough around its academic edges – use with care.
In [ ]:
vec_hdp = hdp[vec_bow]
pprint(vec_hdp)
index_hdp = similarities.MatrixSimilarity(hdp_corpus)
sims = index_hdp[vec_hdp]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print('==========')
pprint(sims)