In [1]:
from gensim import corpora, models, similarities, utils
import numpy as np
import nltk
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [2]:
corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
          [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
          [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
          [(0, 1.0), (4, 2.0), (7, 1.0)],
          [(3, 1.0), (5, 1.0), (6, 1.0)],
          [(9, 1.0)],
          [(9, 1.0), (10, 1.0)],
          [(9, 1.0), (10, 1.0), (11, 1.0)],
          [(8, 1.0), (10, 1.0), (11, 1.0)]]
Each document in the corpus above is a sparse vector: a list of (feature id, weight) pairs. A transformation converts one such representation into another. In the example below, we first train a tf-idf model from the corpus; this model can then convert any document-term vector it sees later into its tf-idf representation.
In [3]:
tfidf = models.TfidfModel(corpus)
tfidf[[(0, 2.0), (2, 3.0)]]
Out[3]:
To get the tf-idf vectors of the original corpus itself, pass it through the tfidf model as well.
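Wrapping the whole corpus this way creates a lazy view: each tf-idf vector is only computed when the wrapper is iterated over. A minimal sketch using the tfidf model above (corpus_tfidf is just an illustrative name):
corpus_tfidf = tfidf[corpus]  # lazy wrapper; no vectors are computed yet
for doc in corpus_tfidf:      # vectors are produced on the fly during iteration
    print(doc)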
Below, a similarity index is built from the transformed corpus:
In [4]:
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)  # num_features = vocabulary size (feature ids 0-11)
In [5]:
sims = index[tfidf[[(0, 2.0), (2, 3.0)]]]  # similarity of the query document to every indexed document
In [6]:
sims
Out[6]:
This result shows that the query document [(0, 2.0), (2, 3.0)] is about 80% similar to document [0], 36% similar to document [1], and 27% similar to document [3].
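To read off the ranking directly, the scores can be sorted in descending order; a small sketch over the sims array returned above:
for doc_id, score in sorted(enumerate(sims), key=lambda item: -item[1]):
    print(doc_id, score)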
In [7]:
documents = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey"]
In [8]:
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
In [9]:
import itertools
cnt = nltk.FreqDist(itertools.chain.from_iterable(texts))  # token frequencies over the whole corpus
In [10]:
ntexts = [[term for term in doc if cnt[term] > 1] for doc in texts]  # keep only tokens that appear more than once
ntexts
Out[10]:
In [11]:
dictionary = corpora.Dictionary(ntexts)  # renamed to avoid shadowing the built-in dict
# dictionary.save('corpus.dict')  # save to disk
In [12]:
dictionary.token2id
Out[12]:
In [13]:
dictionary.doc2bow("Human computer interaction".lower().split())
Out[13]:
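Note that "interaction" never appears in the training texts, so it has no id in the dictionary and doc2bow silently ignores it; the resulting vector counts only "human" and "computer".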
In [14]:
corpus = [dictionary.doc2bow(text) for text in texts]
# corpora.MmCorpus.serialize('corpus.mm', corpus)  # save to disk
corpus
Out[14]:
In [15]:
class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume each line is one document, with tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())
    # because __iter__ uses yield, it is a generator, so the object can be
    # used directly in a for loop, e.g. for vec in myCorpus:
In [16]:
myCorpus = MyCorpus()
myCorpus
Out[16]:
In [19]:
for vec in myCorpus:
    print(vec)
Similarly, the dictionary can be built with the same streaming trick, so the file never has to be loaded into memory all at once.
In [20]:
dict2 = corpora.Dictionary(line.lower().split() for line in open("mycorpus.txt"))
print(dict2)
In [21]:
stopid = [dict2.token2id[s] for s in stoplist if s in dict2.token2id]
onceid = [tokenid for tokenid, freq in dict2.dfs.items() if freq == 1]
dict2.filter_tokens(stopid + onceid)  # remove stopwords and tokens that appear only once
dict2.compactify()  # reassign ids to remove the gaps left by filtering
print(dict2)
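gensim supports several on-disk corpus formats, and serialize() consumes its input document by document, so a corpus can be converted from one format to another without loading it into memory all at once: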
corpora.MmCorpus.serialize('corpus.mm', myCorpus)          # Matrix Market format
myCorpus = corpora.MmCorpus('corpus.mm')
corpora.SvmLightCorpus.serialize('corpus.svml', myCorpus)  # SVMlight format
myCorpus = corpora.SvmLightCorpus('corpus.svml')
corpora.BleiCorpus.serialize('corpus.ldac', myCorpus)      # Blei's LDA-C format
myCorpus = corpora.BleiCorpus('corpus.ldac')
corpora.LowCorpus.serialize('corpus.low', myCorpus)        # GibbsLDA++ List-of-Words format
myCorpus = corpora.LowCorpus('corpus.low')
A corpus can likewise be consumed in a streaming fashion:
for doc in corpus:
    print(doc)
In [22]:
import gensim
# a term-document matrix: rows are terms, columns are documents
numpy_matrix = np.array([[1, 2, 3, 4], [2, 4, 5, 6]])
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
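By default, Dense2Corpus treats each column of the matrix as one document. If documents are stored in rows instead, this can be flipped with the documents_columns flag (a one-line sketch; corpus_rows is an illustrative name):
corpus_rows = gensim.matutils.Dense2Corpus(numpy_matrix, documents_columns=False)  # rows are documents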
In [23]:
numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=2)  # back to a dense matrix; the corpus above has 2 terms
numpy_matrix
Out[23]:
In [24]:
from scipy import sparse
In [25]:
corpus = gensim.matutils.Sparse2Corpus(sparse.rand(5, 3))  # random 5x3 sparse matrix: 5 terms, 3 documents (columns)
scipy_matrix = gensim.matutils.corpus2csc(corpus, 5)  # back to a scipy CSC matrix with 5 terms
scipy_matrix
Out[25]: