In [53]:
import re, json, os, nltk, string, gensim, bz2
from gensim import corpora, models, similarities, utils
from nltk.corpus import stopwords
from os import listdir
from datetime import datetime as dt
import numpy as np
import codecs
import sys
# Python 2 hack: reload(sys) restores the setdefaultencoding() function that site.py
# removes; the std streams are saved and restored because reload(sys) resets them.
stdin, stdout, stderr = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr
sys.setdefaultencoding('utf-8')
In [54]:
import logging
fmtstr = '%(asctime)s [%(levelname)s][%(name)s] %(message)s'
datefmtstr = '%Y/%m/%d %H:%M:%S'
log_fn = str(dt.now().date()) + '.txt'
logger = logging.getLogger()
if len(logger.handlers) >= 1:
    # Replace the existing handler so the log goes to today's file
    logger.removeHandler(logger.handlers[0])
    logger.addHandler(logging.FileHandler(log_fn))
    logger.handlers[0].setFormatter(logging.Formatter(fmtstr, datefmtstr))
else:
    logging.basicConfig(filename=log_fn, format=fmtstr,
                        datefmt=datefmtstr, level=logging.NOTSET)
In [2]:
stop_words = set(stopwords.words('english'))
In [3]:
# Unicode punctuation, dash, quote and space variants that are normalized to plain spaces
_UNICODE_JUNK = (u"\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u2026"
                 u"\u20ac\u201a\u201e\u2020\u2021\u02c6\u2030\u2039"
                 u"\u02dc\u203a\ufffe\u00b0\u00b1\u0020\u00a0\u1680"
                 u"\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                 u"\u2008\u2009\u200a\u202f\u205f\u3000\u20ab\u201b"
                 u"\u201f\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20\u00bb"
                 u"\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21\u2032\u2031"
                 u"\u2033\u2034\u2035\u2036\u2037\u2038")
_JUNK_TABLE = {ord(c): u" " for c in _UNICODE_JUNK}

def docs_out(line):
    # Concatenate the brief, claim and description fields of one patent record,
    # replace punctuation and exotic unicode characters with spaces, and return
    # the cleaned text together with the patent number.
    j = json.loads(line)
    tmp = j.get('brief') + j.get('claim') + j.get('description')
    tmp = re.sub('([,?!:;%$&*#~\<\>=+/"(){}\[\]\'])', ' ', tmp)
    tmp = tmp.translate(_JUNK_TABLE)
    tmp = re.sub('[.] ', ' ', tmp)
    return tmp, j.get('patentNumber')
In [4]:
documents = []
f = codecs.open('/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized', 'r', 'UTF-8')
for line in f:
    documents.append(docs_out(line)[0] + '\n')
In [5]:
dictionary = corpora.Dictionary([doc.split() for doc in documents])
In [7]:
stop_ids = [dictionary.token2id[stopword] for stopword in stop_words
if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq <= 1]
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
#dictionary.save('USPTO_2013.dict')
In [8]:
corpus = [dictionary.doc2bow(doc.split()) for doc in documents]
In [39]:
model_tfidf = models.TfidfModel(corpus)
corpus_tfidf = model_tfidf[corpus]
LsiModel parameters:

- num_topics=200: number of dimensions to keep after the SVD decomposition.
- id2word: the dictionary of the corpus, used to map ids back to words.
- chunksize=20000: number of documents held in memory at a time; larger values use more memory but train faster.
- decay=1.0: the data is processed chunk by chunk, so there are "old" and "new" chunks; decay is the weight given to the old chunks when a new chunk arrives. A value below 1.0 makes the model gradually "forget" older data.
- distributed=False: whether to enable distributed computation, where each core receives one chunk.
- onepass=True: set to False to force the multi-pass stochastic algorithm.
- power_iters=2: number of power iterations in the multi-pass case; more iterations give higher accuracy but take longer.

Let $X$ be the TF-IDF matrix of the corpus. After the SVD, the left singular vectors are available in lsi.projection.u and the singular values in lsi.projection.s:

$X = USV^T$, where $U \in \mathbb{R}^{|V|\times m}$, $S \in \mathbb{R}^{m\times m}$, $V \in \mathbb{R}^{|D|\times m}$

lsi[X] corresponds to $U^{-1}X = SV^T$ (its transpose is $VS$), so $V$ can be recovered as $S^{-1}U^{-1}X$, i.e. by dividing lsi[X] by $S$.

Because lsi[X] has no values of its own (it is just a generator), it must first be converted to a numpy array with gensim.matutils.corpus2dense and then divided by lsi.projection.s.
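For reference, a constructor call that spells out every parameter listed above might look like the sketch below; the values are illustrative and the variable model_lsi_full is hypothetical, not reused by later cells.

# Sketch only: explicit LsiModel construction with the parameters described above.
model_lsi_full = models.LsiModel(corpus_tfidf,
                                 id2word=dictionary,   # map token ids back to words
                                 num_topics=200,       # dimensions kept after the SVD
                                 chunksize=20000,      # documents processed per chunk
                                 decay=1.0,            # weight of old chunks vs. a new chunk
                                 distributed=False,    # single-machine training
                                 onepass=True,         # False forces the multi-pass algorithm
                                 power_iters=2)        # power iterations for multi-pass SVD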
In [43]:
model_lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
corpus_lsi = model_lsi[corpus_tfidf]
In [75]:
# Compute V this way; its rows can be used as document vectors
docvec_lsi = gensim.matutils.corpus2dense(corpus_lsi, len(model_lsi.projection.s)).T / model_lsi.projection.s
In [153]:
# Word vectors: use the rows of U directly (one row per vocabulary term)
wordsim_lsi = similarities.MatrixSimilarity(model_lsi.projection.u, num_features=model_lsi.projection.u.shape[1])
In [154]:
# Second version: use U*S as the word vectors
wordsim_lsi2 = similarities.MatrixSimilarity(model_lsi.projection.u * model_lsi.projection.s,
num_features=model_lsi.projection.u.shape[1])
In [155]:
def lsi_query(query, use_ver2=False):
    qvec = model_lsi[model_tfidf[dictionary.doc2bow(query.split())]]
    if use_ver2:
        s = wordsim_lsi2[qvec]
    else:
        s = wordsim_lsi[qvec]
    return [dictionary[i] for i in s.argsort()[-10:]]
In [160]:
print lsi_query('energy')
In [161]:
print lsi_query('energy', True)
Word2Vec parameters:

- sentences: the list of lists of words used for training. It is optional, because the model can be built first and fed data incrementally later.
- size=100: dimensionality of the vectors.
- alpha=0.025: initial learning rate.
- window=5: size of the context window.
- min_count=5: words occurring fewer than min_count times are ignored.
- max_vocab_size: cap on the vocabulary size; if there are too many words, the rarest ones are dropped. Unlimited by default.
- sample=0.001: subsampling threshold; very frequent words are randomly downsampled, which both widens the effective context window and reduces the impact of stopwords.
- seed=1: seed for the random number generator.
- workers=3: number of cores to use for training on a multi-core system.
- min_alpha=0.0001: final value the learning rate decays to.
- sg=0: 0 uses CBOW, 1 uses skip-gram.
- hs=0: 1 uses hierarchical softmax, 0 uses negative sampling.
- negative=5: number of negative samples used for training.
- cbow_mean=1: with CBOW, 0 uses the sum of the context vectors as the hidden layer, 1 uses their mean.
- hashfxn=<built-in hash function>: hash function used to randomly initialize the weights.
- iter=5: number of passes over the whole corpus.
- trim_rule: None means words with fewer than min_count occurrences are discarded; alternatively a function(word, count, min_count) can be given, returning util.RULE_DISCARD, util.RULE_KEEP or util.RULE_DEFAULT. This parameter affects how the dictionary is built.
- sorted_vocab=1: 1 sorts the words by frequency before assigning word indices.
- batch_words=10000: number of words per batch handed to the workers.

First create an empty model:
model_w2v = models.Word2Vec(size=200, sg=1)
Pass in a list of sentences (lists of words) to update the vocabulary:
sent = [['first','sent'], ['second','sent']]
model_w2v.build_vocab(sent)
Pass in a list of sentences to update (train) the model:
model_w2v.train(sent)
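For reference, the whole workflow can also be collapsed into a single call by passing the sentences to the constructor; this is a sketch with illustrative values, and the variable model_demo is hypothetical and not used in the cells below.

# Sketch only: one-shot training with several of the parameters described above.
sent = [['first', 'sent'], ['second', 'sent']]
model_demo = models.Word2Vec(sent, size=200, sg=1, window=5,
                             min_count=1,      # keep even the toy words above
                             sample=1e-3, negative=5, workers=3, iter=5)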
In [179]:
all_text = [doc.split() for doc in documents]
In [180]:
model_w2v = models.Word2Vec(size=200, sg=1)
In [181]:
%timeit model_w2v.build_vocab(all_text)
In [183]:
%timeit model_w2v.train(all_text)
In [189]:
model_w2v.most_similar_cosmul(['deep','learning'])
Out[189]:
Doc2Vec parameters:

- documents=None: the documents used for training; either a list of TaggedDocument or a TaggedDocument generator.
- size=300: dimensionality of the vectors.
- alpha=0.025: initial learning rate.
- window=8: size of the context window.
- min_count=5: words occurring fewer than min_count times are ignored.
- max_vocab_size=None: cap on the vocabulary size; if there are too many words, the rarest ones are dropped. Unlimited by default.
- sample=0: subsampling threshold; very frequent words are randomly downsampled (0 disables it), which both widens the effective context window and reduces the impact of stopwords.
- seed=1: seed for the random number generator.
- workers=1: number of cores to use for training on a multi-core system.
- min_alpha=0.0001: final value the learning rate decays to.
- hs=1: 1 uses hierarchical softmax, 0 uses negative sampling.
- negative=0: number of negative samples used for training (0 disables negative sampling).
- dbow_words=0: 1 trains word vectors (via skip-gram) together with the doc vectors (via DBOW); 0 trains doc vectors only.
- dm=1: 1 trains with distributed memory (PV-DM), 0 trains with distributed bag-of-words (PV-DBOW).
- dm_concat=0: 1 uses concatenation of the context vectors instead of sum/average, 0 uses sum/average. Concatenation produces a much larger model and a longer input vector.
- dm_mean=0: with PV-DM and dm_concat=0, 0 uses the sum of the context vectors as the hidden layer, 1 uses their mean.
- dm_tag_count=1: expected number of document tags per document when dm_concat=1.
- trim_rule=None: None means words with fewer than min_count occurrences are discarded; alternatively a function(word, count, min_count) can be given, returning util.RULE_DISCARD, util.RULE_KEEP or util.RULE_DEFAULT. This parameter affects how the dictionary is built.
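For reference, a PV-DBOW configuration that also trains word vectors could be spelled out as in the sketch below; the values are illustrative and model_dbow is hypothetical, since the cells that follow train a PV-DM style model with negative sampling instead.

from gensim.models.doc2vec import Doc2Vec
# Sketch only: PV-DBOW (dm=0) with word vectors trained alongside the doc vectors
# (dbow_words=1), using negative sampling instead of hierarchical softmax.
model_dbow = Doc2Vec(size=200, window=8, min_count=5, sample=1e-5,
                     dm=0, dbow_words=1, hs=0, negative=5, workers=4)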
In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
In [12]:
class PatentDocGenerator(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        f = codecs.open(self.filename, 'r', 'UTF-8')
        for line in f:
            text, appnum = docs_out(line)
            yield TaggedDocument(text.split(), appnum.split())
In [28]:
doc = PatentDocGenerator('/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized')
%timeit model_d2v = Doc2Vec(doc, size=200, window=8, sample=1e-5, hs=0, negative=5)
In [29]:
doc = PatentDocGenerator('/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized')
model_d2v = Doc2Vec(doc, size=200, window=8, sample=1e-5, hs=0, negative=5)
In [30]:
model_d2v.docvecs.most_similar(['20140187118'])
Out[30]:
In [31]:
m = Doc2Vec(size=200, window=8, sample=1e-5, hs=0, negative=5)
In [32]:
m.build_vocab(doc)
In [33]:
m.train(doc)
Out[33]:
In [34]:
m.docvecs.most_similar(['20140187118'])
Out[34]:
In [25]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
In [26]:
class PatentDocGenerator(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        f = codecs.open(self.filename, 'r', 'UTF-8')
        for line in f:
            text, appnum = docs_out(line)
            yield TaggedDocument(text.split(), appnum.split())
In [27]:
model_d2v = Doc2Vec(size=200, window=8, sample=1e-5, hs=0, negative=5)
root = '/share/USPatentData/tokenized_appDate_2013/'
In [57]:
for fn in sorted(listdir(root)):
    doc = PatentDocGenerator(os.path.join(root, fn))
    start = dt.now()
    model_d2v.build_vocab(doc)
    model_d2v.train(doc)
    logging.info('{} training time: {}'.format(fn, str(dt.now() - start)))
In [58]:
model_d2v.save("doc2vec_uspto_2013.model")
In [ ]: