In [53]:
import re, json, os, nltk, string, gensim, bz2
from gensim import corpora, models, similarities, utils
from nltk.corpus import stopwords
from os import listdir
from datetime import datetime as dt
import numpy as np
import codecs
import sys
# Python 2 hack: reload(sys) restores the setdefaultencoding() function that site.py
# removes; the std streams are saved and restored because reload(sys) resets them.
stdin, stdout, stderr = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr
sys.setdefaultencoding('utf-8')
In [54]:
import logging
fmtstr = '%(asctime)s [%(levelname)s][%(name)s] %(message)s'
datefmtstr = '%Y/%m/%d %H:%M:%S'
log_fn = str(dt.now().date()) + '.txt'
logger = logging.getLogger()
if len(logger.handlers) >= 1:
    # Replace the existing handler so the log goes to today's file
    logger.removeHandler(logger.handlers[0])
    logger.addHandler(logging.FileHandler(log_fn))
    logger.handlers[0].setFormatter(logging.Formatter(fmtstr, datefmtstr))
else:
    logging.basicConfig(filename=log_fn, format=fmtstr,
                        datefmt=datefmtstr, level=logging.NOTSET)
In [2]:
stop_words = set(stopwords.words('english'))
In [3]:
# Unicode punctuation, dash, quote and space variants that are normalized to plain spaces
_UNICODE_JUNK = (u"\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u2026"
                 u"\u20ac\u201a\u201e\u2020\u2021\u02c6\u2030\u2039"
                 u"\u02dc\u203a\ufffe\u00b0\u00b1\u0020\u00a0\u1680"
                 u"\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                 u"\u2008\u2009\u200a\u202f\u205f\u3000\u20ab\u201b"
                 u"\u201f\u2e02\u2e04\u2e09\u2e0c\u2e1c\u2e20\u00bb"
                 u"\u2e03\u2e05\u2e0a\u2e0d\u2e1d\u2e21\u2032\u2031"
                 u"\u2033\u2034\u2035\u2036\u2037\u2038")
_JUNK_TABLE = {ord(c): u" " for c in _UNICODE_JUNK}

def docs_out(line):
    # Concatenate the brief, claim and description fields of one patent record,
    # replace punctuation and exotic unicode characters with spaces, and return
    # the cleaned text together with the patent number.
    j = json.loads(line)
    tmp = j.get('brief') + j.get('claim') + j.get('description')
    tmp = re.sub('([,?!:;%$&*#~\<\>=+/"(){}\[\]\'])', ' ', tmp)
    tmp = tmp.translate(_JUNK_TABLE)
    tmp = re.sub('[.] ', ' ', tmp)
    return tmp, j.get('patentNumber')
In [4]:
documents = []
f = codecs.open('/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized', 'r', 'UTF-8')
for line in f:
    documents.append(docs_out(line)[0] + '\n')
In [5]:
dictionary = corpora.Dictionary([doc.split() for doc in documents])
In [7]:
stop_ids = [dictionary.token2id[stopword] for stopword in stop_words
if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq <= 1]
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
#dictionary.save('USPTO_2013.dict')
In [8]:
corpus = [dictionary.doc2bow(doc.split()) for doc in documents]
In [39]:
model_tfidf = models.TfidfModel(corpus)
corpus_tfidf = model_tfidf[corpus]
LsiModel parameters:

- num_topics=200: number of dimensions to keep after the SVD decomposition.
- id2word: the dictionary of the corpus, used to map ids back to words.
- chunksize=20000: number of documents held in memory at a time; larger values use more memory but train faster.
- decay=1.0: the data is processed chunk by chunk, so there are "old" and "new" chunks; decay is the weight given to the old chunks when a new chunk arrives. A value below 1.0 makes the model gradually "forget" older data.
- distributed=False: whether to enable distributed computation, where each core receives one chunk.
- onepass=True: set to False to force the multi-pass stochastic algorithm.
- power_iters=2: number of power iterations in the multi-pass case; more iterations give higher accuracy but take longer.

Let $X$ be the TF-IDF matrix of the corpus. After the SVD, the left singular vectors are available in lsi.projection.u and the singular values in lsi.projection.s:

$X = USV^T$, where $U \in \mathbb{R}^{|V|\times m}$, $S \in \mathbb{R}^{m\times m}$, $V \in \mathbb{R}^{|D|\times m}$

lsi[X] corresponds to $U^{-1}X = SV^T$ (its transpose is $VS$), so $V$ can be recovered as $S^{-1}U^{-1}X$, i.e. by dividing lsi[X] by $S$.

Because lsi[X] has no values of its own (it is just a generator), it must first be converted to a numpy array with gensim.matutils.corpus2dense and then divided by lsi.projection.s.
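For reference, a constructor call that spells out every parameter listed above might look like the sketch below; the values are illustrative and the variable model_lsi_full is hypothetical, not reused by later cells.

# Sketch only: explicit LsiModel construction with the parameters described above.
model_lsi_full = models.LsiModel(corpus_tfidf,
                                 id2word=dictionary,   # map token ids back to words
                                 num_topics=200,       # dimensions kept after the SVD
                                 chunksize=20000,      # documents processed per chunk
                                 decay=1.0,            # weight of old chunks vs. a new chunk
                                 distributed=False,    # single-machine training
                                 onepass=True,         # False forces the multi-pass algorithm
                                 power_iters=2)        # power iterations for multi-pass SVD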
In [43]:
model_lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
corpus_lsi = model_lsi[corpus_tfidf]
In [75]:
# Compute V this way; its rows can be used as document vectors
docvec_lsi = gensim.matutils.corpus2dense(corpus_lsi, len(model_lsi.projection.s)).T / model_lsi.projection.s
In [153]:
# Word vectors: use the rows of U directly (one row per vocabulary term)
wordsim_lsi = similarities.MatrixSimilarity(model_lsi.projection.u, num_features=model_lsi.projection.u.shape[1])
In [154]:
# Second version: use U*S as the word vectors
wordsim_lsi2 = similarities.MatrixSimilarity(model_lsi.projection.u * model_lsi.projection.s,
num_features=model_lsi.projection.u.shape[1])
In [155]:
def lsi_query(query, use_ver2=False):
    qvec = model_lsi[model_tfidf[dictionary.doc2bow(query.split())]]
    if use_ver2:
        s = wordsim_lsi2[qvec]
    else:
        s = wordsim_lsi[qvec]
    return [dictionary[i] for i in s.argsort()[-10:]]
In [160]:
print lsi_query('energy')
In [161]:
print lsi_query('energy', True)
Word2Vec parameters:

- sentences: the list of lists of words used for training. It is optional, because the model can be built first and fed data incrementally later.
- size=100: dimensionality of the vectors.
- alpha=0.025: initial learning rate.
- window=5: size of the context window.
- min_count=5: words occurring fewer than min_count times are ignored.
- max_vocab_size: cap on the vocabulary size; if there are too many words, the rarest ones are dropped. Unlimited by default.
- sample=0.001: subsampling threshold; very frequent words are randomly downsampled, which both widens the effective context window and reduces the impact of stopwords.
- seed=1: seed for the random number generator.
- workers=3: number of cores to use for training on a multi-core system.
- min_alpha=0.0001: final value the learning rate decays to.
- sg=0: 0 uses CBOW, 1 uses skip-gram.
- hs=0: 1 uses hierarchical softmax, 0 uses negative sampling.
- negative=5: number of negative samples used for training.
- cbow_mean=1: with CBOW, 0 uses the sum of the context vectors as the hidden layer, 1 uses their mean.
- hashfxn=<built-in hash function>: hash function used to randomly initialize the weights.
- iter=5: number of passes over the whole corpus.
- trim_rule: None means words with fewer than min_count occurrences are discarded; alternatively a function(word, count, min_count) can be given, returning util.RULE_DISCARD, util.RULE_KEEP or util.RULE_DEFAULT. This parameter affects how the dictionary is built.
- sorted_vocab=1: 1 sorts the words by frequency before assigning word indices.
- batch_words=10000: number of words per batch handed to the workers.

First create an empty model:
model_w2v = models.Word2Vec(size=200, sg=1)
Pass in a list of sentences (lists of words) to update the vocabulary:
sent = [['first','sent'], ['second','sent']]
model_w2v.build_vocab(sent)
Pass in a list of sentences to update (train) the model:
model_w2v.train(sent)
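For reference, the whole workflow can also be collapsed into a single call by passing the sentences to the constructor; this is a sketch with illustrative values, and the variable model_demo is hypothetical and not used in the cells below.

# Sketch only: one-shot training with several of the parameters described above.
sent = [['first', 'sent'], ['second', 'sent']]
model_demo = models.Word2Vec(sent, size=200, sg=1, window=5,
                             min_count=1,      # keep even the toy words above
                             sample=1e-3, negative=5, workers=3, iter=5)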
In [179]:
all_text = [doc.split() for doc in documents]
In [180]:
model_w2v = models.Word2Vec(size=200, sg=1)
In [181]:
%timeit model_w2v.build_vocab(all_text)
In [183]:
%timeit model_w2v.train(all_text)
In [189]:
model_w2v.most_similar_cosmul(['deep','learning'])
Out[189]:
Doc2Vec parameters:

- documents=None: the documents used for training; either a list of TaggedDocument or a TaggedDocument generator.
- size=300: dimensionality of the vectors.
- alpha=0.025: initial learning rate.
- window=8: size of the context window.
- min_count=5: words occurring fewer than min_count times are ignored.
- max_vocab_size=None: cap on the vocabulary size; if there are too many words, the rarest ones are dropped. Unlimited by default.
- sample=0: subsampling threshold; very frequent words are randomly downsampled (0 disables it), which both widens the effective context window and reduces the impact of stopwords.
- seed=1: seed for the random number generator.
- workers=1: number of cores to use for training on a multi-core system.
- min_alpha=0.0001: final value the learning rate decays to.
- hs=1: 1 uses hierarchical softmax, 0 uses negative sampling.
- negative=0: number of negative samples used for training (0 disables negative sampling).
- dbow_words=0: 1 trains word vectors (via skip-gram) together with the doc vectors (via DBOW); 0 trains doc vectors only.
- dm=1: 1 trains with distributed memory (PV-DM), 0 trains with distributed bag-of-words (PV-DBOW).
- dm_concat=0: 1 uses concatenation of the context vectors instead of sum/average, 0 uses sum/average. Concatenation produces a much larger model and a longer input vector.
- dm_mean=0: with PV-DM and dm_concat=0, 0 uses the sum of the context vectors as the hidden layer, 1 uses their mean.
- dm_tag_count=1: expected number of document tags per document when dm_concat=1.
- trim_rule=None: None means words with fewer than min_count occurrences are discarded; alternatively a function(word, count, min_count) can be given, returning util.RULE_DISCARD, util.RULE_KEEP or util.RULE_DEFAULT. This parameter affects how the dictionary is built.
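For reference, a PV-DBOW configuration that also trains word vectors could be spelled out as in the sketch below; the values are illustrative and model_dbow is hypothetical, since the cells that follow train a PV-DM style model with negative sampling instead.

from gensim.models.doc2vec import Doc2Vec
# Sketch only: PV-DBOW (dm=0) with word vectors trained alongside the doc vectors
# (dbow_words=1), using negative sampling instead of hierarchical softmax.
model_dbow = Doc2Vec(size=200, window=8, min_count=5, sample=1e-5,
                     dm=0, dbow_words=1, hs=0, negative=5, workers=4)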
In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
In [12]:
class PatentDocGenerator(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        f = codecs.open(self.filename, 'r', 'UTF-8')
        for line in f:
            text, appnum = docs_out(line)
            yield TaggedDocument(text.split(), appnum.split())
In [28]:
doc = PatentDocGenerator('/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized')
%timeit model_d2v = Doc2Vec(doc, size=200, window=8, sample=1e-5, hs=0, negative=5)
In [29]:
doc = PatentDocGenerator('/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized')
model_d2v = Doc2Vec(doc, size=200, window=8, sample=1e-5, hs=0, negative=5)
In [30]:
model_d2v.docvecs.most_similar(['20140187118'])
Out[30]:
In [31]:
m = Doc2Vec(size=200, window=8, sample=1e-5, hs=0, negative=5)
In [32]:
m.build_vocab(doc)
In [33]:
m.train(doc)
Out[33]:
In [34]:
m.docvecs.most_similar(['20140187118'])
Out[34]:
In [25]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
In [26]:
class PatentDocGenerator(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        f = codecs.open(self.filename, 'r', 'UTF-8')
        for line in f:
            text, appnum = docs_out(line)
            yield TaggedDocument(text.split(), appnum.split())
In [27]:
model_d2v = Doc2Vec(size=200, window=8, sample=1e-5, hs=0, negative=5)
root = '/share/USPatentData/tokenized_appDate_2013/'
In [57]:
for fn in sorted(listdir(root)):
    doc = PatentDocGenerator(os.path.join(root, fn))
    start = dt.now()
    model_d2v.build_vocab(doc)
    model_d2v.train(doc)
    logging.info('{} training time: {}'.format(fn, str(dt.now() - start)))
In [58]:
model_d2v.save("doc2vec_uspto_2013.model")
In [ ]: