Prepare Dictionary and Corpus


In [53]:
import re, json, os, nltk, string, gensim, bz2
from gensim import corpora, models, similarities, utils
from nltk.corpus import stopwords
from os import listdir
from datetime import datetime as dt
import numpy as np
import codecs
import sys
# Python 2 workaround: reload(sys) re-exposes setdefaultencoding, so preserve the std streams it would reset
stdin, stdout, stderr = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr
sys.setdefaultencoding('utf-8')

In [54]:
import logging
fmtstr = '%(asctime)s [%(levelname)s][%(name)s] %(message)s'
datefmtstr = '%Y/%m/%d %H:%M:%S'
log_fn = str(dt.now().date()) + '.txt'
logger = logging.getLogger()
if len(logger.handlers) >= 1:
    logger.removeHandler(logger.handlers[0])
    logger.addHandler(logging.FileHandler(log_fn))
    logger.handlers[0].setFormatter(logging.Formatter(fmtstr, datefmtstr))
else:
    logging.basicConfig(filename=log_fn, format=fmtstr,
                        datefmt=datefmtstr, level=logging.NOTSET)

In [2]:
stop_words = set(stopwords.words('english'))

In [3]:
def docs_out(line):
    # Parse one JSON-encoded patent and return (cleaned text, patent number)
    j = json.loads(line)
    tmp = j.get('brief') + j.get('claim') + j.get('description')
    # Strip ASCII punctuation, then a long list of Unicode punctuation and space characters
    tmp = re.sub('([,?!:;%$&*#~\<\>=+/"(){}\[\]\'])',' ',tmp)
    tmp = tmp.replace(u"\u2018", " ").replace(u"\u2019", " ").replace(u"\u201c"," ").replace(u"\u201d", " ")
    tmp = tmp.replace(u"\u2022", " ").replace(u"\u2013", " ").replace(u"\u2014", " ").replace(u"\u2026", " ")
    tmp = tmp.replace(u"\u20ac", " ").replace(u"\u201a", " ").replace(u"\u201e", " ").replace(u"\u2020", " ")
    tmp = tmp.replace(u"\u2021", " ").replace(u"\u02C6", " ").replace(u"\u2030", " ").replace(u"\u2039", " ")
    tmp = tmp.replace(u"\u02dc", " ").replace(u"\u203a", " ").replace(u"\ufffe", " ").replace(u"\u00b0", " ")
    tmp = tmp.replace(u"\u00b1", " ").replace(u"\u0020", " ").replace(u"\u00a0", " ").replace(u"\u1680", " ")
    tmp = tmp.replace(u"\u2000", " ").replace(u"\u2001", " ").replace(u"\u2002", " ").replace(u"\u2003", " ")
    tmp = tmp.replace(u"\u2004", " ").replace(u"\u2005", " ").replace(u"\u2006", " ").replace(u"\u2007", " ")
    tmp = tmp.replace(u"\u2008", " ").replace(u"\u2009", " ").replace(u"\u200a", " ").replace(u"\u202f", " ")
    tmp = tmp.replace(u"\u205f", " ").replace(u"\u3000", " ").replace(u"\u20ab", " ").replace(u"\u201b", " ")
    tmp = tmp.replace(u"\u201f", " ").replace(u"\u2e02", " ").replace(u"\u2e04", " ").replace(u"\u2e09", " ")
    tmp = tmp.replace(u"\u2e0c", " ").replace(u"\u2e1c", " ").replace(u"\u2e20", " ").replace(u"\u00bb", " ")
    tmp = tmp.replace(u"\u2e03", " ").replace(u"\u2e05", " ").replace(u"\u2e0a", " ").replace(u"\u2e0d", " ")
    tmp = tmp.replace(u"\u2e1d", " ").replace(u"\u2e21", " ").replace(u"\u2032", " ").replace(u"\u2031", " ")
    tmp = tmp.replace(u"\u2033", " ").replace(u"\u2034", " ").replace(u"\u2035", " ").replace(u"\u2036", " ")
    tmp = tmp.replace(u"\u2037", " ").replace(u"\u2038", " ")
    tmp = re.sub('[.] ',' ',tmp)  # drop sentence-ending periods
    return tmp, j.get('patentNumber')

In [4]:
documents = []
f = codecs.open('/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized','r', 'UTF-8')
for line in f:
    documents.append(docs_out(line)[0] + '\n')

In [5]:
dictionary = corpora.Dictionary([doc.split() for doc in documents])

In [7]:
stop_ids = [dictionary.token2id[stopword] for stopword in stop_words
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq <= 1]
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
#dictionary.save('USPTO_2013.dict')

In [8]:
corpus = [dictionary.doc2bow(doc.split()) for doc in documents]

Build LSI Model


In [39]:
model_tfidf = models.TfidfModel(corpus)
corpus_tfidf = model_tfidf[corpus]

LsiModel parameters (see the sketch after this list)

  • num_topics=200: number of dimensions to keep after the SVD decomposition
  • id2word: the dictionary of the corpus, used to map ids back to words
  • chunksize=20000: number of documents processed in memory at a time; larger values use more memory but train faster
  • decay=1.0: the data is processed chunk by chunk, so old and new chunks are weighted separately; decay is the weight given to the old chunks when a new chunk arrives, and values below 1.0 make the model gradually "forget" older data
  • distributed=False: whether to enable distributed computation, with each core receiving one chunk
  • onepass=True: set to False to force the multi-pass stochastic algorithm
  • power_iters=2: number of power iterations used in the multi-pass case; larger values improve accuracy at the cost of longer runtime
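
A minimal sketch restating the call used further down with every knob written out; the values simply spell out the library defaults, and model_lsi_demo is an illustrative name, not part of the original notebook.

model_lsi_demo = models.LsiModel(
    corpus_tfidf,          # TF-IDF-weighted corpus built above
    id2word=dictionary,    # maps token ids back to words, e.g. for show_topics()
    num_topics=200,        # rank of the truncated SVD
    chunksize=20000,       # documents held in memory per training chunk
    decay=1.0,             # values < 1.0 gradually down-weight older chunks
    distributed=False,     # single-machine training
    onepass=True,          # False would force the multi-pass stochastic algorithm
    power_iters=2)         # extra power iterations for higher accuracy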

Let $X$ be the TF-IDF matrix of the corpus. After the SVD decomposition, the left singular vectors are available as lsi.projection.u and the singular values as lsi.projection.s.

$X = USV^T$, where $U \in \mathbb{R}^{|V|\times m}$, $S \in \mathbb{R}^{m\times m}$, $V \in \mathbb{R}^{|D|\times m}$

lsi[X] is equivalent to $U^TX = SV^T$ (the columns of $U$ are orthonormal, so $U^T$ undoes the projection). To recover $V$, compute $S^{-1}U^TX$, i.e. divide lsi[X] by $S$.

Since lsi[X] is not a materialized matrix but a generator, first convert it to a numpy array with gensim.matutils.corpus2dense, then divide by lsi.projection.s.


In [43]:
model_lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
corpus_lsi = model_lsi[corpus_tfidf]

In [75]:
# Recover V this way; its rows can be used as document vectors
docvec_lsi = gensim.matutils.corpus2dense(corpus_lsi, len(model_lsi.projection.s)).T / model_lsi.projection.s
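
A minimal verification sketch of the relation above, assuming the corpus is small enough to hold as a dense matrix (for the full patent corpus you would only do this on a slice); it checks, up to floating-point error, that lsi[X] equals $U^TX$ and that $S^{-1}U^TX$ reproduces docvec_lsi.

X = gensim.matutils.corpus2dense(corpus_tfidf, num_terms=len(dictionary))    # dense TF-IDF matrix, terms x docs
UtX = gensim.matutils.corpus2dense(corpus_lsi, len(model_lsi.projection.s))  # lsi[X], topics x docs
print np.allclose(model_lsi.projection.u.T.dot(X), UtX, atol=1e-4)           # lsi[X] == U^T X = S V^T
V = (model_lsi.projection.u.T.dot(X) / model_lsi.projection.s[:, None]).T    # S^-1 U^T X, transposed to docs x topics
print np.allclose(V, docvec_lsi, atol=1e-4)                                  # same document vectors as above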

In [153]:
# Version 1: use the rows of U directly as word vectors
wordsim_lsi = similarities.MatrixSimilarity(model_lsi.projection.u, num_features=model_lsi.projection.u.shape[1])

In [154]:
# Version 2: use U*S as the word vectors
wordsim_lsi2 = similarities.MatrixSimilarity(model_lsi.projection.u * model_lsi.projection.s,
                                            num_features=model_lsi.projection.u.shape[1])

In [155]:
def lsi_query(query, use_ver2=False):
    # Project the query into LSI space: bag-of-words -> TF-IDF -> LSI
    qvec = model_lsi[model_tfidf[dictionary.doc2bow(query.split())]]
    # Cosine similarity against every word vector, then return the 10 closest words
    if use_ver2:
        s = wordsim_lsi2[qvec]
    else:
        s = wordsim_lsi[qvec]
    return [dictionary[i] for i in s.argsort()[-10:]]

In [160]:
print lsi_query('energy')


[u'duct', u'energy', u'thermosetting', u'sheet', u'thermoforming', u'heater', u'mold', u'transducer', u'zone', u'uvc']

In [161]:
print lsi_query('energy', True)


[u'private_use_areas', u'energy', u'chamber', u'transducer', u'heater', u'thermoforming', u'sheet', u'mold', u'zone', u'uvc']

Build Word2Vec Model

Word2Vec parameters (see the sketch after this list)

  • sentences: the list of word lists used for training; not required, because you can build the model first and feed it data incrementally
  • size=100: dimensionality of the vectors
  • alpha=0.025: initial learning rate
  • window=5: size of the context window
  • min_count=5: words occurring fewer than min_count times are ignored
  • max_vocab_size: cap on the vocabulary size; if there are too many words, the rarest ones are dropped; unlimited by default
  • sample=0.001: subsampling threshold; overly frequent words are randomly dropped, which both widens the effective context window and reduces the influence of stopwords
  • seed=1: seed for the random number generator
  • workers=3: number of cores to use for training on a multi-core machine
  • min_alpha=0.0001: floor to which the learning rate decays
  • sg=0: 0 uses CBOW, 1 uses skip-gram
  • hs=0: 1 uses hierarchical softmax, 0 uses negative sampling
  • negative=5: number of negative samples used during training
  • cbow_mean=1: with CBOW, 0 uses the sum of the context vectors as the hidden layer, 1 uses their mean
  • hashfxn=<built-in hash function>: hash function used to randomly initialize the weights
  • iter=5: number of passes over the whole corpus
  • trim_rule: None means words below min_count are dropped; alternatively, a function(word, count, min_count) returning one of util.RULE_DISCARD, util.RULE_KEEP, or util.RULE_DEFAULT. This parameter affects how the vocabulary is built
  • sorted_vocab=1: 1 sorts the vocabulary by frequency before assigning word indexes
  • batch_words=10000: number of words per batch passed to the workers
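
A minimal sketch on a made-up two-sentence corpus with the main knobs spelled out; everything except size, sg, and min_count (lowered so the toy words survive) simply restates the defaults listed above.

toy_sent = [['patent', 'describes', 'an', 'energy', 'storage', 'device'],
            ['the', 'device', 'stores', 'energy', 'in', 'a', 'battery']]
w2v_demo = models.Word2Vec(toy_sent,
                           size=200,          # vector dimensionality
                           sg=1,              # 1 = skip-gram, 0 = CBOW
                           window=5,          # context window
                           min_count=1,       # keep every word of this tiny corpus
                           sample=0.001,      # subsampling threshold for frequent words
                           hs=0, negative=5,  # negative sampling with 5 noise words
                           workers=3,         # training threads
                           iter=5,            # passes over the corpus
                           seed=1)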

Training procedure

First create an empty model:
model_w2v = models.Word2Vec(size=200, sg=1)
Pass in a list of word lists to build the vocabulary:
sent = [['first','sent'], ['second','sent']]
model_w2v.build_vocab(sent)
Pass in a list of word lists to update (train) the model:
model_w2v.train(sent)


In [179]:
all_text = [doc.split() for doc in documents]

In [180]:
model_w2v = models.Word2Vec(size=200, sg=1)

In [181]:
%timeit model_w2v.build_vocab(all_text)


1 loops, best of 3: 5.2 s per loop

In [183]:
%timeit model_w2v.train(all_text)


1 loops, best of 3: 2min 45s per loop

In [189]:
model_w2v.most_similar_cosmul(['deep','learning'])


Out[189]:
[(u'specialized', 0.44409653544425964),
 (u'adaboost', 0.42892950773239136),
 (u'extreme', 0.4117093086242676),
 (u'general-domain', 0.40930068492889404),
 (u'bayesian', 0.40740323066711426),
 (u'bischof', 0.40500378608703613),
 (u'pets', 0.404163658618927),
 (u'ensemble', 0.3979285955429077),
 (u'classifiers', 0.39685603976249695),
 (u'davis', 0.3963455259799957)]

Build Doc2Vec Model

Doc2Vec parameters (see the sketch after this list)

  • documents=None: the documents used for training, either a list of TaggedDocument or a TaggedDocument generator
  • size=300: dimensionality of the vectors
  • alpha=0.025: initial learning rate
  • window=8: size of the context window
  • min_count=5: words occurring fewer than min_count times are ignored
  • max_vocab_size=None: cap on the vocabulary size; if there are too many words, the rarest ones are dropped; unlimited by default
  • sample=0: subsampling threshold; overly frequent words are randomly dropped, which both widens the effective context window and reduces the influence of stopwords
  • seed=1: seed for the random number generator
  • workers=1: number of cores to use for training on a multi-core machine
  • min_alpha=0.0001: floor to which the learning rate decays
  • hs=1: 1 uses hierarchical softmax, 0 uses negative sampling
  • negative=0: number of negative samples used during training
  • dbow_words=0: 1 trains word vectors (skip-gram) alongside the doc vectors (DBOW); 0 trains doc vectors only
  • dm=1: 1 trains with distributed memory (PV-DM), 0 with distributed bag-of-words (PV-DBOW)
  • dm_concat=0: 1 concatenates the context vectors instead of summing/averaging them; 0 sums/averages. Concatenation produces a much larger model and a longer input vector
  • dm_mean=0: when using PV-DM with dm_concat=0, 0 uses the sum of the context vectors as the hidden layer, 1 uses their mean
  • dm_tag_count=1: expected number of document tags per document when dm_concat=1
  • trim_rule=None: None means words below min_count are dropped; alternatively, a function(word, count, min_count) returning one of util.RULE_DISCARD, util.RULE_KEEP, or util.RULE_DEFAULT. This parameter affects how the vocabulary is built
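
A minimal sketch with two made-up patents, showing TaggedDocument built by hand and the same knobs used in the cells below (PV-DM with negative sampling); the tags play the role of patent numbers, as in PatentDocGenerator.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
toy_docs = [TaggedDocument(['energy', 'storage', 'device'], ['20130000001']),
            TaggedDocument(['battery', 'charging', 'circuit'], ['20130000002'])]
d2v_demo = Doc2Vec(toy_docs,
                   size=200,          # vector dimensionality
                   window=8,          # context window
                   min_count=1,       # keep every word of this tiny corpus
                   sample=1e-5,       # subsampling threshold, as in the cells below
                   dm=1,              # PV-DM; 0 would switch to PV-DBOW
                   hs=0, negative=5)  # negative sampling instead of hierarchical softmax
print d2v_demo.docvecs.most_similar(['20130000001'])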

In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [12]:
class PatentDocGenerator(object):
    def __init__(self, filename):
        self.filename = filename
        
    def __iter__(self):
        f = codecs.open(self.filename, 'r', 'UTF-8')
        for line in f:
            text, appnum = docs_out(line)
            yield TaggedDocument(text.split(), appnum.split())

In [28]:
doc = PatentDocGenerator('/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized')
%timeit model_d2v = Doc2Vec(doc, size=200, window=8, sample=1e-5, hs=0, negative=5)


1 loops, best of 3: 38.7 s per loop

In [29]:
doc = PatentDocGenerator('/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized')
model_d2v = Doc2Vec(doc, size=200, window=8, sample=1e-5, hs=0, negative=5)

In [30]:
model_d2v.docvecs.most_similar(['20140187118'])


Out[30]:
[(u'US00D689829', 0.4423035681247711),
 (u'US00D690799', 0.43438467383384705),
 (u'20140182392', 0.42934495210647583),
 (u'20150022662', 0.4291260838508606),
 (u'20140125536', 0.4286767244338989),
 (u'20130270568', 0.4284389019012451),
 (u'US008650834', 0.428107351064682),
 (u'20130247928', 0.4270566999912262),
 (u'20140183325', 0.4267898201942444),
 (u'20140016249', 0.4265807569026947)]

In [31]:
m = Doc2Vec(size=200, window=8, sample=1e-5, hs=0, negative=5)

In [32]:
m.build_vocab(doc)

In [33]:
m.train(doc)


Out[33]:
2257832

In [34]:
m.docvecs.most_similar(['20140187118'])


Out[34]:
[(u'US00D689829', 0.4423035681247711),
 (u'US00D690799', 0.43438467383384705),
 (u'20140182392', 0.42934495210647583),
 (u'20150022662', 0.4291260838508606),
 (u'20140125536', 0.4286767244338989),
 (u'20130270568', 0.4284389019012451),
 (u'US008650834', 0.428107351064682),
 (u'20130247928', 0.4270566999912262),
 (u'20140183325', 0.4267898201942444),
 (u'20140016249', 0.4265807569026947)]

Build Doc2Vec Model from 2013 USPTO Patents


In [25]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [26]:
class PatentDocGenerator(object):
    def __init__(self, filename):
        self.filename = filename
        
    def __iter__(self):
        f = codecs.open(self.filename, 'r', 'UTF-8')
        for line in f:
            text, appnum = docs_out(line)
            yield TaggedDocument(text.split(), appnum.split())

In [27]:
model_d2v = Doc2Vec(size=200, window=8, sample=1e-5, hs=0, negative=5)
root = '/share/USPatentData/tokenized_appDate_2013/'

In [57]:
for fn in sorted(listdir(root)):
    doc = PatentDocGenerator(os.path.join(root, fn))
    start = dt.now()
    model_d2v.build_vocab(doc)
    model_d2v.train(doc)
    logging.info('{} training time: {}'.format(fn, str(dt.now() - start)))

In [58]:
model_d2v.save("doc2vec_uspto_2013.model")

In [ ]: