In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from os import listdir
from datetime import datetime as dt
from collections import defaultdict
import numpy as np
import codecs
import sys, os, json, re
import pickle
# Python 2-only workaround to make 'utf-8' the interpreter's default string
# encoding. `sys.setdefaultencoding` is only reachable after `reload(sys)`.
# Keep references to the current standard streams before reloading ...
stdin, stdout, stderr = sys.stdin, sys.stdout, sys.stderr
reload(sys)
# ... and put them back afterwards.
# NOTE(review): presumably this is needed because reload(sys) resets the
# streams and the notebook would otherwise lose its cell output - confirm.
sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr
sys.setdefaultencoding('utf-8')
In [2]:
# Directory that is scanned for the patent files.
root = '/share/USPatentData/tokenized_appDate_2013/'

# List the directory once, then put the file names in sorted order in place.
patentFiles = listdir(root)
patentFiles.sort()

# Display the full path of the first file as a quick sanity check.
os.path.join(root, patentFiles[0])
Out[2]:
In [ ]:
# Build the mapping: main IPC code -> list of patent numbers.
# Every line of every file under `root` is parsed as one JSON object; records
# without a 'mainIPC' field end up grouped under the key None.
ipc = defaultdict(list)
for fn in patentFiles:
    # The `with` block closes each file as soon as it has been read, so file
    # handles do not pile up while walking the whole directory.
    with codecs.open(os.path.join(root, fn), 'r', 'UTF-8') as f:
        print(fn)  # progress indicator: name of the file being parsed
        for line in f:
            j = json.loads(line)
            ipc[j.get('mainIPC')].append(j.get('patentNumber'))
In [4]:
# Write the IPC mapping that was just read to a file.
# The `with` block guarantees the file is flushed and closed once the dump
# finishes, instead of leaving an anonymous open handle behind.
with open('ipc.pickle', 'wb') as ipc_pickle_file:
    pickle.dump(ipc, ipc_pickle_file)
In [3]:
# Next time the mapping is needed, it can be read back from the pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load an
# 'ipc.pickle' that this notebook (or another trusted source) produced.
# The `with` block closes the file handle once loading is done.
with open('ipc.pickle', 'rb') as ipc_pickle_file:
    ipc = pickle.load(ipc_pickle_file)
In [5]:
# Load a previously saved gensim Doc2Vec model from the working directory.
# NOTE(review): the file name suggests USPTO 2013 data, 200-dimensional
# vectors, window 8, hs=0 and negative=5 - confirm against the training run.
model = Doc2Vec.load('./doc2vec_uspto_2013_dim200_win8_hs0_neg5.model')
In [6]:
# Each key of `ipc` stands for one IPC code; show the first ten of them.
list(ipc.keys())[:10]
Out[6]:
In [12]:
ipc[u'A61D 19/02'] # each item of ipc is a list of patentNumber values
Out[12]:
In [9]:
def distProperty(docs, doc2vec_model=None):
    """Summarise the pairwise similarities within a group of documents.

    Despite the name, the values come from ``docvecs.similarity``, so they
    are similarities rather than distances.

    Parameters
    ----------
    docs : sequence
        Document tags, each passed as-is to ``docvecs.similarity``.
    doc2vec_model : object, optional
        Any object exposing ``docvecs.similarity(a, b)``. When omitted, the
        notebook-level ``model`` is used, as before.

    Returns
    -------
    tuple
        ``(max, average, min)`` over every unordered pair of ``docs``.
        With fewer than two documents there is no pair to compare, so
        ``(nan, nan, nan)`` is returned instead of failing inside ``np.max``
        on an empty array.
    """
    if doc2vec_model is None:
        # Backward-compatible default: the model loaded earlier in the notebook.
        doc2vec_model = model

    n = len(docs)
    if n < 2:
        return (np.nan, np.nan, np.nan)

    # Look the bound method up once instead of once per pair.
    similarity = doc2vec_model.docvecs.similarity
    # Visit each unordered pair exactly once (j < i), skipping self-pairs.
    dist = np.array([similarity(docs[i], docs[j])
                     for i in range(n) for j in range(i)])
    return (np.max(dist), np.average(dist), np.min(dist))
In [11]:
# Example: (max, average, min) pairwise similarity for the patents filed
# under IPC 'A61D 19/02'.
distProperty(ipc[u'A61D 19/02'])
Out[11]:
In [ ]: