In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from os import listdir
from datetime import datetime as dt
from collections import defaultdict
import numpy as np
import codecs
import sys, os, json, re
import pickle
# Python 2 only (`reload` builtin and `sys.setdefaultencoding`).
# The three standard streams are saved before `reload(sys)` and put back right
# after it, so the stream objects that were installed before the reload stay in
# use; then UTF-8 is made the interpreter's default string encoding.
# NOTE(review): presumably the save/restore is needed because reloading `sys`
# replaces the notebook kernel's streams -- confirm before changing the order.
stdin, stdout, stderr = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr
sys.setdefaultencoding('utf-8')

將2013 patents的IPC, PatentNumber讀進來


In [2]:
# Directory that holds the tokenized patent files.
root = '/share/USPatentData/tokenized_appDate_2013/'

# File names inside `root`, in sorted order so the processing order is stable.
patentFiles = listdir(root)
patentFiles.sort()

# Preview the full path of the first file.
os.path.join(root, patentFiles[0])


Out[2]:
'/share/USPatentData/tokenized_appDate_2013/2013USPTOPatents_by_skip_1.txt.tokenized'

In [ ]:
# Build the mapping  mainIPC -> list of patentNumber  from every file in
# `patentFiles`. Each line of a file is parsed as one JSON object; records
# lacking 'mainIPC' are grouped under the key None, and a missing
# 'patentNumber' is appended as None (that is what dict.get returns).
ipc = defaultdict(list)
for fn in patentFiles:
    # `with` closes the handle after each file (also on a parse error), so the
    # loop no longer leaks one open file descriptor per input file.
    with codecs.open(os.path.join(root, fn), 'r', 'UTF-8') as f:
        print(fn)  # progress indicator: name of the file being read
        for line in f:
            record = json.loads(line)
            ipc[record.get('mainIPC')].append(record.get('patentNumber'))

In [4]:
# Write the loaded IPC mapping to a file.
# The file is opened in a `with` block so it is flushed and closed as soon as
# the dump finishes, instead of relying on garbage collection of the handle.
with open('ipc.pickle', 'wb') as fout:
    pickle.dump(ipc, fout)

In [3]:
# Next time the IPC mapping is needed, it can be read back from the file.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file that
# you produced yourself.
# The `with` block closes the file handle once loading is done.
with open('ipc.pickle', 'rb') as fin:
    ipc = pickle.load(fin)

對每個IPC計算最大、最小、平均similarity


In [5]:
# Load a previously saved Doc2Vec model from disk.
# NOTE(review): the file name suggests dim=200, window=8, hs=0, negative=5 and
# USPTO 2013 training data -- not verified here.
model = Doc2Vec.load('./doc2vec_uspto_2013_dim200_win8_hs0_neg5.model')

In [6]:
ipc.keys()[:10]  # each key in ipc stands for one IPC code


Out[6]:
[u'C09K 8/38',
 u'A61D 19/02',
 u'A61D 19/04',
 u'A45D0040260000',
 u'C09K 8/32',
 u'C09K 8/34',
 u'C09K 8/36',
 u'H01M0010480000',
 u'F04F 5/54',
 u'B66C 7/10']

In [12]:
ipc[u'A61D 19/02']  # each item in ipc is a list of patentNumber


Out[12]:
[u'20130331693',
 u'20150112124',
 u'20130345497',
 u'20150148666',
 u'20140024967',
 u'20140039431',
 u'20140142525',
 u'20140039246',
 u'20140046126',
 u'20140142421',
 u'20150150666']

In [9]:
def distProperty(docs):
    """Summarize the pairwise similarities among a group of documents.

    Every unordered pair of entries in `docs` is scored with
    `model.docvecs.similarity` (using the module-level `model`).

    Parameters
    ----------
    docs : sequence
        Document tags accepted by `model.docvecs.similarity`
        (e.g. one of the patent-number lists stored in `ipc`).

    Returns
    -------
    tuple
        `(max, average, min)` of the pairwise similarities. When `docs` has
        fewer than two entries there is no pair to compare, and
        `(nan, nan, nan)` is returned instead of letting `np.max` raise
        ValueError on an empty array.
    """
    n = len(docs)
    if n < 2:
        # No pairs exist; reductions over an empty array would fail.
        return (np.nan, np.nan, np.nan)
    # Visit each pair (i, j) with j < i exactly once.
    dist = np.array([model.docvecs.similarity(docs[i], docs[j])
                     for i in range(n) for j in range(i)])
    return (np.max(dist), np.average(dist), np.min(dist))

In [11]:
# (max, average, min) pairwise similarity among the patents of one IPC
distProperty(ipc[u'A61D 19/02'])


Out[11]:
(0.14294106350336475, -0.012224296612939764, -0.2030735918859212)

In [ ]: