In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np

In [56]:
# Load two consecutive, non-overlapping windows of rows from each claims CSV:
# the first N_ROWS data rows for training and the next N_ROWS for testing.
PGPUB_CSV = '/Volumes/XIAONAN PAS/Capstone/pgpub_claims_fulltext.csv'
PATENT_CSV = '/Volumes/XIAONAN PAS/Capstone/patent_claims_fulltext.csv'
N_ROWS = 30000  # rows read per window, before rows with missing text are dropped


def load_claims(csv_path, first_row, n_rows=N_ROWS):
    """Read a window of data rows from a claims CSV.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; its first line is used as the header.
    first_row : int
        Zero-based index of the first data row to read (0 is the row
        immediately after the header).
    n_rows : int
        Maximum number of data rows to read.

    Returns
    -------
    pandas.DataFrame
        The rows read, minus those whose ``claim_txt`` is missing.
    """
    # File line 0 is the header, so data row i sits on file line i + 1.
    # Skipping lines 1..first_row keeps the header and starts at data row
    # `first_row`. With first_row == 0 nothing is skipped.
    skipped = range(1, first_row + 1) if first_row else None
    frame = pd.read_csv(csv_path, delimiter=',', nrows=n_rows,
                        skiprows=skipped, encoding='utf-8')
    return frame.dropna(subset=['claim_txt'])


pinitial_train = load_claims(PGPUB_CSV, 0)
pfinal_train = load_claims(PATENT_CSV, 0)
# The test window starts right after the last training row. The earlier
# skiprows=range(1,30000) stopped one line short, leaving file line 30000 in
# both the training and the test window.
pinitial_test = load_claims(PGPUB_CSV, N_ROWS)
pfinal_test = load_claims(PATENT_CSV, N_ROWS)

In [57]:
# Training corpus: texts from pinitial_train first, then from pfinal_train.
# Labels follow the same order: 0.0 for each pinitial row, 1.0 for each pfinal row.
x_train = pinitial_train['claim_txt'].tolist()
x_train.extend(pfinal_train['claim_txt'].tolist())
y_train = np.concatenate([np.zeros(len(pinitial_train)),
                          np.ones(len(pfinal_train))])

# Test corpus, assembled the same way from the *_test frames.
x_test = pinitial_test['claim_txt'].tolist()
x_test.extend(pfinal_test['claim_txt'].tolist())
y_test = np.concatenate([np.zeros(len(pinitial_test)),
                         np.ones(len(pfinal_test))])

In [58]:
# Per-source lists of the training texts, kept separate rather than concatenated.
# A generator expression is used so the loop variable does not leak into the
# notebook namespace.
x_train_1, x_train_2 = (
    frame['claim_txt'].tolist() for frame in (pinitial_train, pfinal_train)
)

In [59]:
# Preview the first rows read from pgpub_claims_fulltext.csv.
pinitial_train.head()


Out[59]:
pub_no appl_id claim_no claim_txt dependencies ind_flg
0 20010000001 9725796 11 The method of claim 10 comprising up to about ... 10 0
1 20010000001 9725796 16 The method of claim 15 wherein said sec-butano... 15 0
2 20010000001 9725796 4 The solvent mixture of claim 3 wherein said te... 3 0
3 20010000001 9725796 9 A method for cleaning an article in a vapor de... NaN 1
4 20010000001 9725796 8 The solvent mixture of claim 7 wherein said se... 7 0

In [60]:
# Preview the first rows read from patent_claims_fulltext.csv.
# NOTE(review): in the rows displayed below, claim_txt begins with the claim
# number ("1. ...", "4. ..."), whereas the pgpub preview shows no such prefix.
# The classifier may be able to separate the two classes on that prefix
# alone -- confirm before trusting the reported scores.
pfinal_train.head()


Out[60]:
pat_no claim_no claim_txt dependencies ind_flg appl_id
0 3930271 1 1. A golf glove comprising at least an index f... NaN 1 NaN
1 3930271 4 4. A golf glove adapted for use on one hand of... NaN 1 NaN
2 3930271 3 3. A glove comprising an index finger receptac... NaN 1 NaN
3 3930271 2 2. A golf glove in accordance with claim 1 whe... 1 0 NaN
4 3930272 1 1. In combination with a height adjustable cri... NaN 1 NaN

In [61]:
# Sanity check before fitting: each text needs exactly one label, otherwise the
# labels would be misaligned with the texts they are meant to describe.
assert len(x_train) == len(y_train), "x_train and y_train differ in length"
print(len(x_train), len(y_train))


(59992, 59992)

In [62]:
from sklearn.pipeline import Pipeline

# Three-stage text classifier:
#   vect  - token counts over unigrams and bigrams
#   tfidf - TF-IDF re-weighting of those counts
#   clf   - LinearSVC (fixed random_state) wrapped in a one-vs-rest classifier
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(estimator=LinearSVC(random_state=0))),
])

In [68]:
# Train every pipeline stage on the training texts and labels.
# Pipeline.fit returns the pipeline itself, so `clf` is simply a second name
# for the now-fitted `text_clf`.
text_clf.fit(x_train, y_train)
clf = text_clf

In [66]:
# Score the fitted pipeline on the same texts it was trained on (for
# scikit-learn classifiers `score` is mean accuracy). This checks the fit; it
# says nothing about generalisation.
# NOTE(review): a commented-out predict call that used to sit here referenced
# `X_test`, a name this notebook never defines (the held-out texts are
# `x_test`). This cell's execution count (66) is also lower than the fit
# cell's (68) -- re-run in order to confirm the displayed value.
clf.score(x_train, y_train)


Out[66]:
0.99968329110548071

In [71]:
# Score the fitted pipeline on the held-out texts and labels (x_test / y_test).
clf.score(x_test, y_test)


Out[71]:
0.85363698504526431

In [ ]: