notebook.community

Edit and run



In [1]:

    
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np



In [56]:

    
# Location of the two claim dumps and the number of data rows taken per split.
DATA_DIR = '/Volumes/XIAONAN PAS/Capstone/'
N_ROWS = 30000


def _read_claims(filename, n_skip, n_rows=N_ROWS):
    """Read a contiguous slice of a claims CSV and drop rows without claim text.

    Parameters
    ----------
    filename : str
        CSV file name inside ``DATA_DIR``.
    n_skip : int
        Number of leading *data* rows to skip (the header line is always kept).
    n_rows : int
        Number of data rows to read after the skipped ones.

    Returns
    -------
    pandas.DataFrame
        The requested rows with every row whose ``claim_txt`` is missing removed.
    """
    # File line 0 is the header, so data rows start at line 1.  Skipping
    # lines 1..n_skip (inclusive) leaves the header in place and starts the
    # read at data row n_skip + 1.
    rows_to_skip = range(1, n_skip + 1) if n_skip else None
    frame = pd.read_csv(
        DATA_DIR + filename,
        delimiter=',',
        nrows=n_rows,
        skiprows=rows_to_skip,
        encoding='utf-8',
    )
    return frame.dropna(subset=['claim_txt'])


# Training split: the first N_ROWS data rows of each file.
pinitial_train = _read_claims('pgpub_claims_fulltext.csv', n_skip=0)
pfinal_train = _read_claims('patent_claims_fulltext.csv', n_skip=0)

# Test split: the next N_ROWS data rows.  The original skipped only
# N_ROWS - 1 rows (range(1, 30000)), so the last training row of each file
# was also the first test row; skipping exactly N_ROWS removes that overlap.
pinitial_test = _read_claims('pgpub_claims_fulltext.csv', n_skip=N_ROWS)
pfinal_test = _read_claims('patent_claims_fulltext.csv', n_skip=N_ROWS)



In [57]:

    
# Features: claim texts from the pinitial_* frame first, then the pfinal_* frame.
# Labels follow the same order: 0.0 for pinitial_* rows, 1.0 for pfinal_* rows.
x_train = pinitial_train['claim_txt'].tolist()
x_train.extend(pfinal_train['claim_txt'].tolist())
y_train = np.concatenate((np.zeros(len(pinitial_train)),
                          np.ones(len(pfinal_train))))

x_test = pinitial_test['claim_txt'].tolist()
x_test.extend(pfinal_test['claim_txt'].tolist())
y_test = np.concatenate((np.zeros(len(pinitial_test)),
                         np.ones(len(pfinal_test))))



In [58]:

    
# Per-source lists of training claim texts (pinitial_* first, pfinal_* second).
# NOTE(review): neither list is referenced by the other visible cells.
x_train_1, x_train_2 = (
    source['claim_txt'].tolist() for source in (pinitial_train, pfinal_train)
)



In [59]:

    
# Preview the first five rows of the pinitial training frame.
pinitial_train.head(n=5)









    Out[59]:






  
    
      
      pub_no
      appl_id
      claim_no
      claim_txt
      dependencies
      ind_flg
    
  
  
    
      0
      20010000001
      9725796
      11
      The method of claim 10 comprising up to about ...
      10
      0
    
    
      1
      20010000001
      9725796
      16
      The method of claim 15 wherein said sec-butano...
      15
      0
    
    
      2
      20010000001
      9725796
      4
      The solvent mixture of claim 3 wherein said te...
      3
      0
    
    
      3
      20010000001
      9725796
      9
      A method for cleaning an article in a vapor de...
      NaN
      1
    
    
      4
      20010000001
      9725796
      8
      The solvent mixture of claim 7 wherein said se...
      7
      0



In [60]:

    
# Preview the first five rows of the pfinal training frame.
pfinal_train.head(n=5)









    Out[60]:






  
    
      
      pat_no
      claim_no
      claim_txt
      dependencies
      ind_flg
      appl_id
    
  
  
    
      0
      3930271
      1
      1. A golf glove comprising at least an index f...
      NaN
      1
      NaN
    
    
      1
      3930271
      4
      4. A golf glove adapted for use on one hand of...
      NaN
      1
      NaN
    
    
      2
      3930271
      3
      3. A glove comprising an index finger receptac...
      NaN
      1
      NaN
    
    
      3
      3930271
      2
      2. A golf glove in accordance with claim 1 whe...
      1
      0
      NaN
    
    
      4
      3930272
      1
      1. In combination with a height adjustable cri...
      NaN
      1
      NaN



In [61]:

    
# Show the number of training texts next to the number of training labels
# so the two counts can be compared by eye.
print(len(x_train), len(y_train))









    



(59992, 59992)



In [62]:

    
from sklearn.pipeline import Pipeline

# Text classification pipeline:
#   'vect'  - unigram + bigram token counts
#   'tfidf' - TF-IDF re-weighting of those counts
#   'clf'   - linear SVM (fixed random_state) wrapped in a one-vs-rest classifier
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(random_state=0))),
])



In [68]:

    
# Train every pipeline stage on the training texts and labels; keep the
# value returned by fit() under the name `clf` for the scoring cells.
clf = text_clf.fit(X=x_train, y=y_train)



In [66]:

    
# Score the fitted pipeline on the same data it was trained on.
clf.score(X=x_train, y=y_train)









    Out[66]:





0.99968329110548071



In [71]:

    
# Score the fitted pipeline on the test texts and labels.
clf.score(X=x_test, y=y_test)









    Out[71]:





0.85363698504526431



In [ ]:

	pub_no	appl_id	claim_no	claim_txt	dependencies	ind_flg
0	20010000001	9725796	11	The method of claim 10 comprising up to about ...	10	0
1	20010000001	9725796	16	The method of claim 15 wherein said sec-butano...	15	0
2	20010000001	9725796	4	The solvent mixture of claim 3 wherein said te...	3	0
3	20010000001	9725796	9	A method for cleaning an article in a vapor de...	NaN	1
4	20010000001	9725796	8	The solvent mixture of claim 7 wherein said se...	7	0

	pat_no	claim_no	claim_txt	dependencies	ind_flg	appl_id
0	3930271	1	1. A golf glove comprising at least an index f...	NaN	1	NaN
1	3930271	4	4. A golf glove adapted for use on one hand of...	NaN	1	NaN
2	3930271	3	3. A glove comprising an index finger receptac...	NaN	1	NaN
3	3930271	2	2. A golf glove in accordance with claim 1 whe...	1	0	NaN
4	3930272	1	1. In combination with a height adjustable cri...	NaN	1	NaN