In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
In [56]:
# Source CSVs: pre-grant publication claims and granted-patent claims.
PGPUB_CLAIMS_CSV = '/Volumes/XIAONAN PAS/Capstone/pgpub_claims_fulltext.csv'
PATENT_CLAIMS_CSV = '/Volumes/XIAONAN PAS/Capstone/patent_claims_fulltext.csv'
# Number of data rows read for each split (train and test) from each file.
ROWS_PER_SPLIT = 30000


def load_claims(csv_path, n_rows=ROWS_PER_SPLIT, offset=0):
    """Read ``n_rows`` data rows of a claims CSV, starting after ``offset`` data rows.

    Parameters
    ----------
    csv_path : str
        Path to a comma-delimited, UTF-8 CSV that has a ``claim_txt`` column.
    n_rows : int
        Maximum number of data rows to read.
    offset : int
        Number of leading data rows to skip. The header (line 0) is always kept.

    Returns
    -------
    pandas.DataFrame
        The requested rows, with rows whose ``claim_txt`` is missing removed.
    """
    # Line 0 is the header, so data rows 1..offset are the ones to skip.
    rows_to_skip = range(1, offset + 1) if offset else None
    frame = pd.read_csv(
        csv_path,
        delimiter=',',
        nrows=n_rows,
        skiprows=rows_to_skip,
        encoding='utf-8',
    )
    return frame.dropna(subset=['claim_txt'])


# Train split: the first ROWS_PER_SPLIT data rows of each file.
pinitial_train = load_claims(PGPUB_CLAIMS_CSV)
pfinal_train = load_claims(PATENT_CLAIMS_CSV)

# Test split: the next ROWS_PER_SPLIT data rows. The offset equals the full
# train size so the two splits are disjoint (the previous
# skiprows=range(1,30000) skipped only 29,999 rows, so the last training row
# was re-read as the first test row).
pinitial_test = load_claims(PGPUB_CLAIMS_CSV, offset=ROWS_PER_SPLIT)
pfinal_test = load_claims(PATENT_CLAIMS_CSV, offset=ROWS_PER_SPLIT)
In [57]:
# Assemble the classifier inputs: publication (pgpub) claims come first with
# label 0.0, followed by granted-patent claims with label 1.0.
x_train = [*pinitial_train['claim_txt'].tolist(), *pfinal_train['claim_txt'].tolist()]
y_train = np.concatenate([np.zeros(len(pinitial_train)), np.ones(len(pfinal_train))])

# Same layout for the held-out split.
x_test = [*pinitial_test['claim_txt'].tolist(), *pfinal_test['claim_txt'].tolist()]
y_test = np.concatenate([np.zeros(len(pinitial_test)), np.ones(len(pfinal_test))])
In [58]:
# Per-source training texts kept as separate lists:
# x_train_1 -> publication claims, x_train_2 -> granted-patent claims.
x_train_1, x_train_2 = (
    frame['claim_txt'].tolist() for frame in (pinitial_train, pfinal_train)
)
In [59]:
# Preview the first five publication-claim training rows.
pinitial_train.head(n=5)
Out[59]:
In [60]:
# Preview the first five granted-patent-claim training rows.
pfinal_train.head(n=5)
Out[60]:
In [61]:
# Sanity check: the number of training texts and labels should be equal.
print(*(len(seq) for seq in (x_train, y_train)))
In [62]:
from sklearn.pipeline import Pipeline

# Text-classification pipeline:
#   vect  - unigram + bigram token counts
#   tfidf - TF-IDF re-weighting of those counts
#   clf   - linear SVM (fixed random_state) wrapped in a one-vs-rest classifier
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(random_state=0))),
])
In [68]:
# Train the whole pipeline on the training texts and labels.
# Pipeline.fit returns the pipeline itself, so `clf` is the same fitted
# object as `text_clf`.
text_clf.fit(x_train, y_train)
clf = text_clf
In [66]:
# Score the fitted pipeline on the data it was trained on
# (an optimistic reference point, not a generalisation estimate).
clf.score(X=x_train, y=y_train)
Out[66]:
In [71]:
# Score the fitted pipeline on the held-out test texts and labels.
clf.score(X=x_test, y=y_test)
Out[71]:
In [ ]: