In [1]:
# Product categories handled by the classifier (roughly: appliances/digital,
# food/health, cosmetics/beauty). A category's position in this list is what
# the notebook later uses as its integer class label.
cate_list = [
    '가전·디지털',
    '식품·건강',
    '화장품·이미용',
]
In [2]:
import pandas as pd
In [3]:
# Load the labelled product table; the 'name' column becomes the row index,
# so product titles are read from df.index further down.
df = pd.read_csv(
    'clf.csv',
    index_col='name',
)
In [4]:
# Peek at two sample rows (positions 1 and 2) belonging to the first category.
in_category = df['cate1'] == cate_list[0]
df.loc[in_category].iloc[1:3]
Out[4]:
In [5]:
# Peek at two sample rows (positions 1 and 2) belonging to the second category.
in_category = df['cate1'] == cate_list[1]
df.loc[in_category].iloc[1:3]
Out[5]:
In [6]:
# Peek at two sample rows (positions 1 and 2) belonging to the third category.
in_category = df['cate1'] == cate_list[2]
df.loc[in_category].iloc[1:3]
Out[6]:
In [7]:
from sklearn.feature_extraction.text import CountVectorizer
In [8]:
# Character-level bag-of-words: the tokenizer splits each title into its
# individual characters (spaces included), so no word segmentation is needed.
# The builtin `list` is passed directly instead of a lambda wrapper; it yields
# the same tokens and keeps the vectorizer picklable.
vectorizer = CountVectorizer(min_df=1, tokenizer=list)
In [9]:
# Build the list of product titles (the DataFrame index) as unicode text.
# Only byte strings are decoded: calling .decode() on text that is already
# unicode raises (implicit ASCII encode on Python 2, AttributeError on
# Python 3), so such values are passed through untouched.
corpus = [
    name.decode('utf-8') if isinstance(name, bytes) else name
    for name in df.index
]
# Show the first title as a quick sanity check of the decoding.
print(corpus[0])
In [10]:
# Grab the vectorizer's analyzer callable so its tokenisation can be
# inspected on a sample string in the next cell.
analyze = vectorizer.build_analyzer()
In [11]:
# Run the analyzer on one sample title to see which tokens it produces.
sample_title = u'삼육 두유'
analyze(sample_title)
Out[11]:
In [12]:
# Learn the vocabulary from the titles and turn them into the
# document-term count matrix used as classifier input (one row per title).
X = vectorizer.fit_transform(corpus)
In [13]:
# List the known category names, one per line.
# (The loop body must be indented for the cell to parse.)
for el in cate_list:
    print(el)
In [14]:
# Show the category label of the first three rows, one per line, to compare
# their spelling against the entries of cate_list.
# (The loop body must be indented for the cell to parse.)
for el in df.cate1[:3].tolist():
    print(el)
In [15]:
# Encode every row's category as its position in cate_list (the class label).
# Wrapping in list() guarantees a real list on every Python version: on
# Python 3 a bare map() is a lazy iterator, which would break the len(Y) and
# Y[...] slicing done in later cells. A category missing from cate_list still
# raises ValueError, as before.
Y = list(map(cate_list.index, df.cate1.tolist()))
In [16]:
# Sanity check: the feature matrix and the label list should have the same
# number of rows. Prints both counts separated by a space.
n_samples = X.shape[0]
n_labels = len(Y)
print('%d %d' % (n_samples, n_labels))
In [17]:
from sklearn.svm import LinearSVC
In [18]:
# Linear SVM classifier with the regularisation strength C = 1.0.
# random_state pins the solver's internal data shuffling so that re-running
# the notebook reproduces the same fitted model and accuracy figure.
clf = LinearSVC(C=1.0, random_state=0)
In [19]:
# Train on everything except the last 200 rows, which are held out for
# evaluation.
# NOTE(review): the split is purely positional with no shuffling; if clf.csv
# is ordered by category the held-out rows may not be representative — confirm.
train_rows = slice(None, -200)
clf.fit(X[train_rows], Y[train_rows])
Out[19]:
In [20]:
from sklearn.metrics import accuracy_score
In [21]:
# Predict labels for the last 200 rows (the ones excluded from training)
# and report the accuracy on them as a percentage.
test_rows = slice(-200, None)
P = clf.predict(X[test_rows])
held_out_accuracy = accuracy_score(Y[test_rows], P)
print(held_out_accuracy * 100)
In [22]:
# Classify one unseen product title and print the predicted category name.
query_features = vectorizer.transform([u'자동 물걸레 청소기'])
predicted_label = clf.predict(query_features)[0]
print(cate_list[predicted_label])
In [23]:
# Classify one unseen product title and print the predicted category name.
query_features = vectorizer.transform([u'에센스 커버 팩트 리미티드 패키지'])
predicted_label = clf.predict(query_features)[0]
print(cate_list[predicted_label])
In [24]:
# Classify one unseen product title and print the predicted category name.
query_features = vectorizer.transform([u'야생 블루베리 무려 10박스'])
predicted_label = clf.predict(query_features)[0]
print(cate_list[predicted_label])