In [1]:
import pickle

import pandas as pd
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only provides the old module path
    from sklearn.cross_validation import train_test_split
In [2]:
# Load the pickled article data that the rest of the notebook trains on.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open("./data/article_2016-06-01.plk", 'rb') as article_file:
    article_df = pickle.load(article_file)

# Quick sanity check: number of rows and the available columns.
len(article_df), article_df.columns
Out[2]:
In [3]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    article_df["content"],
    article_df["category"],
    test_size=0.1,
    random_state=1,
)

# Show the size of every part of the split.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))
Out[3]:
In [4]:
# Single KoNLPy Twitter tagger instance, reused by every tokenize_pos call.
pos_tagger = Twitter()


def tokenize_pos(doc):
    """Tokenize `doc` with `pos_tagger` and return one "/"-joined string per item.

    `pos_tagger.pos` is called with norm=True and stem=True; every item it
    returns is joined with "/" into a single string.
    NOTE(review): presumably each item is a (word, POS tag) pair — confirm
    against the KoNLPy documentation.
    """
    tagged_items = pos_tagger.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged_items))
In [5]:
# Text-classification pipeline: TF-IDF features fed into a multinomial naive Bayes model.
# A previously tried vectorizer configuration (custom `tokenize_pos` tokenizer, a
# `stop_words` list that is not defined in the cells shown here, and
# ngram_range=(1, 2)) is currently disabled in favour of the default TfidfVectorizer.
tfidf_vectorizer = TfidfVectorizer()
naive_bayes = MultinomialNB(alpha=0.01)
clf = Pipeline(steps=[('vect', tfidf_vectorizer), ('clf', naive_bayes)])
In [6]:
# Fit the pipeline on the training split; %time reports how long the fit takes.
# The value returned by fit() is bound to `model`, which later cells use for prediction.
%time model = clf.fit(X_train,y_train)
In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
%time y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
In [8]:
# Replace X_test's index with a fresh 0..n-1 range (drop=True discards the old index)
# so its rows can be looked up with the integer labels 0, 1, 2, ...
# Note: y_test is not re-indexed here, so presumably the two no longer share index labels.
X_test = X_test.reset_index(drop=True)
In [9]:
# Maps each integer category code to its Korean display name.
# The position in the list is the category code.
classification_dict = dict(enumerate([
    "정치",       # 0: politics
    "경제",       # 1: economy
    "사회",       # 2: society
    "생활/문화",  # 3: life/culture
    "세계",       # 4: world
    "IT/과학",    # 5: IT/science
]))
# Classify the first three held-out articles and print each predicted category name
# next to the first 80 characters of the article text.
# .iloc selects by position, so this works whatever index labels X_test carries.
sample_texts = X_test.iloc[:3].tolist()
result = model.predict(sample_texts)
for category, text in zip(result, sample_texts):
    print(classification_dict[category], text[:80])
In [10]:
# Persist the fitted pipeline; the with-block closes the file even if pickling fails.
# (pickle is already imported in the notebook's first cell.)
with open("./models/classification_model.plk", "wb") as model_file:
    pickle.dump(model, model_file)
In [11]:
# Reload the pickled pipeline from disk so the saved file can be checked.
# NOTE(review): pickle.load can execute arbitrary code; only load model files you trust.
with open("./models/classification_model.plk", "rb") as model_file:
    load_model = pickle.load(model_file)
In [12]:
# Two hand-written headlines used to spot-check the reloaded model.
# Roughly: "Naver and Kakao focus on advancing drone technology".
test_str1 = "네이버와 카카오 드론 기술 발전에 주력"
# Roughly: "Exchange rates and stock prices are unpredictable these days".
test_str2 = "요즘에 환율과 주가가 예측 불허"

# Predict each headline on its own and print the matching category name.
for sample_text in (test_str1, test_str2):
    predicted_code = load_model.predict([sample_text])[0]
    print(classification_dict[predicted_code])
In [ ]: