In [117]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}
import scipy.stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from konlpy.utils import pprint
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
In [4]:
from sklearn.datasets import fetch_20newsgroups
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
In [20]:
# Download the full 20 Newsgroups corpus (training + test posts combined).
news = fetch_20newsgroups(subset="all")

# Reserve 10% of the documents as a held-out test set;
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.1, random_state=1)
print(len(X_train), len(y_train), len(X_test), len(y_test))
In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Token pattern shared by clf_3..clf_5: tokens of >= 3 characters built from
# [a-z0-9_-.] that contain at least one letter. Hoisted into one constant
# (it was duplicated verbatim in three pipelines).
TOKEN_PATTERN = r"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b"

# Baseline: raw term counts + logistic regression.
clf_0 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])
# Raw term counts + multinomial naive Bayes.
clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
# TF-IDF weighting instead of raw counts.
clf_2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
# Restrict tokenization to the custom token pattern.
clf_3 = Pipeline([
    ('vect', TfidfVectorizer(token_pattern=TOKEN_PATTERN)),
    ('clf', MultinomialNB()),
])
# Additionally remove English stop words.
clf_4 = Pipeline([
    ('vect', TfidfVectorizer(stop_words="english", token_pattern=TOKEN_PATTERN)),
    ('clf', MultinomialNB()),
])
# Same features, but a small alpha for lighter additive smoothing.
clf_5 = Pipeline([
    ('vect', TfidfVectorizer(stop_words="english", token_pattern=TOKEN_PATTERN)),
    ('clf', MultinomialNB(alpha=0.01)),
])
In [9]:
# Fit the logistic-regression baseline on a 1,000-document subsample,
# then predict labels for the first 1,000 test documents.
# (Pipeline.fit returns the fitted pipeline itself, so the rebinding is a no-op.)
clf_0 = clf_0.fit(X_train[:1000], y_train[:1000])
y_pred = clf_0.predict(X_test[:1000])
In [12]:
# Evaluate the held-out predictions: confusion matrix plus
# per-class precision/recall/F1 report.
for summary in (confusion_matrix(y_test[:1000], y_pred),
                classification_report(y_test[:1000], y_pred)):
    print(summary)
In [27]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score and KFold moved to sklearn.model_selection.
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem

# 5-fold cross-validation for each candidate pipeline;
# report the mean accuracy +/- standard error of the mean.
for i, clf in enumerate([clf_1, clf_2, clf_3, clf_4, clf_5], start=1):
    scores = cross_val_score(clf, X_test, y_test, cv=5)
    print("Model {0:d}: Mean score: {1:.3f} (+/-{2:.3f})".format(
        i, np.mean(scores), sem(scores)))
In [ ]:
In [115]:
import codecs
from konlpy.utils import pprint
def read_data(filename):
    """Load a UTF-8, tab-separated file as a list of rows.

    Each line becomes a list of column strings; the first row is
    assumed to be a header and is discarded.
    """
    with codecs.open(filename, encoding='utf-8', mode='r') as f:
        rows = [line.split('\t') for line in f.read().splitlines()]
    return rows[1:]  # drop the header row
# Load the Naver movie-ratings corpus (tab-separated: id, document, label).
train_data = read_data('./ratings_train.txt')
test_data = read_data('./ratings_test.txt')

# Transpose rows into per-column tuples. In Python 3, zip returns an
# iterator (Python 2 returned a list); unpacking materializes the columns.
columns = list(zip(*train_data))
t1, t2, t3 = columns[0], columns[1], columns[2]
X = t2                        # review texts
y = np.array(t3, dtype=int)   # labels: convert string digits to integers
In [118]:
# konlpy renamed the Twitter tagger to Okt (Open Korean Text) in 0.4.5,
# and Twitter was removed in later releases. Prefer Okt, but fall back
# to Twitter so this still runs on older konlpy installs.
try:
    from konlpy.tag import Okt as _KoreanTagger
except ImportError:
    from konlpy.tag import Twitter as _KoreanTagger
pos_tagger = _KoreanTagger()
In [123]:
def tokenize(doc):
    """POS-tag `doc` and return tokens formatted as 'surface/TAG'.

    Uses the module-level `pos_tagger` with normalization and stemming
    enabled so morphological variants collapse onto one token.
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]
# Tokenize the first 10,000 training reviews, keeping each token list
# paired with its sentiment label (row layout: id, document, label).
train_docs = [(tokenize(row[1]), row[2]) for row in train_data[:10000]]

# Flatten all documents into a single token stream for frequency analysis.
tokens = [token for doc, _label in train_docs for token in doc]

import nltk
text = nltk.Text(tokens, name='NMSC')

# NanumGothic renders Hangul correctly but may not be installed;
# fall back to the generic sans-serif family.
# mpl.rcParams["font.family"] = "NanumGothic"
mpl.rcParams["font.family"] = "sans-serif"

# Plot the 50 most frequent tokens.
plt.figure(figsize=(12, 10))
text.plot(50)
plt.show()
In [ ]: