$ curl -O http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
$ tar xfz aclImdb_v1.tar.gz
$ pip install pyprind
In [18]:
# Added version check for scikit-learn 0.18 API changes
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
In [9]:
# Read in the data
import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos': 1, 'neg': 0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for label in ('pos', 'neg'):
        path = os.path.join('.', 'aclImdb', s, label)
        for file in os.listdir(path):
            with open(os.path.join(path, file), encoding='utf-8') as f:
                txt = f.read()
            df = df.append([[txt, labels[label]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']
In [10]:
import numpy as np
np.random.seed(0)
# Shuffle the rows
df = df.reindex(np.random.permutation(df.index))
# Save to CSV
df.to_csv('./movie_data.csv', index=False)
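A quick sanity check, added here (not in the original notebook), that the assembled and shuffled DataFrame really contains all 50,000 reviews with a balanced label distribution:
In [ ]:
# Expect 50,000 rows and 25,000 reviews per sentiment class
print(df.shape)
print(df['sentiment'].value_counts())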
In [2]:
import pandas as pd
df = pd.read_csv('./movie_data.csv')
df.head(3)
Out[2]:
In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
'The sun is shining',
'The weather is sweet',
'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)
In [4]:
# Print the contents of the vocabulary
print(count.vocabulary_)
# Print the feature vectors
print(bag.toarray())
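Column j of the bag-of-words array counts the term whose index in count.vocabulary_ is j. A small added helper makes that mapping explicit:
In [ ]:
# Terms ordered by their column index in the feature vectors above
inv_vocab = sorted(count.vocabulary_, key=count.vocabulary_.get)
print(inv_vocab)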
In [5]:
np.set_printoptions(precision=2)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())
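To see where these values come from, the tf-idf of the term 'is' in the third document can be recomputed by hand. The sketch below assumes scikit-learn's smoothed formula idf(t) = ln((1 + n) / (1 + df(t))) + 1, which matches smooth_idf=True above; the printed array then additionally divides each row by its L2 norm because norm='l2'.
In [ ]:
# Manual tf-idf of 'is' in the third document, before L2 normalization
n_docs = 3   # number of documents
df_is = 3    # 'is' appears in all three documents
tf_is = 3    # 'is' occurs three times in the third document
idf_is = np.log((1 + n_docs) / (1 + df_is)) + 1
print('tf-idf of "is" (unnormalized): %.2f' % (tf_is * idf_is))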
In [6]:
df.loc[0, 'review'][-50:]
Out[6]:
In [7]:
import re
def preprocessor(text):
text = re.sub('<[^>]*>', '', text)
emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
return text
In [8]:
preprocessor(df.loc[0, 'review'][-50:])
Out[8]:
In [10]:
preprocessor("</a>This :) is :( a test :-)!")
Out[10]:
In [11]:
df['review'] = df['review'].apply(preprocessor)
In [12]:
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')
Out[12]:
In [13]:
# Apply Porter stemming
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')
Out[13]:
In [14]:
# Download the stop-word corpus
import nltk
nltk.download('stopwords')
Out[14]:
In [15]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]
Out[15]:
In [16]:
# The first 25,000 rows form the training set, the remaining 25,000 the test set
# (.loc slicing is end-inclusive, so the training slice stops at index 24999)
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values
In [23]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
if Version(sklearn_version) < '0.18':
    from sklearn.grid_search import GridSearchCV
else:
    from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)
Out[23]:
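A note added here: on scikit-learn releases newer than those this notebook targets, the default LogisticRegression solver (lbfgs) rejects penalty='l1', so the grid above would need a solver that supports both penalties, for example:
In [ ]:
# Assumption for newer scikit-learn versions: liblinear handles both 'l1' and 'l2'
clf_liblinear = LogisticRegression(random_state=0, solver='liblinear')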
In [24]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
In [25]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))
In [36]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    # combined cleaning, emoticon handling, and stop-word removal for streaming
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized
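A quick added check of the streaming tokenizer on the earlier test sentence; HTML markup and stop words should be stripped, while the emoticons survive as tokens (with the hyphen 'nose' removed):
In [ ]:
# Expect only 'test' and the emoticon tokens to remain
tokenizer("</a>This :) is :( a test :-)!")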
In [37]:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip the header line
        for line in csv:
            # the label is the last character before the newline, after the comma
            text, label = line[:-3], int(line[-2])
            yield text, label
In [38]:
next(stream_docs(path='./movie_data.csv'))
Out[38]:
In [39]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y
In [40]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')
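Added note: loss='log' and n_iter match older scikit-learn releases. On newer versions the rough equivalent (an assumption, kept commented out so the cell stays version-agnostic) would be:
In [ ]:
# For newer scikit-learn: n_iter was replaced by max_iter and loss='log' by loss='log_loss'
# clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)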
In [41]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
In [42]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))
In [43]:
clf = clf.partial_fit(X_test, y_test)
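One possible next step, not shown in the original notebook, is to persist the stop-word list and the fitted classifier so they can be reused without retraining; a minimal pickle sketch, assuming the working directory is writable:
In [ ]:
# Hypothetical persistence step (an assumption, not part of the original workflow)
import pickle
import os
dest = os.path.join('.', 'pkl_objects')  # hypothetical output directory
os.makedirs(dest, exist_ok=True)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)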
In [ ]: