In [3]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPool1D
from keras.datasets import imdb

In [4]:
max_features = 5000    # vocabulary size: keep only the 5,000 most frequent words
maxlen = 400           # pad/truncate every review to 400 tokens
batch_size = 32
embedding_dims = 50    # dimensionality of the word embeddings
filters = 250          # number of convolution filters
kernel_size = 3        # width of each 1D convolution window
hidden_dims = 250      # units in the fully connected layer
epochs = 2

A dataset of 25,000 IMDB movie reviews labeled by sentiment (positive/negative). The reviews have been preprocessed, and each review is encoded as a sequence of word indices (integers). For convenience, words are indexed by their frequency of occurrence in the dataset, so that, for example, the integer "3" encodes the 3rd most frequent word in the data. This allows fast filtering operations such as "consider only the top 10,000 most frequent words, excluding the top 20". By convention, "0" does not stand for a specific word but encodes any unknown word. (A sketch that decodes one review back into words follows the loading cell below.)


In [27]:
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)


Loading data...
  • 25,000 reviews
  • each review is a list of word indices
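
As a sanity check, a review can be decoded back into words via imdb.get_word_index(). A sketch, assuming the default load_data arguments (start_char=1, oov_char=2, index_from=3), which shift every frequency rank by 3:

word_index = imdb.get_word_index()  # word -> frequency rank (1 = most frequent)
index_to_word = {rank + 3: word for word, rank in word_index.items()}
for i, token in enumerate(['<pad>', '<start>', '<unk>']):
    index_to_word[i] = token
print(' '.join(index_to_word.get(i, '<unk>') for i in x_train[0]))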

In [30]:
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')


25000 train sequences
25000 test sequences

In [31]:
# The longest review has 2,494 words
lens = [len(seq) for seq in x_train]
print(max(lens))


2494
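
Since the longest review far exceeds maxlen = 400, some reviews will lose most of their tokens. A quick check (not run in the original; lens comes from the cell above) of the fraction of reviews longer than maxlen:

import numpy as np
print(np.mean(np.array(lens) > maxlen))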

In [33]:
print('Pad sequences (sample x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)


Pad sequences (sample x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)
  • sequences longer than maxlen are truncated
  • sequences shorter than maxlen are zero-padded at the front
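
The default behavior (padding='pre', truncating='pre') can be verified on a toy example:

demo = sequence.pad_sequences([[1, 2], [1, 2, 3, 4, 5]], maxlen=4)
print(demo)
# [[0 0 1 2]   <- shorter sequence: zero-padded at the front
#  [2 3 4 5]]  <- longer sequence: truncated at the front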

In [36]:
x_train[0]


Out[36]:
array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    1,   14,   22,   16,   43,
        530,  973, 1622, 1385,   65,  458, 4468,   66, 3941,    4,  173,
         36,  256,    5,   25,  100,   43,  838,  112,   50,  670,    2,
          9,   35,  480,  284,    5,  150,    4,  172,  112,  167,    2,
        336,  385,   39,    4,  172, 4536, 1111,   17,  546,   38,   13,
        447,    4,  192,   50,   16,    6,  147, 2025,   19,   14,   22,
          4, 1920, 4613,  469,    4,   22,   71,   87,   12,   16,   43,
        530,   38,   76,   15,   13, 1247,    4,   22,   17,  515,   17,
         12,   16,  626,   18,    2,    5,   62,  386,   12,    8,  316,
          8,  106,    5,    4, 2223,    2,   16,  480,   66, 3785,   33,
          4,  130,   12,   16,   38,  619,    5,   25,  124,   51,   36,
        135,   48,   25, 1415,   33,    6,   22,   12,  215,   28,   77,
         52,    5,   14,  407,   16,   82,    2,    8,    4,  107,  117,
          2,   15,  256,    4,    2,    7, 3766,    5,  723,   36,   71,
         43,  530,  476,   26,  400,  317,   46,    7,    4,    2, 1029,
         13,  104,   88,    4,  381,   15,  297,   98,   32, 2071,   56,
         26,  141,    6,  194,    2,   18,    4,  226,   22,   21,  134,
        476,   26,  480,    5,  144,   30,    2,   18,   51,   36,   28,
        224,   92,   25,  104,    4,  226,   65,   16,   38, 1334,   88,
         12,   16,  283,    5,   16, 4472,  113,  103,   32,   15,   16,
          2,   19,  178,   32], dtype=int32)

In [48]:
print('Build model...')
model = Sequential()

# Embedding layer maps word IDs to dense vectors
# Embedding(input_dim, output_dim, input_length)
# input_dim : vocabulary size = 5000
# output_dim: dimensionality of the dense vectors = 50
# input_length: length of the input sequences = 400
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(Dropout(0.2))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
# take the maximum over time for each filter
model.add(GlobalMaxPool1D())
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))


Build model...

In [49]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_6 (Embedding)      (None, 400, 50)           250000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 398, 250)          37750     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_6 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
=================================================================
Total params: 350,751
Trainable params: 350,751
Non-trainable params: 0
_________________________________________________________________
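
The parameter counts above can be reproduced by hand; each Conv1D filter spans kernel_size x embedding_dims inputs plus one bias:

print(max_features * embedding_dims)                     # Embedding: 5000*50 = 250,000
print(kernel_size * embedding_dims * filters + filters)  # Conv1D: 3*50*250 + 250 = 37,750
print(filters * hidden_dims + hidden_dims)               # Dense: 250*250 + 250 = 62,750
print(hidden_dims + 1)                                   # output Dense: 250 + 1 = 251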

In [51]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [53]:
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))


Train on 25000 samples, validate on 25000 samples
Epoch 1/2
25000/25000 [==============================] - 207s - loss: 0.3849 - acc: 0.8233 - val_loss: 0.2812 - val_acc: 0.8825
Epoch 2/2
25000/25000 [==============================] - 211s - loss: 0.2360 - acc: 0.9064 - val_loss: 0.2673 - val_acc: 0.8910
Out[53]:
<keras.callbacks.History at 0x14e6e5828>
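
After training, the test-set metrics can be recomputed with model.evaluate (a sketch; not run in the original notebook):

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test loss:', score)
print('Test accuracy:', acc)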

Checking the Conv1D implementation


In [96]:
import numpy as np
from keras.layers import Input
from keras.models import Model

In [84]:
inputs = Input(shape=(10, 4))  # a length-10 sequence with 4 features per step

In [86]:
inputs


Out[86]:
<tf.Tensor 'input_3:0' shape=(?, 10, 4) dtype=float32>

In [66]:
c1 = Conv1D(2, 3, padding='valid', activation='linear', strides=1)  # 2 filters, kernel width 3; 'valid' padding gives output length 10 - 3 + 1 = 8

In [93]:
y = c1(inputs)

In [94]:
y


Out[94]:
<tf.Tensor 'conv1d_5_1/add:0' shape=(?, 8, 2) dtype=float32>

In [97]:
model = Model(inputs=inputs, outputs=y)

In [98]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_3 (InputLayer)         (None, 10, 4)             0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 8, 2)              26        
=================================================================
Total params: 26
Trainable params: 26
Non-trainable params: 0
_________________________________________________________________
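
The 26 parameters match kernel_size x input channels x filters plus one bias per filter: 3 * 4 * 2 + 2 = 26.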

In [99]:
# NOTE: x is not defined in the cells shown; it is presumably a random
# batch of shape (1, 10, 4), e.g. x = np.random.rand(1, 10, 4)
model.predict(x)


Out[99]:
array([[[ 0.37662736, -0.32646391],
        [-0.18585569,  0.20100182],
        [ 0.20233709, -0.16918498],
        [ 0.06839636, -0.1482055 ],
        [ 0.48623711, -0.27335343],
        [ 0.06013006,  0.01767102],
        [-0.39668548,  0.01521906],
        [ 0.30709803, -0.33090657]]], dtype=float32)

In [106]:
# kernel shape: (kernel_size, input channels, filters)
model.layers[1].get_weights()[0].shape


Out[106]:
(3, 4, 2)

In [107]:
w = model.layers[1].get_weights()[0]

In [113]:
x[0][:3]


Out[113]:
array([[ 0.95066452,  0.09726897,  0.53935917,  0.31524858],
       [ 0.02387925,  0.75517061,  0.97179245,  0.4076146 ],
       [ 0.92778269,  0.10227056,  0.6795394 ,  0.23183314]])

In [125]:
# matches the first row of model.predict(x)
print(x[0][:3].shape)
print(w.shape)
print(np.sum(x[0][:3] * w[:, :, 0]))
print(np.sum(x[0][:3] * w[:, :, 1]))


(3, 4)
(3, 4, 2)
0.376627355059
-0.326463890816

In [126]:
# slide the window one step along x (strides=1)
# matches the second row of model.predict(x)
print(np.sum(x[0][1:4] * w[:, :, 0]))
print(np.sum(x[0][1:4] * w[:, :, 1]))


-0.185855683969
0.201001819528
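
The same check can be extended to all 8 output positions and both filters at once (a sketch; the bias, zero for this untrained layer, is included for completeness):

b = model.layers[1].get_weights()[1]  # bias vector, shape (2,)
manual = np.array([[np.sum(x[0][t:t + 3] * w[:, :, f]) + b[f]
                    for f in range(2)]
                   for t in range(8)])
print(np.allclose(manual, model.predict(x)[0], atol=1e-5))  # should print True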
