Model Overview

  1. Sentiment analysis with a convolutional neural network (CNN)
  2. Input: randomly initialized word vectors
  3. CNN structure: 100 filters for each of filter_length = 2, 3 and 4
  4. Dataset: SST-5 (the line format is shown below this list)
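
Each line of the stsa.fine.* files is a single digit label (0-4), a space, then the tokenized sentence; read_file below relies on exactly this layout (line[0:1] is the label, line[2:] the text). A sample line, reconstructed from the printed output of In [2]:

4 a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films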

In [1]:
import keras
from os.path import join
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Dropout, Activation, Lambda, Input
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D, merge
from keras.utils import np_utils
from keras.regularizers import l2


Using Theano backend.
Using gpu device 0: GeForce GT 630 (CNMeM is disabled, cuDNN not available)

Data Preprocessing


In [2]:
file_names = ['stsa.fine.test', 'stsa.fine.train', 'stsa.fine.dev']
file_path = '/home/bruce/data/sentiment/'
def read_file(fname=''):
    # Each line: "<label> <tokenized sentence>", label in 0-4.
    with open(join(file_path, fname)) as fr:
        lines = fr.readlines()
    lines = [line.strip().lower() for line in lines]
    labels = [int(line[0:1]) for line in lines]
    words = [line[2:].split() for line in lines]
    return words, labels
train_X, train_y = read_file(fname='stsa.fine.train')
test_X, test_y = read_file(fname='stsa.fine.test')
dev_X, dev_y = read_file(fname='stsa.fine.dev')
print(len(train_X))
print(len(test_X))
print(len(dev_X))
print(train_X[0:2])
print(train_y[0:2])


8544
2210
1101
[['a', 'stirring', ',', 'funny', 'and', 'finally', 'transporting', 're-imagining', 'of', 'beauty', 'and', 'the', 'beast', 'and', '1930s', 'horror', 'films'], ['apparently', 'reassembled', 'from', 'the', 'cutting-room', 'floor', 'of', 'any', 'given', 'daytime', 'soap', '.']]
[4, 1]

Sentence Length Statistics


In [3]:
def length_statistics(arrays=[]):
    # Report length percentiles of the tokenized sentences.
    lengths = sorted(len(i) for i in arrays)
    length = len(lengths)
    print('length = ', length)
    print('max = ', lengths[-1])
    print('min =', lengths[0])
    print('average = ', sum(lengths)/length)
    print('top 50% = ', lengths[int(0.5*length)])
    print('top 80% = ', lengths[int(0.8*length)])
    print('top 90% = ', lengths[int(0.9*length)])
    print('top 95% = ', lengths[int(0.95*length)])

length_statistics(arrays=train_X)


length =  8544
max =  52
min = 2
average =  19.143960674157302
top 50% =  18
top 80% =  27
top 90% =  32
top 95% =  35
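
The longest training sentence has 52 tokens, which motivates the choice of max_len = 52 below: padding to the maximum length truncates nothing.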

In [4]:
def token_to_index(datas=[]):
    # Build a word -> index map; counting starts at 1 so that index 0
    # stays free for padding / unknown words.
    word_index = {}
    count = 1
    for data in datas:
        for list_ in data:
            for w in list_:
                if w not in word_index:
                    word_index[w] = count
                    count = count + 1
    print('length of word_index =', len(word_index))
    for i in range(len(datas)):
        datas[i] = [[word_index[w] for w in line] for line in datas[i]]
    return datas, word_index

In [5]:
X,word_index = token_to_index(datas=[train_X,dev_X])
train_X,dev_X = X
print(len(word_index))


length of word_index = 17611
17611
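
Since token_to_index starts counting at 1, index 0 is never assigned to a vocabulary word; below it doubles as the pad_sequences padding value and as the id for unseen test words. A quick sanity check (an added sketch, not part of the original run):

# word indices are 1-based; an Embedding layer therefore needs
# len(word_index) + 1 rows, which matches max_features = 17612 below
assert min(word_index.values()) == 1
print(len(word_index) + 1)  # 17612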

Build the Model


In [29]:
max_len = 52           # pad every sentence to the maximum training length
batch_size = 32

max_features = 17612   # vocabulary size + 1 (index 0 reserved)
embedding_dim = 100

nb_filter = 100
dense1_hidden = 300    # size of the (currently disabled) hidden dense layer
nb_classes = 5

In [37]:
print('Build model...')
input_random = Input(shape=(max_len,), dtype='int32', name='main_input1')
embedding = Embedding(output_dim=embedding_dim, input_dim=max_features)(input_random)
# Convolution layers: three parallel filter banks over the embeddings
conv1 = Convolution1D(nb_filter=nb_filter,
                      filter_length=2,
                      border_mode='valid',
                      activation='relu')(embedding)
conv2 = Convolution1D(nb_filter=nb_filter,
                      filter_length=3,
                      border_mode='valid',
                      activation='relu')(embedding)
conv3 = Convolution1D(nb_filter=nb_filter,
                      filter_length=4,
                      border_mode='valid',
                      activation='relu')(embedding)
# Max-over-time pooling reduces each filter bank to a 100-d vector
conv1 = GlobalMaxPooling1D()(conv1)
conv2 = GlobalMaxPooling1D()(conv2)
conv3 = GlobalMaxPooling1D()(conv3)
merged_vector = merge([conv1, conv2, conv3], mode='concat')  # 3 x 100 = 300 dims
# Fully connected layer (currently disabled)
#dense_layer = Dense(dense1_hidden)
#dens1 = dense_layer(merged_vector)
print('dense_layer input_shape should == (300,)')
#print(dense_layer.input_shape)
#dens1 = Activation('relu')(dens1)

# Softmax layer
dens2 = Dense(nb_classes)(merged_vector)
output_random = Activation('softmax')(dens2)

model = Model(input=input_random, output=output_random)
print('finish build model')
model.compile(optimizer='adadelta',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


Build model...
dense_layer input_shape should == (300,)
finish build model
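
To verify the shape of the merged vector instead of relying on the print above, the model can be inspected directly; a minimal check, assuming the Keras 1.x functional API used here:

model.summary()  # the merge layer's output shape should read (None, 300)
# 300 = 3 filter banks x nb_filter (100) after global max pooling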

Model Diagram


In [38]:
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot
SVG(model_to_dot(model).create(prog='dot', format='svg'))


Out[38]:
[Model graph: main_input1 (InputLayer) → embedding_5 (Embedding) → convolution1d_13/14/15 (Convolution1D) → globalmaxpooling1d_13/14/15 (GlobalMaxPooling1D) → merge_5 (Merge) → dense_8 (Dense) → activation_8 (Activation)]

Model Inputs


In [39]:
print(type(train_y[0]))
train_y_model = np_utils.to_categorical(train_y, nb_classes)     # one-hot labels
dev_y_model = np_utils.to_categorical(dev_y, nb_classes)
train_X_model = sequence.pad_sequences(train_X, maxlen=max_len)  # zero-pad to 52
dev_X_model = sequence.pad_sequences(dev_X, maxlen=max_len)


<class 'int'>

In [40]:
# Test data: words unseen in train/dev are mapped to index 0
test_index_X = [[word_index[w] if w in word_index else 0 for w in line] for line in test_X]
test_X_model = sequence.pad_sequences(test_index_X, maxlen=max_len)
test_y_model = np_utils.to_categorical(test_y, nb_classes)
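
Mapping unknown test words to 0 makes them indistinguishable from padding. One alternative is a dedicated out-of-vocabulary id (a hypothetical variant, not what was run; the Embedding would then need one extra row):

oov_index = len(word_index) + 1  # hypothetical dedicated OOV id
test_index_X = [[word_index.get(w, oov_index) for w in line] for line in test_X]
# requires max_features = len(word_index) + 2 in the Embedding layer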

In [41]:
print(test_y_model[0:10])


[[ 0.  1.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  1.  0.]
 [ 0.  1.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.]]

In [42]:
def my_generator(X=None, y=None):
    # Yield consecutive mini-batches forever, wrapping around at the
    # end; the final len(X) % batch_size samples are never yielded.
    i = 0
    max_i = int(len(X)/batch_size)
    while True:
        i = i % max_i
        x_batch = X[i*batch_size:(i+1)*batch_size]
        y_batch = y[i*batch_size:(i+1)*batch_size]
        yield (x_batch, y_batch)
        i = i + 1
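
my_generator walks the training set in a fixed order. A shuffling variant (a sketch under the same batch-size assumption, working because pad_sequences and to_categorical return numpy arrays) is often preferable for SGD:

import numpy as np

def shuffling_generator(X, y, batch_size=32):
    # reshuffle the sample order once per pass over the data
    n = len(X)
    while True:
        order = np.random.permutation(n)
        for i in range(n // batch_size):
            idx = order[i*batch_size:(i+1)*batch_size]
            yield (X[idx], y[idx])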

In [43]:
model.fit_generator(my_generator(train_X_model, train_y_model),
                    samples_per_epoch=32*100,  # 3200 of the 8544 training samples per epoch
                    nb_epoch=100,
                    verbose=1,
                    validation_data=(test_X_model, test_y_model))  # note: validates on the test split


Epoch 1/100
3200/3200 [==============================] - 12s - loss: 1.5772 - acc: 0.2684 - val_loss: 1.5833 - val_acc: 0.2312
Epoch 2/100
3200/3200 [==============================] - 12s - loss: 1.5574 - acc: 0.2881 - val_loss: 1.5716 - val_acc: 0.2462
Epoch 3/100
3200/3200 [==============================] - 12s - loss: 1.5560 - acc: 0.2969 - val_loss: 1.5553 - val_acc: 0.2982
Epoch 4/100
3200/3200 [==============================] - 12s - loss: 1.5340 - acc: 0.3156 - val_loss: 1.5520 - val_acc: 0.2923
Epoch 5/100
3200/3200 [==============================] - 12s - loss: 1.5285 - acc: 0.3225 - val_loss: 1.5315 - val_acc: 0.3186
Epoch 6/100
3200/3200 [==============================] - 12s - loss: 1.5048 - acc: 0.3297 - val_loss: 1.5244 - val_acc: 0.3172
Epoch 7/100
3200/3200 [==============================] - 12s - loss: 1.4893 - acc: 0.3431 - val_loss: 1.5135 - val_acc: 0.3244
Epoch 8/100
3200/3200 [==============================] - 12s - loss: 1.4810 - acc: 0.3569 - val_loss: 1.4956 - val_acc: 0.3303
Epoch 9/100
3200/3200 [==============================] - 12s - loss: 1.4493 - acc: 0.3694 - val_loss: 1.4830 - val_acc: 0.3330
Epoch 10/100
3200/3200 [==============================] - 12s - loss: 1.4366 - acc: 0.3784 - val_loss: 1.4719 - val_acc: 0.3321
Epoch 11/100
3200/3200 [==============================] - 12s - loss: 1.4133 - acc: 0.3900 - val_loss: 1.4518 - val_acc: 0.3434
Epoch 12/100
3200/3200 [==============================] - 12s - loss: 1.3852 - acc: 0.4069 - val_loss: 1.4457 - val_acc: 0.3457
Epoch 13/100
3200/3200 [==============================] - 12s - loss: 1.3583 - acc: 0.4312 - val_loss: 1.4174 - val_acc: 0.3765
Epoch 14/100
3200/3200 [==============================] - 12s - loss: 1.3312 - acc: 0.4397 - val_loss: 1.4120 - val_acc: 0.3692
Epoch 15/100
3200/3200 [==============================] - 12s - loss: 1.3116 - acc: 0.4528 - val_loss: 1.4046 - val_acc: 0.3729
Epoch 16/100
3200/3200 [==============================] - 12s - loss: 1.2774 - acc: 0.4700 - val_loss: 1.3802 - val_acc: 0.3968
Epoch 17/100
3200/3200 [==============================] - 12s - loss: 1.2406 - acc: 0.4859 - val_loss: 1.3638 - val_acc: 0.3995
Epoch 18/100
3200/3200 [==============================] - 12s - loss: 1.2316 - acc: 0.5069 - val_loss: 1.3629 - val_acc: 0.3932
Epoch 19/100
3200/3200 [==============================] - 12s - loss: 1.1879 - acc: 0.5150 - val_loss: 1.3441 - val_acc: 0.4109
Epoch 20/100
3200/3200 [==============================] - 12s - loss: 1.1677 - acc: 0.5250 - val_loss: 1.3452 - val_acc: 0.4000
Epoch 21/100
3200/3200 [==============================] - 12s - loss: 1.1338 - acc: 0.5466 - val_loss: 1.3215 - val_acc: 0.4195
Epoch 22/100
3200/3200 [==============================] - 12s - loss: 1.1118 - acc: 0.5484 - val_loss: 1.3249 - val_acc: 0.4127
Epoch 23/100
3200/3200 [==============================] - 12s - loss: 1.0988 - acc: 0.5687 - val_loss: 1.3323 - val_acc: 0.4081
Epoch 24/100
3200/3200 [==============================] - 12s - loss: 1.0660 - acc: 0.5750 - val_loss: 1.3159 - val_acc: 0.4163
Epoch 25/100
3200/3200 [==============================] - 12s - loss: 1.0249 - acc: 0.6125 - val_loss: 1.3092 - val_acc: 0.4195
Epoch 26/100
3200/3200 [==============================] - 12s - loss: 1.0272 - acc: 0.5969 - val_loss: 1.3124 - val_acc: 0.4195
Epoch 27/100
3200/3200 [==============================] - 12s - loss: 0.9871 - acc: 0.6256 - val_loss: 1.3129 - val_acc: 0.4253
Epoch 28/100
3200/3200 [==============================] - 12s - loss: 0.9670 - acc: 0.6497 - val_loss: 1.3188 - val_acc: 0.4262
Epoch 29/100
3200/3200 [==============================] - 12s - loss: 0.9327 - acc: 0.6534 - val_loss: 1.3017 - val_acc: 0.4204
Epoch 30/100
3200/3200 [==============================] - 12s - loss: 0.9147 - acc: 0.6675 - val_loss: 1.3154 - val_acc: 0.4199
Epoch 31/100
3200/3200 [==============================] - 12s - loss: 0.9018 - acc: 0.6809 - val_loss: 1.3224 - val_acc: 0.4204
Epoch 32/100
3200/3200 [==============================] - 12s - loss: 0.8725 - acc: 0.6844 - val_loss: 1.3212 - val_acc: 0.4131
Epoch 33/100
3200/3200 [==============================] - 12s - loss: 0.8283 - acc: 0.7197 - val_loss: 1.3230 - val_acc: 0.4158
Epoch 34/100
3200/3200 [==============================] - 12s - loss: 0.8314 - acc: 0.7153 - val_loss: 1.3233 - val_acc: 0.4131
Epoch 35/100
3200/3200 [==============================] - 12s - loss: 0.7987 - acc: 0.7225 - val_loss: 1.3242 - val_acc: 0.4195
Epoch 36/100
3200/3200 [==============================] - 12s - loss: 0.7708 - acc: 0.7537 - val_loss: 1.3540 - val_acc: 0.4204
Epoch 37/100
3200/3200 [==============================] - 12s - loss: 0.7404 - acc: 0.7606 - val_loss: 1.3335 - val_acc: 0.4149
Epoch 38/100
3200/3200 [==============================] - 12s - loss: 0.7225 - acc: 0.7656 - val_loss: 1.3518 - val_acc: 0.4195
Epoch 39/100
3200/3200 [==============================] - 12s - loss: 0.7063 - acc: 0.7844 - val_loss: 1.3700 - val_acc: 0.4127
Epoch 40/100
3200/3200 [==============================] - 12s - loss: 0.6781 - acc: 0.7916 - val_loss: 1.3709 - val_acc: 0.4181
Epoch 41/100
3200/3200 [==============================] - 12s - loss: 0.6389 - acc: 0.8075 - val_loss: 1.3776 - val_acc: 0.4154
Epoch 42/100
3200/3200 [==============================] - 12s - loss: 0.6347 - acc: 0.8147 - val_loss: 1.3896 - val_acc: 0.4158
Epoch 43/100
3200/3200 [==============================] - 12s - loss: 0.6111 - acc: 0.8184 - val_loss: 1.3847 - val_acc: 0.4154
Epoch 44/100
3200/3200 [==============================] - 12s - loss: 0.5786 - acc: 0.8350 - val_loss: 1.4063 - val_acc: 0.4036
Epoch 45/100
3200/3200 [==============================] - 12s - loss: 0.5528 - acc: 0.8506 - val_loss: 1.4021 - val_acc: 0.4104
Epoch 46/100
3200/3200 [==============================] - 12s - loss: 0.5360 - acc: 0.8538 - val_loss: 1.4370 - val_acc: 0.4086
Epoch 47/100
3200/3200 [==============================] - 12s - loss: 0.5193 - acc: 0.8563 - val_loss: 1.4451 - val_acc: 0.4131
Epoch 48/100
3200/3200 [==============================] - 12s - loss: 0.4965 - acc: 0.8728 - val_loss: 1.4550 - val_acc: 0.4032
Epoch 49/100
3200/3200 [==============================] - 12s - loss: 0.4580 - acc: 0.8806 - val_loss: 1.4652 - val_acc: 0.4122
Epoch 50/100
3200/3200 [==============================] - 12s - loss: 0.4525 - acc: 0.8831 - val_loss: 1.4822 - val_acc: 0.4127
Epoch 51/100
3200/3200 [==============================] - 12s - loss: 0.4386 - acc: 0.8897 - val_loss: 1.4740 - val_acc: 0.4109
Epoch 52/100
3200/3200 [==============================] - 12s - loss: 0.4064 - acc: 0.9012 - val_loss: 1.5020 - val_acc: 0.4050
Epoch 53/100
3200/3200 [==============================] - 12s - loss: 0.3852 - acc: 0.9084 - val_loss: 1.5019 - val_acc: 0.4045
Epoch 54/100
3200/3200 [==============================] - 12s - loss: 0.3718 - acc: 0.9119 - val_loss: 1.5530 - val_acc: 0.4018
Epoch 55/100
3200/3200 [==============================] - 12s - loss: 0.3569 - acc: 0.9159 - val_loss: 1.5511 - val_acc: 0.4068
Epoch 56/100
3200/3200 [==============================] - 12s - loss: 0.3404 - acc: 0.9256 - val_loss: 1.5705 - val_acc: 0.4009
Epoch 57/100
3200/3200 [==============================] - 12s - loss: 0.3079 - acc: 0.9294 - val_loss: 1.5692 - val_acc: 0.4081
Epoch 58/100
3200/3200 [==============================] - 12s - loss: 0.3038 - acc: 0.9394 - val_loss: 1.6020 - val_acc: 0.4063
Epoch 59/100
3200/3200 [==============================] - 12s - loss: 0.2943 - acc: 0.9325 - val_loss: 1.5965 - val_acc: 0.4059
Epoch 60/100
3200/3200 [==============================] - 12s - loss: 0.2662 - acc: 0.9475 - val_loss: 1.6128 - val_acc: 0.3910
Epoch 61/100
3200/3200 [==============================] - 14s - loss: 0.2552 - acc: 0.9472 - val_loss: 1.6349 - val_acc: 0.3855
Epoch 62/100
1568/3200 [=============>................] - ETA: 9s - loss: 0.2635 - acc: 0.9375
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-43-35a1beb29423> in <module>()
----> 1 model.fit_generator(my_generator(train_X_model,train_y_model),samples_per_epoch = 32*100,nb_epoch=100,verbose=1,validation_data=(test_X_model,test_y_model))

/home/bruce/anaconda3/lib/python3.5/site-packages/keras/engine/training.py in fit_generator(self, generator, samples_per_epoch, nb_epoch, verbose, callbacks, validation_data, nb_val_samples, class_weight, max_q_size, nb_worker, pickle_safe)
   1441                     outs = self.train_on_batch(x, y,
   1442                                                sample_weight=sample_weight,
-> 1443                                                class_weight=class_weight)
   1444                 except:
   1445                     _stop.set()

/home/bruce/anaconda3/lib/python3.5/site-packages/keras/engine/training.py in train_on_batch(self, x, y, sample_weight, class_weight)
   1219             ins = x + y + sample_weights
   1220         self._make_train_function()
-> 1221         outputs = self.train_function(ins)
   1222         if len(outputs) == 1:
   1223             return outputs[0]

/home/bruce/anaconda3/lib/python3.5/site-packages/keras/backend/theano_backend.py in __call__(self, inputs)
    715     def __call__(self, inputs):
    716         assert type(inputs) in {list, tuple}
--> 717         return self.function(*inputs)
    718 
    719 

/home/bruce/anaconda3/lib/python3.5/site-packages/theano/compile/function_module.py in __call__(self, *args, **kwargs)
    857         t0_fn = time.time()
    858         try:
--> 859             outputs = self.fn()
    860         except Exception:
    861             if hasattr(self.fn, 'position_of_error'):

/home/bruce/anaconda3/lib/python3.5/site-packages/theano/gof/op.py in rval(p, i, o, n)
    909         if params is graph.NoParams:
    910             # default arguments are stored in the closure of `rval`
--> 911             def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
    912                 r = p(n, [x[0] for x in i], o)
    913                 for o in node.outputs:

KeyboardInterrupt: 
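
Training was interrupted by hand during epoch 62; the best val_acc in this run was 0.4262, at epoch 28.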

Experiment Log

Runs on 2016-11-12 ("best" = best val_acc; the four runs differ only in embedding_dims):

Time    embedding_dims   best
10:16   100              0.4015
10:22   50               0.4069
10:22   150              0.4151
10:22   200              0.4242  (five runs: 0.4214, 0.4033, 0.4024, 0.4151, 0.4242)

Shared parameters: max_len = 36, batch_size = 32, max_features = 14714,
nb_filter = 150, filter_length = 2, dense1_hidden = 100, nb_classes = 5


In [5]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np

data_dim = 16
timesteps = 8
nb_classes = 10

# expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()
model.add(LSTM(32, return_sequences=True,
               input_shape=(timesteps, data_dim)))  # returns a sequence of vectors of dimension 32
model.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
model.add(LSTM(32))  # returns a single vector of dimension 32
model.add(Dense(nb_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
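
This final cell is a stacked-LSTM scratchpad unrelated to the SST-5 model above. To run it end to end, placeholder data of the expected shape can be generated (a sketch with made-up sizes):

# hypothetical random data shaped (nb_samples, timesteps, data_dim)
x_train = np.random.random((1000, timesteps, data_dim))
y_train = np_utils.to_categorical(np.random.randint(nb_classes, size=1000), nb_classes)
model.fit(x_train, y_train, batch_size=64, nb_epoch=5)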
