In [1]:
import keras
from os.path import join
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Lambda, Input
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D, Convolution2D
from keras.datasets import imdb
from keras import backend as K
from keras.utils import np_utils


Using Theano backend.
Using gpu device 0: GeForce GT 630 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN not available)

Data preprocessing


In [2]:
file_names = ['stsa.fine.test','stsa.fine.train','stsa.fine.dev']
file_path = '/home/bruce/data/sentiment/citai_process'
def read_file(fname=''):
    # Each line has the form '<label> <tokenized sentence>', e.g. '4 a stirring , funny ...'
    with open(join(file_path, fname)) as fr:
        lines = fr.readlines()
    lines = [line.strip().lower() for line in lines]
    labels = [int(line[0:1]) for line in lines]
    words = [line[2:].split() for line in lines]
    return words, labels
train_X,train_y = read_file(fname='stsa.fine.train')
test_X,test_y = read_file(fname='stsa.fine.test')
dev_X,dev_y = read_file(fname='stsa.fine.dev')
print(len(train_X))
print(len(test_X))
print(len(dev_X))
print(train_X[0:2])
print(train_y[0:2])


8544
2210
1101
[['a', 'stirring', ',', 'funny', 'and', 'finally', 'transport', 're-imagining', 'of', 'beauty', 'and', 'the', 'beast', 'and', '1930s', 'horror', 'film'], ['apparently', 'reassemble', 'from', 'the', 'cutting-room', 'floor', 'of', 'any', 'give', 'daytime', 'soap', '.']]
[4, 1]

Sentence length statistics


In [3]:
def length_statistics(arrays=[]):
    # Print the length distribution (max, min, mean and percentiles) of the token lists.
    lengths = sorted(len(i) for i in arrays)
    length = len(lengths)
    print('length = ', length)
    print('max = ', lengths[-1])
    print('min =', lengths[0])
    print('average = ', sum(lengths) / length)
    print('top 50% = ', lengths[int(0.5 * length)])
    print('top 80% = ', lengths[int(0.8 * length)])
    print('top 90% = ', lengths[int(0.9 * length)])
    print('top 95% = ', lengths[int(0.95 * length)])

length_statistics(arrays=train_X)


length =  8544
max =  55
min = 2
average =  19.579470973782772
top 50% =  19
top 80% =  28
top 90% =  33
top 95% =  36
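
The 95th percentile above is 36 tokens, which is where the max_len = 36 used later comes from. A quick cross-check with numpy (an added sketch, not part of the original notebook):

import numpy as np
lengths = [len(s) for s in train_X]
print(np.percentile(lengths, 95))   # roughly 36, matching max_len below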

In [4]:
def token_to_index(datas=[]):
    # Build a word -> integer index map and convert every sentence to a list of indices.
    # Indices start at 1 so that 0 can be used as the padding value later.
    word_index = {}
    count = 1
    for data in datas:
        for list_ in data:
            for w in list_:
                if w not in word_index:
                    word_index[w] = count
                    count = count + 1
    print('length of word_index =', len(word_index))
    for i in range(len(datas)):
        datas[i] = [[word_index[w] for w in line] for line in datas[i]]
    return datas, word_index

In [5]:
# Note: only the training and dev sets are indexed here; test_X is left as raw tokens.
X, word_index = token_to_index(datas=[train_X, dev_X])
train_X, dev_X = X
print(len(word_index))


length of word_index = 14497
14497

Build the model


In [6]:
max_len = 36          # ~95th percentile of training sentence lengths
batch_size = 32

max_features = 14498  # vocabulary size (14497) + 1, since word indices start at 1 and 0 is the padding value
embedding_dims = 250

nb_filter = 150
filter_length = 2
dense1_hidden = 150
nb_classes = 5

In [7]:
print('Build model...')
model = Sequential()
model.add(Embedding(input_dim=max_features,
                    output_dim = embedding_dims
                   ))
model.add(Convolution1D(nb_filter = nb_filter,
                        filter_length = filter_length,
                        border_mode = 'valid',
                        activation='relu',
                        subsample_length = 1
                       ))
model.add(GlobalMaxPooling1D())
model.add(Dense(dense1_hidden))
model.add(Dropout(0.2))
model.add(Activation('relu'))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss = 'categorical_crossentropy',
              optimizer = 'adadelta',
              metrics=['accuracy']
             )
print('finish build')


Build model...
finish build

In [ ]:
## Model diagram
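
The model-diagram cell above was left empty. A minimal sketch for inspecting the architecture, assuming pydot and graphviz are installed (Keras 1.x exposes the plotting helper as keras.utils.visualize_util.plot):

model.summary()                                       # print layer types and output shapes
from keras.utils.visualize_util import plot
plot(model, to_file='model.png', show_shapes=True)    # write a diagram of the model to model.png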

In [9]:
def my_generator(X=None, y=None):
    # Yield successive (x_batch, y_batch) slices of size batch_size, cycling over the data forever.
    i = 0
    max_i = int(len(X) / batch_size)
    while True:
        i = i % max_i
        x_batch = X[i * batch_size:(i + 1) * batch_size]
        y_batch = y[i * batch_size:(i + 1) * batch_size]
        yield (x_batch, y_batch)
        i = i + 1
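
The fit_generator call in the next cell uses train_X_model, train_y_model, dev_X_model and dev_y_model, which are not defined anywhere in this section. A minimal sketch of the missing preparation step, assuming the index sequences are padded to max_len and the labels are one-hot encoded (the variable names simply mirror the call below):

train_X_model = sequence.pad_sequences(train_X, maxlen=max_len)   # pad/truncate every sentence to 36 indices
dev_X_model = sequence.pad_sequences(dev_X, maxlen=max_len)
train_y_model = np_utils.to_categorical(train_y, nb_classes)      # one-hot encode the 5 sentiment classes
dev_y_model = np_utils.to_categorical(dev_y, nb_classes)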

In [ ]:
model.fit_generator(my_generator(train_X_model, train_y_model),
                    samples_per_epoch=32 * 100,
                    nb_epoch=100,
                    verbose=1,
                    validation_data=(dev_X_model, dev_y_model))


Epoch 1/100
3200/3200 [==============================] - 7s - loss: 1.3270 - acc: 0.4428 - val_loss: 1.4102 - val_acc: 0.3815
Epoch 2/100
3200/3200 [==============================] - 7s - loss: 1.3314 - acc: 0.4347 - val_loss: 1.3897 - val_acc: 0.3960
Epoch 3/100
3200/3200 [==============================] - 7s - loss: 1.2626 - acc: 0.4738 - val_loss: 1.3828 - val_acc: 0.3996
Epoch 4/100
3072/3200 [===========================>..] - ETA: 0s - loss: 1.2142 - acc: 0.4945
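
The best scores in the experiment log below appear to be dev-set accuracies, i.e. the val_acc reported against the validation_data above. To re-check the dev accuracy of the trained model directly, a hedged sketch (not in the original notebook):

loss, acc = model.evaluate(dev_X_model, dev_y_model, batch_size=batch_size, verbose=0)
print('dev accuracy =', acc)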

Experiment log

2016-11-12 10:16, best 0.4015

Parameters

max_len = 36, batch_size = 32

max_features = 14714, embedding_dims = 100

nb_filter = 150, filter_length = 2, dense1_hidden = 100, nb_classes = 5

2016-11-12 10:22, best score 0.4069

Parameters

max_len = 36, batch_size = 32

max_features = 14714, embedding_dims = 50

nb_filter = 150, filter_length = 2, dense1_hidden = 100, nb_classes = 5

2016-11-12 10:22, best score 0.4151

Parameters

max_len = 36, batch_size = 32

max_features = 14714, embedding_dims = 150

nb_filter = 150, filter_length = 2, dense1_hidden = 100, nb_classes = 5

2016-11-12 10:22, best score 0.4242 [0.4214, 0.4033, 0.4024, 0.4151, 0.4242]

Parameters

max_len = 36, batch_size = 32

max_features = 14714, embedding_dims = 200

nb_filter = 150, filter_length = 2, dense1_hidden = 100, nb_classes = 5


In [ ]: