In [1]:
import keras
from os.path import join
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Lambda, Input
from keras.layers import Embedding
from keras.layers import Convolution1D, Convolution2D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras import backend as K
from keras.utils import np_utils
In [2]:
file_names = ['stsa.fine.test', 'stsa.fine.train', 'stsa.fine.dev']
file_path = '/home/bruce/data/sentiment/citai_process'

def read_file(fname=''):
    # Each line is "<label> <sentence>": the first character is the
    # fine-grained sentiment label (0-4), the rest is the tokenized text.
    with open(join(file_path, fname)) as fr:
        lines = fr.readlines()
    lines = [line.strip().lower() for line in lines]
    labels = [int(line[0:1]) for line in lines]
    words = [line[2:].split() for line in lines]
    return words, labels

train_X, train_y = read_file(fname='stsa.fine.train')
test_X, test_y = read_file(fname='stsa.fine.test')
dev_X, dev_y = read_file(fname='stsa.fine.dev')
print(len(train_X))
print(len(test_X))
print(len(dev_X))
print(train_X[0:2])
print(train_y[0:2])
In [3]:
def statics_list2(arrays=[]):
    # Report length statistics of the token lists to pick a sensible max_len.
    lengths = sorted(len(i) for i in arrays)
    length = len(lengths)
    print('count   = ', length)
    print('max     = ', lengths[-1])
    print('min     = ', lengths[0])
    print('average = ', sum(lengths) / length)
    print('50th percentile = ', lengths[int(0.5 * length)])
    print('80th percentile = ', lengths[int(0.8 * length)])
    print('90th percentile = ', lengths[int(0.9 * length)])
    print('95th percentile = ', lengths[int(0.95 * length)])

statics_list2(arrays=train_X)
In [4]:
def token_to_index(datas=[]):
    # Build a word -> integer index over all datasets, then replace each
    # token with its index. Counting starts at 1 so that index 0 stays
    # free for sequence padding.
    word_index = {}
    count = 1
    for data in datas:
        for list_ in data:
            for w in list_:
                if w not in word_index:
                    word_index[w] = count
                    count = count + 1
    print('length of word_index =', len(word_index))
    for i in range(len(datas)):
        datas[i] = [[word_index[w] for w in line] for line in datas[i]]
    return datas, word_index
In [5]:
X, word_index = token_to_index(datas=[train_X, dev_X])
train_X, dev_X = X
print(len(word_index))
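Note that `test_X` is never passed through `token_to_index`, so its tokens are still raw strings. A minimal sketch of indexing the test set against this vocabulary, assuming out-of-vocabulary words are simply dropped (the OOV policy is an assumption, not taken from the notebook):
In [ ]:
# Hypothetical helper: map test tokens through the existing word_index.
# OOV tokens are dropped here; mapping them to a dedicated index would
# also work but would require enlarging max_features.
test_X_indexed = [[word_index[w] for w in line if w in word_index]
                  for line in test_X]
print(test_X_indexed[0])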
In [6]:
max_len = 36
batch_size=32
max_features = 14498
embedding_dims = 250
nb_filter = 150
filter_length = 2
dense1_hindden = 150
nb_classes = 5
In [7]:
print('Build model...')
model = Sequential()
model.add(Embedding(input_dim=max_features,
output_dim = embedding_dims
))
model.add(Convolution1D(nb_filter = nb_filter,
filter_length = filter_length,
border_mode = 'valid',
activation='relu',
subsample_length = 1
))
model.add(GlobalMaxPooling1D())
model.add(Dense(dense1_hindden))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss = 'categorical_crossentropy',
optimizer = 'adadelta',
metrics=['accuracy']
)
print('finish build')
In [ ]:
## Model diagram
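The cell above was presumably meant to display the network structure. A minimal sketch: `model.summary()` prints the layer table; the commented-out `plot` helper is the Keras 1.x graph renderer and requires pydot/graphviz to be installed.
In [ ]:
model.summary()
# Optional: render the graph to an image (Keras 1.x helper).
# from keras.utils.visualize_util import plot
# plot(model, to_file='model.png')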
In [9]:
def my_generator(X=None, y=None):
    # Yield consecutive mini-batches forever, wrapping around at the end.
    # Assumes X and y are numpy arrays; the final partial batch is dropped.
    i = 0
    max_i = int(len(X) / batch_size)
    while True:
        i = i % max_i
        x_batch = X[i * batch_size:(i + 1) * batch_size]
        y_batch = y[i * batch_size:(i + 1) * batch_size]
        yield (x_batch, y_batch)
        i = i + 1
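The training call below consumes train_X_model, train_y_model, dev_X_model and dev_y_model, which are never defined in the cells shown. A minimal sketch of the likely preparation, assuming the usual pad_sequences / to_categorical pipeline (the variable names simply follow the call below):
In [ ]:
# Hypothetical preparation of the padded inputs and one-hot targets
# that fit_generator expects.
train_X_model = sequence.pad_sequences(train_X, maxlen=max_len)
dev_X_model = sequence.pad_sequences(dev_X, maxlen=max_len)
train_y_model = np_utils.to_categorical(train_y, nb_classes)
dev_y_model = np_utils.to_categorical(dev_y, nb_classes)
print(train_X_model.shape, train_y_model.shape)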
In [ ]:
model.fit_generator(my_generator(train_X_model, train_y_model),
                    samples_per_epoch=32 * 100,
                    nb_epoch=100,
                    verbose=1,
                    validation_data=(dev_X_model, dev_y_model))
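Note that samples_per_epoch = 32 * 100 means each reported "epoch" covers only 3,200 samples, not a full pass over the training set. The notebook also never evaluates on the held-out test split; a minimal sketch, reusing the hypothetical test_X_indexed from the earlier cell:
In [ ]:
# Hypothetical evaluation on the test split.
test_X_model = sequence.pad_sequences(test_X_indexed, maxlen=max_len)
test_y_model = np_utils.to_categorical(test_y, nb_classes)
score, acc = model.evaluate(test_X_model, test_y_model, batch_size=batch_size)
print('test loss:', score, 'test accuracy:', acc)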
Parameter settings for the recorded runs (only embedding_dims varies):

| run | max_len | batch_size | max_features | embedding_dims | nb_filter | filter_length | dense1_hidden | nb_classes |
|-----|---------|------------|--------------|----------------|-----------|---------------|---------------|------------|
| 1   | 36      | 32         | 14714        | 100            | 150       | 2             | 100           | 5          |
| 2   | 36      | 32         | 14714        | 50             | 150       | 2             | 100           | 5          |
| 3   | 36      | 32         | 14714        | 150            | 150       | 2             | 100           | 5          |
| 4   | 36      | 32         | 14714        | 200            | 150       | 2             | 100           | 5          |