In [1]:
import keras
from os.path import join
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.utils import np_utils
In [2]:
file_names = ['stsa.fine.test','stsa.fine.train','stsa.fine.dev']
file_path = '/home/bruce/data/sentiment/citai_process'
def read_file(fname=''):
    # Each line is "<label> <sentence>"; split it into the integer label
    # and the sentence as a list of characters.
    with open(join(file_path, fname)) as fr:
        lines = fr.readlines()
    lines = [line.strip() for line in lines]
    labels = [int(line[0:1]) for line in lines]
    characters = [list(line[2:]) for line in lines]
    return characters, labels

train_X, train_y = read_file(fname='stsa.fine.train')
test_X, test_y = read_file(fname='stsa.fine.test')
dev_X, dev_y = read_file(fname='stsa.fine.dev')
print(len(train_X))
print(len(test_X))
print(len(dev_X))
print(train_X[0:2])
print(train_y[0:2])
In [3]:
def statics_list2(arrays=[]):
    # Print simple length statistics (max, min, mean, percentiles) of the samples.
    lengths = sorted(len(i) for i in arrays)
    length = len(lengths)
    print('length = ', length)
    print('max = ', lengths[-1])
    print('min = ', lengths[0])
    print('average = ', sum(lengths) / length)
    print('top 50% = ', lengths[int(0.5 * length)])
    print('top 80% = ', lengths[int(0.8 * length)])
    print('top 90% = ', lengths[int(0.9 * length)])
    print('top 95% = ', lengths[int(0.95 * length)])
statics_list2(arrays=train_X)
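The padding length chosen later (max_len = 190 in cell [7]) can be checked against these statistics; for example, a quick look at how many training sentences are longer than 190 characters and would be truncated:

truncated = sum(len(s) > 190 for s in train_X)
print('sentences longer than 190 chars:', truncated, '/', len(train_X))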
In [4]:
def token_to_index(datas=[]):
    # Build a character-to-index mapping (index 0 is reserved for padding)
    # and convert every sentence into a list of integer indices.
    word_index = {}
    count = 1
    for data in datas:
        for list_ in data:
            for w in list_:
                if w not in word_index:
                    word_index[w] = count
                    count = count + 1
    print('length of word_index =', len(word_index))
    for i in range(len(datas)):
        datas[i] = [[word_index[w] for w in line] for line in datas[i]]
    return datas, word_index
In [5]:
X,word_index = token_to_index(datas=[train_X,dev_X])
train_X,dev_X = X
print(len(word_index))
print(word_index)
In [6]:
print(train_X[0])
In [7]:
max_len = 190          # padded sequence length; compare with the length statistics in cell [3]
batch_size = 32
# Vocabulary size for the Embedding layer. The original hard-coded value of 100
# can be smaller than the character vocabulary, so derive it from word_index
# (index 0 is reserved for padding).
max_features = len(word_index) + 1
embedding_dims = 150
nb_filter = 150
filter_length = 3
dense1_hidden = 150
nb_classes = 5
In [8]:
print('Build model...')
model = Sequential()
# Character embedding: map each character index to a dense vector.
model.add(Embedding(input_dim=max_features,
                    output_dim=embedding_dims))
# 1-D convolution over the character sequence, then max-pooling over time.
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(dense1_hidden))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy'])
print('finish build')
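As a quick sanity check on the layer output shapes before training, the built model can be inspected with the standard Keras summary call:

model.summary()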
In [9]:
print(type(train_y[0]))
train_y = np_utils.to_categorical(train_y, nb_classes)
dev_y = np_utils.to_categorical(dev_y, nb_classes)
train_X = sequence.pad_sequences(train_X, maxlen=max_len)
dev_X = sequence.pad_sequences(dev_X, maxlen=max_len)
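The test split read in cell [2] is never indexed or padded above; if it is to be scored at the end, a minimal preparation sketch (mapping characters unseen during indexing to 0, the padding index) would be:

test_X = [[word_index.get(w, 0) for w in line] for line in test_X]
test_X = sequence.pad_sequences(test_X, maxlen=max_len)
test_y = np_utils.to_categorical(test_y, nb_classes)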
In [10]:
def my_generator(X=None, y=None):
    # Yield successive (x_batch, y_batch) pairs forever, cycling over the data.
    i = 0
    max_i = int(len(X) / batch_size)
    while True:
        i = i % max_i
        x_batch = X[i * batch_size:(i + 1) * batch_size]
        y_batch = y[i * batch_size:(i + 1) * batch_size]
        yield (x_batch, y_batch)
        i = i + 1
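The samples_per_epoch passed to fit_generator in the next cell is hard-coded as 32*267, i.e. batch_size times the number of full batches; it can be derived from the data instead (a sketch with illustrative variable names):

nb_train_batches = len(train_X) // batch_size      # full batches the generator can yield per epoch
samples_per_epoch = nb_train_batches * batch_size  # reproduces 32*267 when the training set holds 8544 sentences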
In [11]:
model.fit_generator(my_generator(train_X, train_y),
                    samples_per_epoch=32 * 267,
                    nb_epoch=500,
                    verbose=1,
                    validation_data=(dev_X, dev_y))
In [ ]:
score = model.evaluate(test_X, test_y, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])