一个有三个通道,分别是word embedding,POS 标签 embedding, 词的情感极性强度embedding
In [14]:
import keras
from os.path import join
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout,Activation, Lambda,Input
from keras.layers import Embedding
from keras.layers import Convolution1D
from keras.datasets import imdb
from keras import backend as K
from keras.layers import Convolution1D, GlobalMaxPooling1D,Convolution2D,Merge,merge,Reshape,MaxPooling2D,Flatten
from keras.utils import np_utils
from keras.models import Model
import nltk
from nltk.tag import pos_tag
import numpy as np
from keras.regularizers import l2
import theano
Tag word 的方法: http://www.nltk.org/book/ch05.html
In [2]:
file_names = ['stsa.fine.test','stsa.fine.train','stsa.fine.dev']
file_path = '/home/bruce/data/sentiment/citai_process'
def read_file(fname=''):
with open(join(file_path,fname)) as fr:
lines = fr.readlines()
lines = [line.strip().lower() for line in lines]
lables = [int(line[0:1]) for line in lines]
words = [line[2:].split() for line in lines]
return words,lables
train_X,train_y = read_file(fname='stsa.fine.train')
test_X,test_y = read_file(fname='stsa.fine.test')
dev_X,dev_y = read_file(fname='stsa.fine.dev')
print(len(train_X))
print(len(test_X))
print(len(dev_X))
print(train_X[0:2])
print(train_y[0:2])
In [3]:
def tag_sentence(X=[]):
tag_X=[]
for line in X:
word_tag = pos_tag(line,tagset='universal')
tag = [i[1] for i in word_tag]
tag_X.append(tag)
return tag_X
train_tag_X = tag_sentence(X=train_X)
dev_tag_X = tag_sentence(X=dev_X)
test_tag_X = tag_sentence(X=test_X)
print(train_X[0])
print(train_tag_X[0])
In [4]:
senti_file = '/home/bruce/data/sentiment/sentiment_diction/wordwithStrength.txt'
def construct_senti_dict(senti_file=''):
with open(senti_file) as fr:
lines = fr.readlines()
lines = [line.strip().split() for line in lines]
lines = [(i[0],float(i[1])) for i in lines]
return dict(lines)
sentiment_dict=construct_senti_dict(senti_file)
print('sentiment number =',len(sentiment_dict))
In [5]:
def sentiment_strength(X=[],sentiment_dict=sentiment_dict):
sentiment_X = [[sentiment_dict[w] if w in sentiment_dict else 0 for w in line ]for line in X]
sentiment_X = [[ str(int(val*10)) if val <=0 else '+'+str(int(val*10)) for val in line] for line in sentiment_X]
return sentiment_X
train_sentiment_X = sentiment_strength(X=train_X,sentiment_dict=sentiment_dict)
dev_sentiment_X = sentiment_strength(X=dev_X,sentiment_dict=sentiment_dict)
test_sentiment_X = sentiment_strength(X=test_X,sentiment_dict=sentiment_dict)
assert len(train_sentiment_X) == len(train_X)
print(train_sentiment_X[0:5])
print(train_X[0:5])
print(train_y[0:5])
In [6]:
def token_to_index(datas=[]):
word_index={}
count=1
for data in datas:
for list_ in data:
for w in list_:
if w not in word_index:
word_index[w] = count
count = count + 1
print('leng of word_index =',len(word_index))
for i in range(len(datas)):
datas[i] = [[ word_index[w] for w in line ] for line in datas[i]]
return datas,word_index
X,word_index = token_to_index(datas=[train_X,dev_X,train_sentiment_X,train_tag_X,dev_sentiment_X,dev_tag_X])
train_X,dev_X,train_sentiment_X,train_tag_X,dev_sentiment_X,dev_tag_X = X
print('length of dict_index = ',len(word_index))
In [7]:
print(train_sentiment_X[0:2])
print(train_X[0:2])
print(train_y[0:2])
In [8]:
embedding_dim = 100
we_file = '/home/bruce/data/glove/twitter/glove.twitter.27B.{0}d.txt'.format(embedding_dim)
def get_index_wordembedding(we_file='',word_index={}):
index_wordembedding ={}
zeros = np.zeros(embedding_dim)
for line in open(we_file):
elements = line.strip().split()
if elements[0] in word_index:
index = word_index[elements[0]]
wordembedding = [float(i) for i in elements[1:]]
index_wordembedding[index] = wordembedding
print('总word的数目= ',len(word_index))
print('总word embedding 的数目 = ',len(index_wordembedding))
for word,index in word_index.items():
if index not in index_wordembedding:
index_wordembedding[index] = zeros
assert len(index_wordembedding) == len(word_index)
return index_wordembedding
index_wordembedding = get_index_wordembedding(we_file=we_file,word_index=word_index)
In [9]:
def get_trained_embedding(index_wordembedding=None):
index_we = sorted(index_wordembedding.items())
print('index_we[0] =',index_we[0])
trained_embedding = [t[1] for t in index_we]
zeros = np.zeros(embedding_dim)
trained_embedding = np.vstack((zeros,trained_embedding))
return np.array(trained_embedding)
In [10]:
def batch_indexData_embedding(X=None,index_wordembedding={}):
zeros = np.zeros(embedding_dim)
return [ [ index_wordembedding[w] if w in index_wordembedding else zeros for w in line ] for line in X ]
In [25]:
max_len = 36
batch_size=50
max_features= 14526
#embedding_dims=50
nb_filter = 300
filter_length1 = 2
filter_length2 = 3
filter_length3 = 4
filter_size=(3,100)
dense1_hindden = 150*2
nb_classes = 5
In [ ]:
1.输入的变量和后面同名
In [29]:
print('Build model...')
input_random = Input(shape=(max_len,), dtype='int32', name='main_input1')
embedding = Embedding(output_dim=embedding_dim, input_dim=max_features)(input_random)
# 卷积层
conv1 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)(embedding)
conv2 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)(embedding)
conv3 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)(embedding)
conv1 =GlobalMaxPooling1D(conv1)
conv2 =GlobalMaxPooling1D()(conv2)
conv3 =GlobalMaxPooling1D()(conv3)
merged_vector = merge([conv1,conv2,conv3], mode='concat')
# 全连接层
dense_layer = Dense(dense1_hindden)
dens1 = dense_layer(merged_vector)
print('dense_layer input_shape should == (300,)')
print(dense_layer.input_shape)
dens1 = Activation('relu')(dens1)
# softmax层
dens2 = Dense(nb_classes)(dens1)
output_random = Activation('softmax')(dens2)
model = Model(input=input_random,output=output_random)
print('finish build model')
model.compile(optimizer='adadelta',
loss='categorical_crossentropy',
metrics=['accuracy'])
In [12]:
input_static = Input(shape=(max_len,embedding_dim), name='main_input2')
# 卷积层
conv1 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)(input_static)
conv2 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)(input_static)
conv3 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)(input_static)
conv1 =GlobalMaxPooling1D()(conv1)
conv2 =GlobalMaxPooling1D()(conv2)
conv3 =GlobalMaxPooling1D()(conv3)
merged_vector = merge([conv1,conv2,conv3], mode='concat')
# 全连接层
dens1 = Dense(dense1_hindden)(merged_vector)
dens1 = Activation('relu')(dens1)
# softmax层
dens2 = Dense(nb_classes)(dens1)
output_static = Activation('softmax')(dens2)
model = Model(input=input_static,output=output_static)
print('finish build model')
model.compile(optimizer='adadelta',
loss='categorical_crossentropy',
metrics=['accuracy'])
In [32]:
print('Build model...')
input_non_static = Input(shape=(max_len,), dtype='int32', name='main_input1')
#初始化Embedding层
trained_embedding = get_trained_embedding(index_wordembedding=index_wordembedding)
embedding_layer = Embedding(max_features,
embedding_dim,
weights=[trained_embedding]
)
embedding = embedding_layer(input_non_static)
conv1 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)(embedding)
conv2 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)(embedding)
conv3 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)(embedding)
dropout = Dropout(0.5)
conv1 =GlobalMaxPooling1D()(conv1)
conv2 =GlobalMaxPooling1D()(conv2)
conv3 =GlobalMaxPooling1D()(conv3)
#conv1 = dropout(conv1)
#conv2 = dropout(conv2)
#conv3 = dropout(conv3)
merged_vector = merge([conv1,conv2,conv3], mode='concat')
# 全连接层
dense_layer = Dense(dense1_hindden)
dens1 = dense_layer(merged_vector)
print('dense_layer input shpae = ',dense_layer.input_shape)
dens1 = Activation('relu')(dens1)
dens1 = dropout(dens1)
# softmax层
dens2 = Dense(nb_classes)(dens1)
output_non_static = Activation('softmax')(dens2)
model = Model(input=input_non_static,output=output_non_static)
print('finish build model')
model.compile(optimizer='adadelta',
loss='categorical_crossentropy',
metrics=['accuracy'])
In [30]:
print('Build model...')
input1 = Input(shape=(max_len,), dtype='int32', name='main_input1')
input2 = Input(shape=(max_len,), name='main_input2')
#input3 = Input(shape=(max_len,), dtype='int32', name='main_input3')
embedding = Embedding(output_dim=embedding_dim, input_dim=max_features)
embedding1 = embedding(input1)
print('embedding1 output_shape = ',embedding.output_shape)
embedding2 = embedding(input2)
merged_vector = merge([embedding1,embedding2], mode='concat')
reshape = Reshape((2,max_len,embedding_dim))
word_sentiment = reshape(merged_vector)
print('reshape output_shape = ',reshape.output_shape)
conv_layer1 = Convolution2D(nb_filter, filter_size[0], filter_size[1],
activation='relu',
border_mode='valid')
conv1 = conv_layer1(word_sentiment)
print('conv_layer1 output shpae should be (100,35,1)',conv_layer1.output_shape)
maxpool = MaxPooling2D(pool_size=(34, 1))
conv1 = maxpool(conv1)
print('(100,1)==', maxpool.output_shape)
fatten = Flatten()
conv1 = fatten(conv1)
dens1 = Dense(dense1_hindden)(conv1)
dens1 = Activation('relu')(dens1)
dropout = Dropout(0.5)
dens1 = dropout(dens1)
dens2 = Dense(nb_classes)(dens1)
output = Activation('softmax')(dens2)
#model = Model(input=[input1,input2],output=output)
model = Model(input=[input1,input2],output=output)
print('finish build model')
model.compile(optimizer='adadelta',
loss='categorical_crossentropy',
metrics=['accuracy'])
In [42]:
print('Build model...')
input1 = Input(shape=(max_len,), dtype='int32', name='main_input1')
input2 = Input(shape=(max_len,), name='main_input2')
#input3 = Input(shape=(max_len,), dtype='int32', name='main_input3')
embedding = Embedding(output_dim=embedding_dim, input_dim=max_features)
embedding1 = embedding(input1)
embedding2 = embedding(input2)
merged_vector = merge([conv11,conv12], mode='concat')
reshape = Reshape((2,max_len,embedding_dim))
word_sentiment = reshape(merged_vector)
print('reshape input shpae should be (72,100)= ',reshape.input_shape)
#embedding3 = embedding(input3)
#---------------------------------------------------------------------------
#卷积方法一:每个通道,用不同的卷积核
'''
cov1_out1 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length,
border_mode = 'valid',
activation='relu'
)(embedding1)
cov1_out2 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length,
border_mode = 'valid',
activation='relu'
)(embedding2)
cov1_out3 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length,
border_mode = 'valid',
activation='relu'
)(embedding3)
'''
# 卷积方法二:每个通道用相同的卷积核
conv11 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)
conv12 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length2,
border_mode = 'valid',
activation='relu'
)
conv13 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length3,
border_mode = 'valid',
activation='relu'
)
conv14 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length1,
border_mode = 'valid',
activation='relu'
)
conv15 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length2,
border_mode = 'valid',
activation='relu'
)
conv16 = Convolution1D(nb_filter = nb_filter,
filter_length = filter_length3,
border_mode = 'valid',
activation='relu'
)
dropout = Dropout(0.5)
#第一个通道
cov1_out11 = conv11(embedding1)
cov1_out12 = conv12(embedding1)
cov1_out13 = conv13(embedding1)
'''
cov1_out11 = dropout(cov1_out11)
cov1_out12 = dropout(cov1_out12)
cov1_out13 = dropout(cov1_out13)
'''
'''
#第二个通道
cov1_out14 = conv14(embedding2)
cov1_out15 = conv15(embedding2)
cov1_out16 = conv16(embedding2)
'''
#第三个通道:
'''
cov1_out14 = dropout(cov1_out14)
cov1_out15 = dropout(cov1_out15)
cov1_out16 = dropout(cov1_out16)
'''
#cov1_out2 = conv(embedding2)
#cov1_out3 = conv(embedding3)
#------------------------------------------------------------------------------
maxpooling = GlobalMaxPooling1D()
conv11 = maxpooling(cov1_out11)
conv12 = maxpooling(cov1_out12)
conv13 = maxpooling(cov1_out13)
conv14 = maxpooling(cov1_out14)
conv15 = maxpooling(cov1_out15)
conv16 = maxpooling(cov1_out16)
#merged_vector = merge([conv11,conv12,conv13,conv14,conv15,conv16], mode='concat')
merged_vector = merge([conv11,conv12,conv13], mode='concat')
#dropout = Dropout(0.5)
#merged_vector = dropout(merged_vector)
dens1 = Dense(dense1_hindden)(merged_vector)
dens1 = Activation('relu')(dens1)
dens1 = dropout(dens1)
dens2 = Dense(nb_classes)(dens1)
output = Activation('softmax')(dens2)
#model = Model(input=[input1,input2],output=output)
model = Model(input=[input1],output=output)
print('finish build model')
model.compile(optimizer='adadelta',
loss='categorical_crossentropy',
metrics=['accuracy'])
In [31]:
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot
SVG(model_to_dot(model).create(prog='dot', format='svg'))
Out[31]:
In [19]:
print(type(train_y[0]))
train_y_model = np_utils.to_categorical(train_y, nb_classes)
dev_y_model = np_utils.to_categorical(dev_y, nb_classes)
train_X_model = sequence.pad_sequences(train_X, maxlen=max_len)
dev_X_model = sequence.pad_sequences(dev_X, maxlen=max_len)
train_sentiment_X_model = sequence.pad_sequences(train_sentiment_X,maxlen=max_len)
train_tag_X_model= sequence.pad_sequences(train_tag_X,maxlen=max_len)
dev_sentiment_X_model = sequence.pad_sequences(dev_sentiment_X,maxlen=max_len)
dev_tag_X_model = sequence.pad_sequences(dev_tag_X,maxlen=max_len)
#train_embedding_X_model = batch_indexData_embedding(X=train_X_model,index_wordembedding=index_wordembedding)
dev_embedding_X_model = batch_indexData_embedding(X=dev_X_model,index_wordembedding=index_wordembedding)
dev_embedding_X_model = np.array(dev_embedding_X_model)
In [20]:
#转为index
def to_index(word_index={},data=[]):
return [[word_index[w] if w in word_index else 0 for w in sentence] for sentence in data]
test_index_X = to_index(word_index,test_X)
test_sentiment_X = to_index(word_index,test_sentiment_X)
test_tag_X = to_index(word_index,test_tag_X)
#删补
test_index_X_model = sequence.pad_sequences(test_index_X, maxlen=max_len)
test_sentiment_X_model = sequence.pad_sequences(test_sentiment_X, maxlen=max_len)
test_tag_X_model = sequence.pad_sequences(test_tag_X, maxlen=max_len)
#embedding
test_embedding_X = batch_indexData_embedding(X=test_index_X,index_wordembedding=index_wordembedding)
test_y_model = np_utils.to_categorical(test_y, nb_classes)
## test
In [21]:
def my_generator4(X1=None,X2=None,y=None):
i = 0
max_i = int(len(X1)/batch_size)
while True:
i = i % max_i
x1_batch = X1[i*batch_size:(i+1)*batch_size]
x2_batch = X2[i*batch_size:(i+1)*batch_size]
#x3_batch = X3[i*batch_size:(i+1)*batch_size]
y_batch = y[i*batch_size:(i+1)*batch_size]
yield ([x1_batch,x2_batch],y_batch)
i = i + 1
def my_generator3(X1=None,y=None):
i = 0
max_i = int(len(X1)/batch_size)
while True:
i = i % max_i
x1_batch = X1[i*batch_size:(i+1)*batch_size]
x2_batch = batch_indexData_embedding(X=x1_batch,index_wordembedding=index_wordembedding)
x2_batch = np.array(x2_batch)
y_batch = y[i*batch_size:(i+1)*batch_size]
yield ([x1_batch,x2_batch],y_batch)
i = i + 1
def my_generator1(X1=None,y=None):
i = 0
max_i = int(len(X1)/batch_size)
while True:
i = i % max_i
x1_batch = X1[i*batch_size:(i+1)*batch_size]
y_batch = y[i*batch_size:(i+1)*batch_size]
yield (x1_batch,y_batch)
i = i + 1
def my_generator2(X1=None,y=None):
i = 0
max_i = int(len(X1)/batch_size)
while True:
i = i % max_i
x1_batch = X1[i*batch_size:(i+1)*batch_size]
x1_batch = batch_indexData_embedding(X=x1_batch,index_wordembedding=index_wordembedding)
x1_batch = np.array(x1_batch)
y_batch = y[i*batch_size:(i+1)*batch_size]
yield (x1_batch,y_batch)
i = i + 1
In [26]:
model.fit_generator(my_generator1(train_X_model,train_y_model),samples_per_epoch = 32*100,nb_epoch=100,verbose=1,validation_data=(dev_X_model,dev_y_model))
time | max_len | batch_size | max_features | embedding_dims | nb_filter | filter_length | dense1_hindden | val_acc |
---|---|---|---|---|---|---|---|---|
2016-11-25 9:52 | 36 | 50 | 14526 | 100 | 各100 | 3,4,5 | 300 | 0.4169 |
In [22]:
model.fit_generator(my_generator4(train_X_model,train_sentiment_X_model,train_y_model),samples_per_epoch = 32*100,nb_epoch=100,verbose=1,validation_data=([test_embedding_X,test_sentiment_X_model],test_y))
time | max_len | batch_size | max_features | embedding_dims | nb_filter | filter_length | dense1_hindden | val_acc |
---|---|---|---|---|---|---|---|---|
2016-11-25 9:52 | 36 | 50 | 14526 | 100 | 各100 | 3,4,5 | 300 | 0.4253 |
In [ ]:
In [34]:
model.fit_generator(my_generator1(train_X_model,train_y_model),samples_per_epoch = 50*40,nb_epoch=100,verbose=1,validation_data=(test_index_X,test_y))
Out[34]:
time | max_len | batch_size | max_features | embedding_dims | nb_filter | filter_length | dense1_hindden | val_acc |
---|---|---|---|---|---|---|---|---|
2016-11-25 9:52 | 36 | 50 | 14526 | 100 | 各100 | 3,4,5 | 300 | 0.4204 |
2016-11-26 9:52 | 36 | 50 | 14526 | 100 | 各100 | 3,4,5 | 300 | 0.4471 |
In [29]:
#model.fit_generator(my_generator1(train_X_model,train_y_model),samples_per_epoch = 50*60,nb_epoch=100,verbose=1,validation_data=([test_index_X_model],test_y))
model.fit_generator(my_generator4(train_X_model,train_sentiment_X_model,train_y_model),samples_per_epoch = 50*60,nb_epoch=100,verbose=1,validation_data=([test_index_X_model,test_sentiment_X_model],test_y_model))
In [32]:
model.fit_generator(my_generator4(train_X_model,train_sentiment_X_model,train_y_model),samples_per_epoch = 50*60,nb_epoch=100,verbose=1,validation_data=([test_index_X_model,test_sentiment_X_model],test_y_model))
Out[32]:
model | max_len | batch_size | max_features | embedding_dims | nb_filter | filter_length | dense1_hindden | val_acc |
---|---|---|---|---|---|---|---|---|
word+sentiment | 36 | 50 | 14526 | 100 | 各100(600) | 3,4,5 | 300 | 0.4303 |
不加sentiment的结果:0.4285,0.4348,0.4348,0.4312
time | max_len | batch_size | max_features | embedding_dims | nb_filter | filter_length | dense1_hindden | val_acc |
---|---|---|---|---|---|---|---|---|
2016-11-25 9:52 | 36 | 32 | 14526 | 100 | 各100 | 3,4,5 | 300 | 0.4124 |
In [ ]: