In [101]:
from __future__ import print_function
import numpy as np
np.random.seed(1337) # for reproducibility
from os.path import join
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Lambda, Input
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras import backend as K
def character_vec_dict(vec_path=''):
    print('loading character embedding')
    vec_dic = {}
    with open(vec_path) as fr:
        lines = fr.readlines()
    # skip the word2vec header line; each remaining line is "char v1 v2 ... v200"
    for line in lines[1:]:
        try:
            v = [float(i) for i in line[1:].split()]
            vec_dic[line[0]] = v
        except ValueError:
            # skip malformed lines
            pass
    # the padding character u'*' maps to an all-zero vector
    vec_dic[u'*'] = [float(0) for _ in range(200)]
    return vec_dic
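# NOTE (illustration, not from the original notebook): character_vec_dict assumes a
# word2vec-style text file, i.e. a header line (skipped via lines[1:]) followed by one
# line per character, "char v1 v2 ... v200". A toy line, parsed the same way:
toy_line = u'好 0.1 -0.2 0.3'
print(toy_line[0])                               # the character itself
print([float(i) for i in toy_line[1:].split()])  # its embedding values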
def sen_len(train_path=''):
    print('calculating sentence length')
    with open(join(train_path, 'train_X')) as fr:
        lines = fr.readlines()
    length = sorted(len(line.strip()) for line in lines)
    # use (roughly) the 90th-percentile sentence length as the padding length
    return length[int(len(length) * 0.9)]
def pad(X=[], padlen=None):
    print('padding sentences')
    print('X[0] must be unicode, got type =', type(X[0]))
    for index, sen in enumerate(X):
        num = len(sen) - padlen
        if num >= 0:
            # too long: truncate to padlen
            X[index] = sen[0:padlen]
        else:
            # too short: right-pad with the padding character u'*'
            X[index] = sen + (-1 * num) * u'*'
    print('pad to length = ', padlen)
    return X
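# Quick sanity check of pad() on toy strings (illustration only, not corpus data):
# long sentences are truncated to padlen, short ones are right-padded with u'*'.
print(pad([u'abcdefgh', u'ab'], padlen=5))       # expected: [u'abcde', u'ab***']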
def embedding(x_batch=[], vec_dict=None):
    x_batch_embedding = []
    for sen in x_batch:
        # unknown characters fall back to the zero vector stored under u'*'
        sen_embedding = [vec_dict[ch] if ch in vec_dict else vec_dict[u'*'] for ch in sen]
        x_batch_embedding.append(sen_embedding)
    return x_batch_embedding
# generator: yields one batch of data at a time
def my_generator(data_path='', batch_size=None):
    '''
    1. Load the character vectors into a dictionary.
    2. Read the training data.
    3. Pad every sentence to the same length.
    4. Yield one batch of sentence representations at a time.
    '''
    with open(join(data_path, 'train_X')) as fr:
        lines = fr.readlines()
    X_str = [line.strip() for line in lines]
    with open(join(data_path, 'train_y')) as fr:
        lines = fr.readlines()
    y = [int(line.strip()) for line in lines]
    print(len(X_str))
    print(len(y))
    assert len(X_str) == len(y), 'error: train_X and train_y differ in length'
    # load the character vectors
    vec_dict = character_vec_dict(join(data_path, 'wikiw2v_zh_zi.txt'))
    # pad X
    padlen = sen_len(data_path)
    X_str = pad(X_str, padlen)
    i = 0
    max_i = int(len(X_str) / batch_size)
    while True:
        i = i % max_i
        x_batch = X_str[i * batch_size:(i + 1) * batch_size]
        y_batch = y[i * batch_size:(i + 1) * batch_size]
        x_batch = embedding(x_batch, vec_dict)
        x_batch = np.array(x_batch)
        y_batch = np.array(y_batch)
        yield (x_batch, y_batch)
        i = i + 1
def get_input_shape(data_path=''):
    # (padded sentence length, character embedding dimension)
    padlen = sen_len(data_path)
    return (padlen, 200)
def get_validata_shape(data_path='', pad_len=None):
    # despite the name, this returns the padded validation data: (test_X, test_y)
    with open(join(data_path, 'test_X')) as fr:
        lines = fr.readlines()
    print('len lines = ', len(lines))
    test_X = [line.strip() for line in lines]
    with open(join(data_path, 'test_y')) as fr:
        lines = fr.readlines()
    test_y = [int(line.strip()) for line in lines]
    print(len(test_X))
    print(len(test_y))
    assert len(test_X) == len(test_y), 'test_X and test_y differ in length'
    # load the character vectors
    vec_dict = character_vec_dict(join(data_path, 'wikiw2v_zh_zi.txt'))
    test_X = pad(test_X, pad_len)
    print('len vec_dict = ', len(vec_dict))
    print('len test_X = ', len(test_X))
    test_X = embedding(test_X, vec_dict)
    test_X = np.array(test_X)
    test_y = np.array(test_y)
    return test_X, test_y
In [110]:
# set parameters:
batch_size = 32
embedding_dims = 200
nb_filter = 150
filter_length = 2
hidden_dims = 300
nb_epoch = 2
data_path = '/home/bruce/code/DLNLP/data'
input_shape = get_input_shape(data_path)
print('input_shape = ', input_shape)
# pad the validation data to the same length as the training data
test_X, test_y = get_validata_shape(data_path, input_shape[0])
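# Optional sanity check (a sketch, assuming the data files above are in place):
# each batch from the generator should be (batch_size, padlen, 200) and (batch_size,).
gen = my_generator(data_path, batch_size=4)
x_batch, y_batch = next(gen)
print('x_batch shape = ', x_batch.shape)         # expected: (4, input_shape[0], 200)
print('y_batch shape = ', y_batch.shape)         # expected: (4,)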
##########################################################################
print('Build model...')
model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1,
                        input_shape=input_shape))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
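# For reference (a sketch using the model built above): with border_mode='valid' the
# convolution sees padlen character vectors and outputs padlen - filter_length + 1
# positions per filter before global max pooling; model.summary() should confirm this.
model.summary()
conv_out_len = input_shape[0] - filter_length + 1
print('expected Convolution1D output shape:', (None, conv_out_len, nb_filter))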
'''
# alternative: the same model written with the functional API
cnn_in = Input(shape=input_shape, name='cnn_in')
c1 = Convolution1D(nb_filter=nb_filter, filter_length=2, border_mode='valid',
                   activation='relu', subsample_length=1, name='c1')(cnn_in)
#c1 = Dropout(0.2)(c1)
def max_1d(X):
    return K.max(X, axis=1)
maxpool = Lambda(max_1d, output_shape=(nb_filter,))
c1 = maxpool(c1)
dense_out = Dense(nb_filter, activation='relu')(c1)
#dense_out = Dropout(0.2)(dense_out)
loss = Dense(1, activation='sigmoid', name='loss')(dense_out)
model = Model(input=cnn_in, output=loss)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
'''
Out[110]:
In [111]:
import time
# full-dataset alternative (20105 training examples per epoch):
#model.fit_generator(my_generator(data_path,batch_size=64),samples_per_epoch = 20105,nb_epoch=10,verbose=1,validation_data=(test_X,test_y))
model.fit_generator(my_generator(data_path, batch_size=64),
                    samples_per_epoch=64 * 20,
                    nb_epoch=50,
                    verbose=1,
                    validation_data=(test_X, test_y))
Out[111]:
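# After training, the held-out accuracy can be checked directly on the padded
# validation arrays (a sketch using the test_X/test_y loaded earlier):
score, acc = model.evaluate(test_X, test_y, batch_size=batch_size, verbose=0)
print('validation loss = ', score)
print('validation accuracy = ', acc)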
In [ ]:
In [10]:
# character_vec_dict is defined in the first cell above; reload the dictionary here.
vd = character_vec_dict(join(data_path,'wikiw2v_zh_zi.txt'))
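# Quick sanity check on the loaded dictionary (a sketch): every vector should have the
# 200 dimensions the model expects, and the padding character u'*' must be present.
print('characters loaded = ', len(vd))
assert u'*' in vd
assert all(len(v) == 200 for v in vd.values()), 'unexpected embedding dimension'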
In [63]:
def split_data(file_path=''):
    train_data = []
    train_y = []
    test_data = []
    test_y = []
    # positive examples: keep the last 500 lines for the test set
    with open(join(file_path, 'pos')) as fr:
        lines = fr.readlines()
    lines = [line for line in lines if len(line.strip()) > 0]
    print('total pos lines =', len(lines))
    train_data = train_data + lines[:-500]
    train_y = [1 for _ in range(len(train_data))]
    test_data = test_data + lines[-500:]
    test_y = [1 for _ in range(500)]
    # negative examples: keep the last 500 lines for the test set
    with open(join(file_path, 'neg')) as fr:
        lines = fr.readlines()
    lines = [line for line in lines if len(line.strip()) > 0]
    print('total neg lines =', len(lines))
    train_data = train_data + lines[:-500]
    test_data = test_data + lines[-500:]
    train_y = train_y + [0 for _ in range(len(lines[:-500]))]
    test_y = test_y + [0 for _ in range(500)]
    assert len(train_data) == len(train_y), 'train_data and train_y differ in length'
    assert len(test_data) == len(test_y), 'test_data and test_y differ in length'
    # shuffle the train and test sets independently
    index_train = list(range(len(train_data)))
    index_test = list(range(len(test_data)))
    np.random.shuffle(index_train)
    np.random.shuffle(index_test)
    train_data = np.array(train_data)
    train_y = np.array(train_y)
    train_data = train_data[index_train]
    train_y = train_y[index_train]
    test_data = np.array(test_data)
    test_y = np.array(test_y)
    test_data = test_data[index_test]
    test_y = test_y[index_test]
    assert len(test_data) == len(test_y)
    print(len(test_data))
    print(len(test_y))
    # write the shuffled splits to disk
    with open(join(file_path, 'train_X'), 'w') as fw:
        for line in train_data:
            fw.write(line.strip() + '\n')
    with open(join(file_path, 'train_y'), 'w') as fw:
        for line in train_y:
            fw.write(str(line) + '\n')
    with open(join(file_path, 'test_X'), 'w') as fw:
        print('test_data', len(test_data))
        for line in test_data:
            fw.write(line.strip() + '\n')
    with open(join(file_path, 'test_y'), 'w') as fw:
        for line in test_y:
            fw.write(str(line) + '\n')
split_data('/home/bruce/code/DLNLP/data')
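# Quick consistency check on the files written by split_data (a sketch, assuming the
# call above has run): X and y files should line up, with 1000 test examples in total.
for name in ['train_X', 'train_y', 'test_X', 'test_y']:
    with open(join('/home/bruce/code/DLNLP/data', name)) as fr:
        print(name, len(fr.readlines()))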
In [ ]: