In [101]:
from __future__ import print_function

import numpy as np
np.random.seed(1337)  # for reproducibility
from os.path import join
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Lambda, Input
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras import backend as K

from keras.models import Model

def character_vec_dict(vec_path=''):
    """Load character embeddings from a word2vec-style text file into a dict."""
    print('loading character embedding')
    vec_dic = {}
    with open(vec_path) as fr:
        lines = fr.readlines()
        # skip the word2vec header line (vocab size and dimension)
        for line in lines[1:]:
            try:
                v = [float(i) for i in line[1:].split()]
                vec_dic[line[0]] = v  # the first character of the line is the token
            except ValueError:
                pass
    # '*' is the padding character; map it to a zero vector
    vec_dic[u'*'] = [0.0 for _ in range(200)]
    return vec_dic
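# A minimal format check (hypothetical sample line, not read from the real file):
# the parser above assumes a word2vec-style text file whose first line is a
# header and whose remaining lines start with one character followed by 200 floats.
_sample = u'好 ' + ' '.join(['0.1'] * 200)
assert _sample[0] == u'好' and len(_sample[1:].split()) == 200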

def sen_len(train_path=''):
    """Return the 90th-percentile sentence length of the training set."""
    print('calculating sentence length')
    with open(join(train_path, 'train_X')) as fr:
        lines = fr.readlines()
    length = sorted(len(line.strip()) for line in lines)
    return length[int(len(length) * 0.9)]
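# e.g. for sorted lengths [3, 5, 7, 9, 11] this returns length[int(5*0.9)] == length[4] == 11,
# i.e. roughly the 90th-percentile sentence length.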
def pad(X=[], padlen=None):
    """Truncate or pad every sentence in X to exactly padlen characters."""
    print('padding sentence ')
    print('X[0] type must be unicode =', type(X[0]))
    for index, sen in enumerate(X):
        num = len(sen) - padlen
        if num >= 0:
            X[index] = sen[0:padlen]          # truncate long sentences
        else:
            X[index] = sen + (-num) * u'*'    # pad short sentences with '*'
    print('pad to length = ', padlen)
    return X
def embedding(x_batch=[], vec_dict=None):
    """Map each character in each sentence to its embedding vector."""
    x_batch_embedding = []
    for sen in x_batch:
        # characters missing from the dictionary fall back to the zero vector stored under '*'
        sen_embedding = [vec_dict[ch] if ch in vec_dict else vec_dict[u'*'] for ch in sen]
        x_batch_embedding.append(sen_embedding)
    return x_batch_embedding
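# Quick shape sanity check (toy data added for illustration, not part of the pipeline):
# a single 4-character sentence embeds to a (1, 4, 200) array.
_toy_dict = {u'*': [0.0] * 200}
assert np.array(embedding([u'****'], _toy_dict)).shape == (1, 4, 200)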


# generator that yields one batch of data at a time
def my_generator(data_path='', batch_size=None):
    '''
    1. Load the character embeddings into a dict.
    2. Read the training data.
    3. Pad all sentences to the same length.
    4. Yield one batch of embedded sentences at a time.
    '''
    with open(join(data_path, 'train_X')) as fr:
        lines = fr.readlines()
        X_str = [line.strip() for line in lines]
    with open(join(data_path, 'train_y')) as fr:
        lines = fr.readlines()
        y = [int(line.strip()) for line in lines]
    print(len(X_str))
    print(len(y))
    assert len(X_str) == len(y), 'error: train_X and train_y have different lengths'
    # load character embeddings
    vec_dict = character_vec_dict(join(data_path, 'wikiw2v_zh_zi.txt'))
    # pad X
    padlen = sen_len(data_path)
    X_str = pad(X_str, padlen)
    i = 0
    max_i = int(len(X_str) / batch_size)
    while True:
        i = i % max_i
        x_batch = X_str[i * batch_size:(i + 1) * batch_size]
        y_batch = y[i * batch_size:(i + 1) * batch_size]
        x_batch = embedding(x_batch, vec_dict)
        x_batch = np.array(x_batch)
        y_batch = np.array(y_batch)
        yield (x_batch, y_batch)
        i = i + 1
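# Each yield from my_generator is a tuple of numpy arrays:
#   x_batch: shape (batch_size, padlen, 200), one embedding vector per character
#   y_batch: shape (batch_size,), the 0/1 sentiment labels
# Usage (requires the data files under data_path): next(my_generator(data_path, 64))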
def get_input_shape(data_path=''):
    # the model input is (padded sentence length, embedding dimension)
    padlen = sen_len(data_path)
    return (padlen, 200)
def get_validata_shape(data_path='', pad_len=None):
    """Load, pad and embed the held-out test set; returns (test_X, test_y)."""
    with open(join(data_path, 'test_X')) as fr:
        lines = fr.readlines()
        print('len lines = ', len(lines))
        test_X = [line.strip() for line in lines]
    with open(join(data_path, 'test_y')) as fr:
        lines = fr.readlines()
        test_y = [int(line.strip()) for line in lines]
    print(len(test_X))
    print(len(test_y))
    assert len(test_X) == len(test_y), 'test_X and test_y have different lengths'
    # load character embeddings
    vec_dict = character_vec_dict(join(data_path, 'wikiw2v_zh_zi.txt'))
    test_X = pad(test_X, pad_len)
    print('len vec_dict = ', len(vec_dict))
    print('len text_X = ', len(test_X))
    test_X = embedding(test_X, vec_dict)
    test_X = np.array(test_X)
    test_y = np.array(test_y)
    return test_X, test_y

In [110]:
print('Build model...')
# set parameters:
batch_size = 32
embedding_dims = 200
nb_filter = 150
filter_length = 2
hidden_dims = 300
nb_epoch = 2
data_path = '/home/bruce/code/DLNLP/data'
input_shape = get_input_shape(data_path)
print('input_shape = ',input_shape)
test_X, test_y = get_validata_shape(data_path, input_shape[0])  # pad the test set to the training pad length (188)


##########################################################################
print('Build model...')
model = Sequential()
# the inputs are already sequences of embedding_dims-dimensional character
# vectors, so no Embedding layer is needed here

# we add a Convolution1D, which will learn nb_filter
# character n-gram filters of size filter_length:
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1,
                        input_shape=input_shape))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
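# Note: the layer arguments above use the Keras 1 API. On Keras 2 the rough
# equivalent (a sketch, not run here) would be:
#   from keras.layers import Conv1D
#   Conv1D(filters=nb_filter, kernel_size=filter_length, padding='valid',
#          activation='relu', strides=1, input_shape=input_shape)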

'''
cnn_in = Input(shape=input_shape,name='cnn_in')

c1 = Convolution1D(nb_filter=nb_filter,filter_length=2,border_mode='valid',activation='relu',subsample_length=1,name='c1')(cnn_in)
#c1=Dropout(0.2)(c1)
def max_1d(X):
    return K.max(X, axis=1)
maxpool = Lambda(max_1d, output_shape=(nb_filter,))
c1 = maxpool(c1)
dense_out = Dense(nb_filter,activation='relu')(c1)
#dense_out = Dropout(0.2)(dense_out)
loss = Dense(1, activation='sigmoid', name='loss')(dense_out)
model = Model(input=cnn_in, output=loss)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
'''
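# The triple-quoted block above is an unused functional-API version of the same
# model; because it is the last expression in the cell, Jupyter echoes it as Out[110].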


Build model...
calculating sentence length
input_shape =  (188, 200)
len lines =  1000
1000
1000
loading character embedding
padding sentence 
X[0] type must be unicode = <class 'str'>
pad to length =  188
len vec_dict =  10128
len text_X =  1000
Build model...
Out[110]:
"\ncnn_in = Input(shape=input_shape,name='cnn_in')\n\nc1 = Convolution1D(nb_filter=nb_filter,filter_length=2,border_mode='valid',activation='relu',subsample_length=1,name='c1')(cnn_in)\n#c1=Dropout(0.2)(c1)\ndef max_1d(X):\n    return K.max(X, axis=1)\nmaxpool = Lambda(max_1d, output_shape=(nb_filter,))\nc1 = maxpool(c1)\ndense_out = Dense(nb_filter,activation='relu')(c1)\n#dense_out = Dropout(0.2)(dense_out)\nloss = Dense(1, activation='sigmoid', name='loss')(dense_out)\nmodel = Model(input=cnn_in, output=loss)\nmodel.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])\n"

In [111]:
import time

# model_merge.fit([X_train,doc_train],y_train,nb_epoch=15,batch_size= 32,verbose=2, validation_data=([X_test,doc_test],y_test))
#model_merge.fit(doc_train,y_train,nb_epoch=15,batch_size= 32,verbose=2, validation_data=(doc_test,y_test))
#model.fit_generator(my_generator(data_path,batch_size=64),samples_per_epoch = 20105,nb_epoch=10,verbose=1,validation_data=(test_X,test_y))
#model.fit_generator(myGenerator(), , nb_epoch = 2, verbose=2, show_accuracy=True, callbacks=[], validation_data=None, class_weight=None, nb_worker=1)

# model_merge.fit(X_train,y_train,nb_epoch=15,batch_size= 32,verbose=2, validation_data=(X_test,y_test))
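# With batch_size=64 and samples_per_epoch=64*20, each epoch only visits 1280 of
# the ~20105 training sentences; setting samples_per_epoch to the full training
# size would cover the whole set once per epoch.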

model.fit_generator(my_generator(data_path,batch_size=64),samples_per_epoch = 64*20,nb_epoch=50,verbose=1,validation_data=(test_X,test_y))


Epoch 1/50
20105
20105
loading character embedding
calculating sentence length
padding sentence 
X[0] type must be unicode = <class 'str'>
pad to length =  188
1280/1280 [==============================] - 5s - loss: 1.0273 - acc: 0.5578 - val_loss: 0.8065 - val_acc: 0.5080
Epoch 2/50
1280/1280 [==============================] - 3s - loss: 0.6096 - acc: 0.6992 - val_loss: 0.6891 - val_acc: 0.6210
Epoch 3/50
1280/1280 [==============================] - 3s - loss: 0.5037 - acc: 0.7766 - val_loss: 0.7025 - val_acc: 0.6610
Epoch 4/50
1280/1280 [==============================] - 3s - loss: 0.4865 - acc: 0.7688 - val_loss: 0.5842 - val_acc: 0.7140
Epoch 5/50
1280/1280 [==============================] - 3s - loss: 0.4311 - acc: 0.8062 - val_loss: 0.5391 - val_acc: 0.7300
Epoch 6/50
1280/1280 [==============================] - 3s - loss: 0.4214 - acc: 0.8234 - val_loss: 0.5344 - val_acc: 0.7440
Epoch 7/50
1280/1280 [==============================] - 3s - loss: 0.3570 - acc: 0.8414 - val_loss: 0.6445 - val_acc: 0.6900
Epoch 8/50
1280/1280 [==============================] - 3s - loss: 0.4023 - acc: 0.8188 - val_loss: 0.4886 - val_acc: 0.7870
Epoch 9/50
1280/1280 [==============================] - 3s - loss: 0.3340 - acc: 0.8641 - val_loss: 0.4805 - val_acc: 0.7860
Epoch 10/50
1280/1280 [==============================] - 3s - loss: 0.3464 - acc: 0.8609 - val_loss: 0.4885 - val_acc: 0.7770
Epoch 11/50
1280/1280 [==============================] - 3s - loss: 0.3282 - acc: 0.8484 - val_loss: 0.4527 - val_acc: 0.7940
Epoch 12/50
1280/1280 [==============================] - 3s - loss: 0.3646 - acc: 0.8422 - val_loss: 0.4709 - val_acc: 0.8070
Epoch 13/50
1280/1280 [==============================] - 3s - loss: 0.3153 - acc: 0.8680 - val_loss: 0.4375 - val_acc: 0.8130
Epoch 14/50
1280/1280 [==============================] - 3s - loss: 0.3308 - acc: 0.8648 - val_loss: 0.4814 - val_acc: 0.7970
Epoch 15/50
1280/1280 [==============================] - 3s - loss: 0.3194 - acc: 0.8680 - val_loss: 0.4694 - val_acc: 0.7900
Epoch 16/50
1280/1280 [==============================] - 3s - loss: 0.3107 - acc: 0.8781 - val_loss: 0.4724 - val_acc: 0.8000
Epoch 17/50
1280/1280 [==============================] - 3s - loss: 0.3079 - acc: 0.8656 - val_loss: 0.4887 - val_acc: 0.7950
Epoch 18/50
1280/1280 [==============================] - 3s - loss: 0.2932 - acc: 0.8750 - val_loss: 0.5450 - val_acc: 0.7580
Epoch 19/50
1280/1280 [==============================] - 3s - loss: 0.2672 - acc: 0.9031 - val_loss: 0.6643 - val_acc: 0.7470
Epoch 20/50
1280/1280 [==============================] - 3s - loss: 0.2602 - acc: 0.8938 - val_loss: 0.4973 - val_acc: 0.7990
Epoch 21/50
1280/1280 [==============================] - 3s - loss: 0.2543 - acc: 0.8992 - val_loss: 0.4675 - val_acc: 0.8130
Epoch 22/50
1280/1280 [==============================] - 3s - loss: 0.2818 - acc: 0.8891 - val_loss: 0.5019 - val_acc: 0.7880
Epoch 23/50
1280/1280 [==============================] - 3s - loss: 0.2318 - acc: 0.9062 - val_loss: 0.4610 - val_acc: 0.8050
Epoch 24/50
1280/1280 [==============================] - 3s - loss: 0.2471 - acc: 0.9047 - val_loss: 0.4315 - val_acc: 0.8190
Epoch 25/50
1280/1280 [==============================] - 3s - loss: 0.2416 - acc: 0.9133 - val_loss: 0.4341 - val_acc: 0.8170
Epoch 26/50
1280/1280 [==============================] - 3s - loss: 0.2276 - acc: 0.9156 - val_loss: 0.4981 - val_acc: 0.8150
Epoch 27/50
1280/1280 [==============================] - 3s - loss: 0.2470 - acc: 0.9000 - val_loss: 0.4841 - val_acc: 0.8340
Epoch 28/50
1280/1280 [==============================] - 3s - loss: 0.2564 - acc: 0.9031 - val_loss: 0.4196 - val_acc: 0.8260
Epoch 29/50
1280/1280 [==============================] - 3s - loss: 0.2399 - acc: 0.9086 - val_loss: 0.4347 - val_acc: 0.8050
Epoch 30/50
1280/1280 [==============================] - 3s - loss: 0.2438 - acc: 0.8992 - val_loss: 0.4292 - val_acc: 0.8240
Epoch 31/50
1280/1280 [==============================] - 3s - loss: 0.2089 - acc: 0.9172 - val_loss: 0.4660 - val_acc: 0.8220
Epoch 32/50
1280/1280 [==============================] - 3s - loss: 0.1928 - acc: 0.9297 - val_loss: 0.5130 - val_acc: 0.8170
Epoch 33/50
1280/1280 [==============================] - 3s - loss: 0.2350 - acc: 0.9023 - val_loss: 0.4489 - val_acc: 0.8060
Epoch 34/50
1280/1280 [==============================] - 3s - loss: 0.2176 - acc: 0.9133 - val_loss: 0.5324 - val_acc: 0.7810
Epoch 35/50
1280/1280 [==============================] - 3s - loss: 0.2311 - acc: 0.9078 - val_loss: 0.5802 - val_acc: 0.7680
Epoch 36/50
1280/1280 [==============================] - 3s - loss: 0.2126 - acc: 0.9305 - val_loss: 0.4707 - val_acc: 0.8010
Epoch 37/50
1280/1280 [==============================] - 3s - loss: 0.1927 - acc: 0.9297 - val_loss: 0.4600 - val_acc: 0.8320
Epoch 38/50
1280/1280 [==============================] - 3s - loss: 0.2013 - acc: 0.9219 - val_loss: 0.5154 - val_acc: 0.8170
Epoch 39/50
1280/1280 [==============================] - 3s - loss: 0.1856 - acc: 0.9305 - val_loss: 0.5290 - val_acc: 0.8170
Epoch 40/50
1280/1280 [==============================] - 3s - loss: 0.2137 - acc: 0.9109 - val_loss: 0.5503 - val_acc: 0.8120
Epoch 41/50
1280/1280 [==============================] - 3s - loss: 0.1617 - acc: 0.9398 - val_loss: 0.4893 - val_acc: 0.8210
Epoch 42/50
1280/1280 [==============================] - 3s - loss: 0.1586 - acc: 0.9398 - val_loss: 0.4213 - val_acc: 0.8380
Epoch 43/50
1280/1280 [==============================] - 3s - loss: 0.1929 - acc: 0.9211 - val_loss: 0.4116 - val_acc: 0.8430
Epoch 44/50
1280/1280 [==============================] - 3s - loss: 0.1665 - acc: 0.9336 - val_loss: 0.4224 - val_acc: 0.8440
Epoch 45/50
1280/1280 [==============================] - 3s - loss: 0.1822 - acc: 0.9336 - val_loss: 0.4286 - val_acc: 0.8360
Epoch 46/50
1280/1280 [==============================] - 3s - loss: 0.1943 - acc: 0.9227 - val_loss: 0.5650 - val_acc: 0.8210
Epoch 47/50
1280/1280 [==============================] - 3s - loss: 0.1525 - acc: 0.9320 - val_loss: 0.4923 - val_acc: 0.8250
Epoch 48/50
1280/1280 [==============================] - 3s - loss: 0.1777 - acc: 0.9328 - val_loss: 0.4678 - val_acc: 0.8320
Epoch 49/50
1280/1280 [==============================] - 3s - loss: 0.1863 - acc: 0.9281 - val_loss: 0.4607 - val_acc: 0.8220
Epoch 50/50
1280/1280 [==============================] - 3s - loss: 0.1508 - acc: 0.9461 - val_loss: 0.5971 - val_acc: 0.7770
Out[111]:
<keras.callbacks.History at 0x7fe7919886a0>

In [ ]:


In [10]:
def character_vec_dict(vec_path=''):
    """Load character embeddings from a word2vec-style text file into a dict."""
    print('loading character embedding')
    vec_dic = {}
    with open(vec_path) as fr:
        lines = fr.readlines()
        for line in lines[1:]:          # skip the header line
            try:
                v = [float(i) for i in line[1:].split()]
                vec_dic[line[0]] = v
            except ValueError:
                pass
    vec_dic[u'*'] = [0.0 for _ in range(200)]  # padding character -> zero vector
    return vec_dic
vd = character_vec_dict(join(data_path,'wikiw2v_zh_zi.txt'))


loading character embedding

In [63]:
def split_data(file_path=''):
    """Split the raw 'pos' and 'neg' files into shuffled train/test sets,
    holding out the last 500 lines of each class for testing."""
    train_data = []
    train_y = []
    test_data = []
    test_y = []

    with open(join(file_path, 'pos')) as fr:
        lines = fr.readlines()
        lines = [line for line in lines if len(line.strip()) > 0]
        print('total pos line=', len(lines))
        train_data = train_data + lines[:-500]
        train_y = [1 for _ in range(len(train_data))]  # all positive so far
        test_data = test_data + lines[-500:]
        test_y = [1 for _ in range(500)]
    with open(join(file_path, 'neg')) as fr:
        lines = fr.readlines()
        lines = [line for line in lines if len(line.strip()) > 0]
        print('total neg lines =', len(lines))
        train_data = train_data + lines[:-500]
        test_data = test_data + lines[-500:]
        train_y = train_y + [0 for _ in range(len(lines[:-500]))]
        test_y = test_y + [0 for _ in range(500)]
    assert len(train_data) == len(train_y), 'train_data and train_y differ in length'
    assert len(test_data) == len(test_y), 'test_data and test_y differ in length'
    # shuffle train and test sets, applying the same permutation to X and y
    index_train = list(range(len(train_data)))
    index_test = list(range(len(test_data)))
    np.random.shuffle(index_train)
    np.random.shuffle(index_test)
    train_data = np.array(train_data)
    train_y = np.array(train_y)
    train_data = train_data[index_train]
    train_y = train_y[index_train]
    test_data = np.array(test_data)
    test_y = np.array(test_y)
    test_data = test_data[index_test]
    test_y = test_y[index_test]
    assert len(test_data) == len(test_y)
    print(len(test_data))
    print(len(test_y))

    # write the split out to disk
    with open(join(file_path, 'train_X'), 'w') as fw:
        for line in train_data:
            fw.write(line.strip() + '\n')
    with open(join(file_path, 'train_y'), 'w') as fw:
        for line in train_y:
            fw.write(str(line) + '\n')
    with open(join(file_path, 'test_X'), 'w') as fw:
        print('test_data', len(test_data))
        for line in test_data:
            fw.write(line.strip() + '\n')
    with open(join(file_path, 'test_y'), 'w') as fw:
        for line in test_y:
            fw.write(str(line) + '\n')
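# Note: these train_X / train_y / test_X / test_y files are the ones read by
# my_generator and get_validata_shape in the cells above.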
    
    
    
split_data('/home/bruce/code/DLNLP/data')


total pos line= 10677
total neg lines = 10428
1000
1000
test_data 1000

In [ ]: