In [1]:
from __future__ import print_function

import numpy as np
np.random.seed(1337)  # for reproducibility
from os.path import join
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout,Activation, Lambda,Input
from keras.layers import Embedding
from keras.layers import Convolution1D
from keras.datasets import imdb
from keras import backend as K
from keras.layers import Convolution1D, GlobalMaxPooling1D,Convolution2D

from keras.models import Model

def character_vec_dict(vec_path=''):
    print('loading character embedding')
    vec_dic={}
    with open(vec_path) as fr:
        lines = fr.readlines()
        for line in lines[1:]:
            try :
                v = [float(i) for i in line[1:].split()]
                vec_dic[line[0]] = v
            except:
                pass
    vec_dic[u'*'] = [float(0) for _ in range(200)]
    return vec_dic

def sen_len(train_path=''):
    print('calculating sentence length')
    with open(join(train_path,'train_X')) as fr:
        lines = fr.readlines()
    length = [len(line.strip()) for line in lines]
    length = sorted(length)
    return length[int(len(length)*0.9)]
def pad(X=[],padlen=None):
    print('padding sentence ')
    print('X[0] type must be unicode =',type(X[0]))
    new_x=[]
    for index,sen in enumerate(X):
        len_s = len(sen)
        num = len_s-padlen
        if num>=0:
            X[index] = sen[0:padlen]
        else:
            X[index] = sen+(-1*num)*u'*'
    print('pad to length = ',padlen)
    return X
def embedding(x_batch=[],vec_dict=None):
    x_batch_embedding=[]
    for sen in x_batch:
        sen_embedding=[vec_dict[ch] if ch in vec_dict else vec_dict[u'*'] for ch in sen]
        x_batch_embedding.append(sen_embedding)
    return x_batch_embedding


#generator 每次生成一个batch的数据。
def my_generator(data_path='',batch_size=None):
    '''
    1.加载数据到字典中
    2.读取训练数据到numpy中。
    3.句子长度一样。
    3.每次生成一个batch的句子表示。
    '''
    with open(join(data_path,'train_X')) as fr:
        lines = fr.readlines()
        X_str = [line.strip() for line in lines]
    with open(join(data_path,'train_y')) as fr:
        lines = fr.readlines()
        y = [int(line.strip()) for line in lines]
    print(len(X_str))
    print(len(y))
    assert len(X_str) == len(y),'error ,训练数据集的数据X,y不相等'
    #加载字向量
    vec_dict = character_vec_dict(join(data_path,'wikiw2v_zh_zi.txt'))
    #pad X 
    padlen  = sen_len(data_path)
    X_str = pad(X_str,padlen)
    i = 0
    max_i = int(len(X_str)/batch_size)
    while True:
        i = i % max_i
        x_batch = X_str[i*batch_size:(i+1)*batch_size]
        y_batch = y[i*batch_size:(i+1)*batch_size]
        x_batch = embedding(x_batch,vec_dict)
        x_batch = np.array(x_batch)
        y_batch = np.array(y_batch)
        yield (x_batch,y_batch)
        i = i + 1
def get_input_shape(data_path=''):
   
    with open(join(data_path,'train_X')) as fr:
        lines = fr.readlines()
        X_str = [line.strip() for line in lines]
    #pad X 
    padlen  = sen_len(data_path)
    return (padlen,200)
def get_validata_shape(data_path='',pad_len=None):
    with open(join(data_path,'test_X'))as fr:
        lines = fr.readlines()
        print('len lines = ',len(lines))
        test_X= [line.strip() for line in lines]
    with open(join(data_path,'test_y')) as fr:
        lines = fr.readlines()
        test_y = [int(line.strip()) for line in lines]
    print(len(test_X))
    print(len(test_y))
    assert len(test_X) == len(test_y),'test_X == test_y'
    #加载字向量
    vec_dict = character_vec_dict(join(data_path,'wikiw2v_zh_zi.txt'))
    test_X = pad(test_X,pad_len)
    print('len vec_dict = ',len(vec_dict))
    print('len text_X = ',len(test_X))
    test_X = embedding(test_X,vec_dict)
    test_X = np.array(test_X)
    test_y = np.array(test_y)
    return test_X,test_y


Using TensorFlow backend.

In [2]:
print('Build model...')
# set parameters:
batch_size = 32
embedding_dims = 200
nb_filter = 150
filter_length = 2
hidden_dims = 300
nb_epoch = 2
data_path = '/home/bruce/code/DLNLP/data'
input_shape = get_input_shape(data_path)
print('input_shape = ',input_shape)
test_X,test_y = get_validata_shape(data_path,input_shape[0])
print(test_X.shape)


##########################################################################
print('Build model...')
model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions

# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:


model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1,
                        input_shape=input_shape))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


Build model...
calculating sentence length
input_shape =  (188, 200)
len lines =  1000
1000
1000
loading character embedding
padding sentence 
X[0] type must be unicode = <class 'str'>
pad to length =  188
len vec_dict =  10128
len text_X =  1000
(1000, 188, 200)
Build model...

In [3]:
import time
'''
model_merge.fit([X_train,doc_train],y_train,nb_epoch=15,batch_size= 32,verbose=2, validation_data=([X_test,doc_test],y_test))
model_merge.fit(doc_train,y_train,nb_epoch=15,batch_size= 32,verbose=2, validation_data=(doc_test,y_test))
model.fit_generator(my_generator(data_path,batch_size=64),samples_per_epoch = 20105,nb_epoch=10,verbose=1,validation_data=(test_X,test_y))
model.fit_generator(myGenerator(), , nb_epoch = 2, verbose=2, show_accuracy=True, callbacks=[], validation_data=None, class_weight=None, nb_worker=1)

model_merge.fit(X_train,y_train,nb_epoch=15,batch_size= 32,verbose=2, validation_data=(X_test,y_test))
'''
model.fit_generator(my_generator(data_path,batch_size=64),samples_per_epoch = 64*20,nb_epoch=60,verbose=1,validation_data=(test_X,test_y))


Epoch 1/60
20105
20105
loading character embedding
calculating sentence length
padding sentence 
X[0] type must be unicode = <class 'str'>
pad to length =  188
1280/1280 [==============================] - 4s - loss: 0.9065 - acc: 0.5813 - val_loss: 0.6715 - val_acc: 0.6460
Epoch 2/60
1280/1280 [==============================] - 3s - loss: 0.5491 - acc: 0.7328 - val_loss: 0.6942 - val_acc: 0.6770
Epoch 3/60
1280/1280 [==============================] - 3s - loss: 0.4491 - acc: 0.7867 - val_loss: 0.6760 - val_acc: 0.6730
Epoch 4/60
1280/1280 [==============================] - 3s - loss: 0.4180 - acc: 0.8133 - val_loss: 0.6191 - val_acc: 0.6970
Epoch 5/60
1280/1280 [==============================] - 3s - loss: 0.3902 - acc: 0.8195 - val_loss: 0.5303 - val_acc: 0.7530
Epoch 6/60
1280/1280 [==============================] - 3s - loss: 0.4190 - acc: 0.8094 - val_loss: 0.6123 - val_acc: 0.7230
Epoch 7/60
1280/1280 [==============================] - 3s - loss: 0.3460 - acc: 0.8500 - val_loss: 0.5872 - val_acc: 0.7450
Epoch 8/60
1280/1280 [==============================] - 3s - loss: 0.3288 - acc: 0.8617 - val_loss: 0.4550 - val_acc: 0.8130
Epoch 9/60
1280/1280 [==============================] - 3s - loss: 0.3253 - acc: 0.8617 - val_loss: 0.4632 - val_acc: 0.8140
Epoch 10/60
1280/1280 [==============================] - 3s - loss: 0.3312 - acc: 0.8773 - val_loss: 0.5920 - val_acc: 0.7610
Epoch 11/60
1280/1280 [==============================] - 3s - loss: 0.3622 - acc: 0.8500 - val_loss: 0.4590 - val_acc: 0.7970
Epoch 12/60
1280/1280 [==============================] - 3s - loss: 0.3746 - acc: 0.8367 - val_loss: 0.4247 - val_acc: 0.8230
Epoch 13/60
1280/1280 [==============================] - 3s - loss: 0.3179 - acc: 0.8648 - val_loss: 0.4526 - val_acc: 0.8070
Epoch 14/60
1280/1280 [==============================] - 3s - loss: 0.3215 - acc: 0.8734 - val_loss: 0.4403 - val_acc: 0.8140
Epoch 15/60
1280/1280 [==============================] - 3s - loss: 0.2933 - acc: 0.8789 - val_loss: 0.4732 - val_acc: 0.7990
Epoch 16/60
1280/1280 [==============================] - 3s - loss: 0.2635 - acc: 0.8914 - val_loss: 0.4421 - val_acc: 0.8190
Epoch 17/60
1280/1280 [==============================] - 3s - loss: 0.2631 - acc: 0.8961 - val_loss: 0.4711 - val_acc: 0.8020
Epoch 18/60
1280/1280 [==============================] - 3s - loss: 0.2595 - acc: 0.8992 - val_loss: 0.5227 - val_acc: 0.7810
Epoch 19/60
1280/1280 [==============================] - 3s - loss: 0.2406 - acc: 0.9047 - val_loss: 0.5125 - val_acc: 0.8040
Epoch 20/60
1280/1280 [==============================] - 3s - loss: 0.2237 - acc: 0.9141 - val_loss: 0.4864 - val_acc: 0.8190
Epoch 21/60
1280/1280 [==============================] - 3s - loss: 0.2192 - acc: 0.9141 - val_loss: 0.4317 - val_acc: 0.8300
Epoch 22/60
1280/1280 [==============================] - 3s - loss: 0.2388 - acc: 0.9039 - val_loss: 0.4871 - val_acc: 0.8140
Epoch 23/60
1280/1280 [==============================] - 3s - loss: 0.2237 - acc: 0.9172 - val_loss: 0.4552 - val_acc: 0.8290
Epoch 24/60
1280/1280 [==============================] - 3s - loss: 0.2230 - acc: 0.9055 - val_loss: 0.5033 - val_acc: 0.8230
Epoch 25/60
1280/1280 [==============================] - 3s - loss: 0.2116 - acc: 0.9180 - val_loss: 0.4566 - val_acc: 0.8190
Epoch 26/60
1280/1280 [==============================] - 3s - loss: 0.2107 - acc: 0.9070 - val_loss: 0.4552 - val_acc: 0.8190
Epoch 27/60
1280/1280 [==============================] - 3s - loss: 0.2885 - acc: 0.8813 - val_loss: 0.4271 - val_acc: 0.8390
Epoch 28/60
1280/1280 [==============================] - 3s - loss: 0.3185 - acc: 0.8758 - val_loss: 0.4843 - val_acc: 0.8350
Epoch 29/60
1280/1280 [==============================] - 3s - loss: 0.2082 - acc: 0.9164 - val_loss: 0.5262 - val_acc: 0.8120
Epoch 30/60
1280/1280 [==============================] - 3s - loss: 0.2122 - acc: 0.9180 - val_loss: 0.4220 - val_acc: 0.8190
Epoch 31/60
1280/1280 [==============================] - 3s - loss: 0.2121 - acc: 0.9250 - val_loss: 0.5417 - val_acc: 0.8040
Epoch 32/60
1280/1280 [==============================] - 3s - loss: 0.2016 - acc: 0.9227 - val_loss: 0.4433 - val_acc: 0.8400
Epoch 33/60
1280/1280 [==============================] - 3s - loss: 0.2812 - acc: 0.8922 - val_loss: 0.5251 - val_acc: 0.7910
Epoch 34/60
1280/1280 [==============================] - 3s - loss: 0.2538 - acc: 0.8969 - val_loss: 0.4986 - val_acc: 0.8010
Epoch 35/60
1280/1280 [==============================] - 3s - loss: 0.2165 - acc: 0.9227 - val_loss: 0.4967 - val_acc: 0.8270
Epoch 36/60
1280/1280 [==============================] - 3s - loss: 0.1928 - acc: 0.9219 - val_loss: 0.5501 - val_acc: 0.8210
Epoch 37/60
1280/1280 [==============================] - 3s - loss: 0.1761 - acc: 0.9383 - val_loss: 0.5551 - val_acc: 0.8210
Epoch 38/60
1280/1280 [==============================] - 3s - loss: 0.2640 - acc: 0.8945 - val_loss: 0.5858 - val_acc: 0.8100
Epoch 39/60
1280/1280 [==============================] - 3s - loss: 0.1649 - acc: 0.9359 - val_loss: 0.5641 - val_acc: 0.8160
Epoch 40/60
1280/1280 [==============================] - 3s - loss: 0.1959 - acc: 0.9258 - val_loss: 0.4500 - val_acc: 0.8470
Epoch 41/60
1280/1280 [==============================] - 3s - loss: 0.1460 - acc: 0.9516 - val_loss: 0.4768 - val_acc: 0.8380
Epoch 42/60
1280/1280 [==============================] - 3s - loss: 0.1300 - acc: 0.9555 - val_loss: 0.4224 - val_acc: 0.8490
Epoch 43/60
1280/1280 [==============================] - 3s - loss: 0.2344 - acc: 0.9086 - val_loss: 0.4311 - val_acc: 0.8550
Epoch 44/60
1280/1280 [==============================] - 3s - loss: 0.2470 - acc: 0.8992 - val_loss: 0.6358 - val_acc: 0.7950
Epoch 45/60
1280/1280 [==============================] - 3s - loss: 0.1454 - acc: 0.9453 - val_loss: 0.4549 - val_acc: 0.8370
Epoch 46/60
1280/1280 [==============================] - 3s - loss: 0.1379 - acc: 0.9484 - val_loss: 0.6193 - val_acc: 0.8040
Epoch 47/60
1280/1280 [==============================] - 3s - loss: 0.1647 - acc: 0.9305 - val_loss: 0.4651 - val_acc: 0.8250
Epoch 48/60
1280/1280 [==============================] - 3s - loss: 0.1667 - acc: 0.9352 - val_loss: 0.4510 - val_acc: 0.8440
Epoch 49/60
1280/1280 [==============================] - 3s - loss: 0.1796 - acc: 0.9281 - val_loss: 0.9326 - val_acc: 0.7210
Epoch 50/60
1280/1280 [==============================] - 3s - loss: 0.2751 - acc: 0.8914 - val_loss: 0.9002 - val_acc: 0.7300
Epoch 51/60
1280/1280 [==============================] - 3s - loss: 0.3235 - acc: 0.8852 - val_loss: 0.6964 - val_acc: 0.7690
Epoch 52/60
1280/1280 [==============================] - 3s - loss: 0.1751 - acc: 0.9383 - val_loss: 0.4818 - val_acc: 0.8270
Epoch 53/60
1280/1280 [==============================] - 3s - loss: 0.1513 - acc: 0.9352 - val_loss: 0.4443 - val_acc: 0.8420
Epoch 54/60
1280/1280 [==============================] - 3s - loss: 0.1364 - acc: 0.9383 - val_loss: 0.4299 - val_acc: 0.8440
Epoch 55/60
1280/1280 [==============================] - 3s - loss: 0.1163 - acc: 0.9578 - val_loss: 0.4172 - val_acc: 0.8380
Epoch 56/60
1280/1280 [==============================] - 3s - loss: 0.1119 - acc: 0.9609 - val_loss: 0.4087 - val_acc: 0.8610
Epoch 57/60
1280/1280 [==============================] - 3s - loss: 0.1002 - acc: 0.9625 - val_loss: 0.4143 - val_acc: 0.8560
Epoch 58/60
1280/1280 [==============================] - 3s - loss: 0.0855 - acc: 0.9664 - val_loss: 0.5505 - val_acc: 0.8300
Epoch 59/60
1280/1280 [==============================] - 3s - loss: 0.1088 - acc: 0.9594 - val_loss: 0.4176 - val_acc: 0.8570
Epoch 60/60
1280/1280 [==============================] - 3s - loss: 0.0865 - acc: 0.9688 - val_loss: 0.4450 - val_acc: 0.8610
Out[3]:
<keras.callbacks.History at 0x7f22373a1d68>

In [4]:
model.save('/home/bruce/model/CNN.h5')

In [8]:
from keras.models import load_model
model = load_model('/home/bruce/model/cnn.h5')

In [12]:
predict_y = model.predict(test_X)

In [13]:
predict_y = [1 if i>=0.5 else 0 for i in predict_y]

In [15]:
sum = 0
count = len(test_y)
for y1,y2 in zip(test_y,predict_y):
    if y1 == y2:
        sum = sum +1
print(sum/count)


0.846

In [10]:
def character_vec_dict(vec_path=''):
    print('loading character embedding')
    vec_dic={}
    with open(vec_path) as fr:
        lines = fr.readlines()
        for line in lines[1:]:
            try :
                v = [float(i) for i in line[1:].split()]
                vec_dic[line[0]] = v
            except:
                pass
    vec_dic[u'*'] = [float(0) for _ in range(200)]
    return vec_dic
vd = character_vec_dict(join(data_path,'wikiw2v_zh_zi.txt'))


loading character embedding

In [13]:



yes

In [63]:
def split_data(file_path=''):
    train_data = []
    train_y =[]
    test_data=[]
    test_y =[]
    
    
    with open (join(file_path,'pos'))as fr:
        lines = fr.readlines()
        lines = [line for line in lines if len(line.strip())>0]
        print('total pos line=',len(lines))
        train_data = train_data + lines[:-500]
        train_y = [1 for _ in range(len(train_data))]
        test_data = test_data + lines[-500:]
        test_y = [1 for _ in range(500)]
    with open(join(file_path,'neg'))as fr:
        lines = fr.readlines()
        lines = [line for line in lines if len(line.strip())>0 ]
        print('total neg lines =',len(lines))
        train_data = train_data + lines[:-500]
        test_data = test_data + lines[-500:]
        train_y = train_y + [0 for _ in range(len(lines[:-500]))]
        test_y = test_y + [0 for _ in range(500)]
    assert len(train_data) == len(train_y),'train'
    assert len(test_data) == len(test_y),'test'
    index_train = list(range(len(train_data)))
    index_test = list(range(len(test_data)))
    np.random.shuffle(index_train)
    np.random.shuffle(index_test)
    train_data = np.array(train_data)
    train_y = np.array(train_y)
    train_data = train_data[index_train]
    train_y = train_y[index_train]
    test_data = np.array(test_data)
    test_y = np.array(test_y)
    test_data = test_data[index_test]
    test_y = test_y[index_test]
    assert len(test_data) == len(test_y)
    print(len(test_data))
    print(len(test_y))
    
    with open(join(file_path,'train_X'),'w') as fw:
        for line in train_data:
            fw.write(line.strip()+'\n')
    with open(join(file_path,'train_y'),'w') as fw:
        for line in train_y:
            fw.write(str(line)+'\n')
    with open(join(file_path,'test_X'),'w') as fw:
        print('test_data',len(test_data))
        for line in test_data:
            fw.write(line.strip()+'\n')
    with open(join(file_path,'test_y'),'w') as fw:
        for line in test_y:
            fw.write(str(line)+'\n')
    
    
    
split_data('/home/bruce/code/DLNLP/data')


total pos line= 10677
total neg lines = 10428
1000
1000
test_data 1000

In [30]:
def predict_sentence(s='',padlen=188,vec_dict_path='/home/bruce/model/wikiw2v_zh_zi.txt'):
    '''
    1.句子长度:188
    '''
    vec_dict = character_vec_dict(vec_dict_path)
    X_str = pad([s],padlen)
    X = embedding(X_str,vec_dict)
    X=np.array(X)
    model = load_model('/home/bruce/model/cnn.h5')
    y = model.predict(X)
    print(y)
def predict_file(file_path='',padlen=188,vec_dict_path='/home/bruce/model/wikiw2v_zh_zi.txt'):
    with open(file_path,'r')as fr:
        lines = fr.readlines()
        print('len lines = ',len(lines))
        X= [line.strip() for line in lines]
    #加载字向量
    vec_dict = character_vec_dict(vec_dict_path)
    X = pad(X,padlen)
    print('len vec_dict = ',len(vec_dict))
    print('len text_X = ',len(X))
    X = embedding(X,vec_dict)
    X = np.array(X)
    model = load_model('/home/bruce/model/cnn.h5')
    y = model.predict(X)
    y = y.flatten()
    y = [str(round(i,2))for i in y]
    return y
predict_file('/home/bruce/test/test.txt')
        
        
        
        
        
        
#predict_sentence(s='我对这次购物一点也不满意',padlen=188)


len lines =  16
loading character embedding
padding sentence 
X[0] type must be unicode = <class 'str'>
pad to length =  188
len vec_dict =  10128
len text_X =  16
<class 'numpy.ndarray'>
['0.0', '0.02', '0.44', '0.94', '0.0', '0.02', '0.98', '0.01', '0.01', '0.0', '0.95', '0.01', '0.15', '1.0', '0.04', '0.49']

In [ ]:


In [ ]: