In [1]:
# -*- coding: utf-8 -*-

import os
import re
import time
import codecs



TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
BASE_FOLDER = os.getcwd()# os.path.abspath(os.path.dirname(__file__))
DATA_FOLDER = os.path.join(BASE_FOLDER, 'data')
DEFAULT_FIN = os.path.join(DATA_FOLDER, '唐诗语料库.txt')
DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')
reg_noisy = re.compile('[^\u3000-\uffee]')
reg_note = re.compile('((.*))') # Cannot deal with () in seperate lines
# 中文及全角标点符号(字符)是\u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee

DEFAULT_Char2Vec = os.path.join(DATA_FOLDER, 'Char2Vec100.bin')
print(DEFAULT_Char2Vec)

Test the material


In [2]:
def GetFirstNline(filePath, linesNumber):
    fd = codecs.open(filePath, 'r', 'utf-8')
    for i in range(1,linesNumber):
        print(fd.readline())
    fd.close()

GetFirstNline(DEFAULT_FOUT, 3)


饮马长城窟行

塞外悲风切,交河冰已结。瀚海百重波,阴山千里雪。迥戍危烽火,层峦引高节。悠悠卷旆旌,饮马出长城。寒沙连骑迹,朔吹断边声。胡尘清玉塞,羌笛韵金钲。绝漠干戈戢,车徒振原隰。都尉反龙堆,将军旋马邑。扬麾氛雾静,纪石功名立。荒裔一戎衣,灵台凯歌入。


In [3]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
text = codecs.open(DEFAULT_FOUT, 'r', 'utf-8').read()
print('corpus length:', len(text))
print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-17 15:27:46 START
corpus length: 3450588
2017-10-17 15:27:46 STOP

In [4]:
#text = text[:100000]
print('test corpus length:', len(text))


test corpus length: 3450588

In [5]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)
print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-17 15:27:49 START
total chars: 7504
2017-10-17 15:27:50 STOP

In [6]:
chars.insert(0, "\0")

In [7]:
''.join(chars[1:200])


Out[7]:
'\n\r()13569CDFGHJLMOQXZ[]·…、。々《》ヨ一丁七万丈三上下不与丐丑专且丕世丘丙业丛东丝丞两严丧个丫中丰丱串丳临丸丹为主丽举乂乃久么义之乌乍乎乏乐乔乖乘乙九乞也习乡书买乱乳乾了予争事二于亏云互五井亘亚些亟亡亢交亥亦产亨亩享京亭亮亲亳亵亶亸亹人亿什仁仄仅仆仇今介仍从仑仓仔仕他仗付仙仝仞仡代令以仪仰仲仳价任仿企伉伊伋伍伎伏伐休众优会伛伞伟传伣伤伥伦伧伪伫伯估伴伶伸伺似伽伾但佉位低住'

In [8]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [9]:
idx = [char_indices[c] for c in text]

In [10]:
idx[:10]


Out[10]:
[7051, 7108, 6698, 1045, 4380, 5660, 2, 1, 1085, 1133]

In [11]:
''.join(indices_char[i] for i in idx[1000:1080])


Out[11]:
'世岂邀名。星旂纷电举,日羽肃天行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。\r\n入潼关\r\n崤函称地险,襟带壮两京。霜峰直临道,冰河'

import the pre-trained vector


In [12]:
DEFAULT_Char2Vec100 = os.path.join(DATA_FOLDER, 'Char2Vec100.bin')

from gensim.models import word2vec
import numpy as np

model = word2vec.Word2Vec.load(DEFAULT_Char2Vec100)

model[u'行']


d:\Anaconda3\lib\site-packages\gensim-2.0.0-py3.5-win-amd64.egg\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
Out[12]:
array([ 2.06856275,  2.32397461,  0.83518529,  0.46417695, -0.11634798,
        0.1818933 , -2.56801581, -0.18342905, -1.48421717, -2.11078668,
        0.20056053, -1.02167094, -0.21401837, -0.38561431,  0.51817751,
       -1.16497791,  0.54224551,  0.76138306,  2.3684957 ,  0.30970663,
       -1.3802563 , -0.38912675, -1.82215464, -0.00367989,  1.7190181 ,
        0.98718047,  1.4421339 ,  0.40846929,  0.79540157, -0.676166  ,
        0.08133515, -0.62718248, -1.03371155,  0.52618879,  0.42221704,
        0.25534707,  2.0411377 , -1.3095907 ,  0.30678886, -1.51624656,
        0.7682991 ,  0.92865288,  0.03569315,  0.47706228,  1.32932281,
        0.60655832,  1.01979291, -0.08373014, -1.31372821, -0.71655107,
       -0.14086835,  0.59556109, -3.27102542,  0.75412971,  0.42963707,
       -0.34976977,  0.37323466, -0.28765142, -0.33176178, -1.06273353,
        0.69204348, -0.56143588,  0.86933368,  0.28840253, -0.94598639,
       -0.63443971, -0.67289799, -0.56733495, -0.60957175,  0.95562029,
        0.53417146, -0.81318074, -1.68610132,  0.57404405,  0.62384731,
       -0.35198978,  2.50699282,  0.46690306,  1.31229103, -1.02700114,
       -1.44780755, -0.94855106,  0.16108763,  0.8101849 ,  1.17120028,
       -1.76896417,  0.25730687, -0.51312464, -0.12526916,  0.29898113,
       -0.7610212 ,  0.62285638,  1.32304406,  0.7449311 , -1.30808794,
       -0.35870194, -0.46392658,  0.26858318, -2.1789248 , -1.12329459], dtype=float32)

In [13]:
print ('creating embedding matrix...')
embedding_matrix = np.zeros((vocab_size, 100))

for i, c in enumerate(chars):
    #print(c)
    if c in model:
        embedding_matrix[i] = model[c]
    # if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
   #     embedding_matrix[i] = embedding_vector

print('Found %s word vectors.' % len(embedding_matrix))


creating embedding matrix...
Found 7504 word vectors.

Preprocess and create model


In [14]:
maxlen = 80
sentences = []
next_chars = []
for i in range(0, len(idx) - maxlen+1):
    sentences.append(idx[i: i + maxlen])
    next_chars.append(idx[i+1: i+maxlen+1])
print('nb sequences:', len(sentences))


nb sequences: 3450509

In [15]:
import numpy as np
import keras
from keras.layers import TimeDistributed, Activation
from numpy.random import choice


Using TensorFlow backend.

In [16]:
sentences = np.concatenate([[np.array(o)] for o in sentences[:-2]])
next_chars = np.concatenate([[np.array(o)] for o in next_chars[:-2]])

In [17]:
sentences.shape, next_chars.shape


Out[17]:
((3450507, 80), (3450507, 80))

In [18]:
n_fac = 100

In [19]:
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.regularizers import l2, l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
#from keras.utils.layer_utils import layer_from_config
from keras.layers import deserialize as layer_from_config

from keras.metrics import categorical_crossentropy, categorical_accuracy
from keras.layers.convolutional import *
from keras.preprocessing import image, sequence
from keras.preprocessing.text import Tokenizer

Import the training word embedding


In [ ]:


In [20]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=maxlen, weights=[embedding_matrix],trainable=False),
        Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2,implementation=2)),
        Dropout(0.2),
        Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2,implementation=2)),
        Dropout(0.2),
        TimeDistributed(Dense(vocab_size)),
        Activation('softmax')
    ])

In [21]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

Training


In [22]:
def print_example():
    seed_string=u'世岂邀名。星旂纷电举,日羽肃天行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。\r\n入潼关\r\n崤函称地险,襟带壮两京。霜峰直临道,冰河'
    for i in range(640):
        x=np.array([char_indices[c] for c in seed_string[-80:]])[np.newaxis,:]
        preds = model.predict(x, verbose=0)[0][-1]
        preds = preds/np.sum(preds)
        next_char = choice(chars, p=preds)
        seed_string = seed_string + next_char
    print(seed_string)

In [23]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=200, epochs=1)


Epoch 1/1
3450507/3450507 [==============================] - ETA: 0s - loss: 0.1421
Out[23]:
<keras.callbacks.History at 0xaceee17e48>

In [24]:
print_example()
DEFAULT_modelweights = os.path.join(DATA_FOLDER, 'char_rnn.h5')
model.save_weights(DEFAULT_modelweights )


世岂邀名。星旂纷电举,日羽肃天行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。
入潼关
崤函称地险,襟带壮两京。霜峰直临道,冰河汉将殷。冥冥凫鹥宿,秋思遥相救。脱官逐奚词,白日复上来。怀故犹达息,相逢不见清。借行恨他事,江上望连城。
少府后见寄徐州
月半静离思,磈然满海来。相思入林子,昨夕洞中传。寂寞依稀减,新吟隔此心。谁知元帝国,还在一双林。
秋晚蝉
寒流落日宿,云雨涨来和。酒著缘棋侧,鸡村恨欲频。柳条匀尺眼,桃李水中窥。
送李策士相进士
我家秦留望,五千岁馀情。芳草倚苍云,寒红锦字衣。江湖千里外,苍渤水相寻。汉上秋风度,时逢玉磬游。
春日辞·学士
宝钗垂帐坐,舒画建池烟。汉武虽无朽,怜儒御卷神。知君几号火,乱说几梁材。山郭阴何挂,沙风不暂还。早朝平大日,花鬘六宫沙。
脑饮宴张查阳道
朝夕云高接,兰依雁行空。春来无限路,千古入朱门。萧条鸣廓廓,象板路似声。玉佩成尘,春景一容。相思天地宽,连榭越枝开。
寄张殿司空湖南渔浦
条索青苔外,登山复暗中。野亭春草遍,林鸟定难寻。景静阶追与,明君往不还。晴风摇细出,园里映芜平。
送毛难国亡海上人
天高书作客,一送李刀刀。知道阴松老,飘飘旋把曹。直须因背客,拾与别离心。旧路难留醉,扁舟岂易潸。
身锦水渡
杜陵寒镜里,行见暮山村。风飒蒹葭叠,犹闻雨露清。湛芳朝见骨,孤枕不平凉。
生行
嗷嗷新雨急,去年尽见来。此时人了落,逆恨是江齐。恋风欺我觉,俯仰有丹墀。
喜
十万霸州县,风初去豫书。年年见悲别,何必李评公。繁条将十管,隔处过人房。云近似寒水,轻烟爱露烟。红珠对不见,一把柳无愁。
新丰生
九陌杨花开落日,十

In [ ]:
DEFAULT_modelweights = os.path.join(DATA_FOLDER, 'char_rnn.h5')
model.load_weights(DEFAULT_modelweights )

In [44]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
model.optimizer.lr=0.0001
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=200, epochs=1)
print('{} STOP'.format(time.strftime(TIME_FORMAT)))


Epoch 1/5
99959/99959 [==============================] - 272s - loss: 4.2686   
Epoch 2/5
99959/99959 [==============================] - 273s - loss: 3.5228   
Epoch 3/5
99959/99959 [==============================] - 277s - loss: 3.0338   
Epoch 4/5
99959/99959 [==============================] - 278s - loss: 2.7145   
Epoch 5/5
99959/99959 [==============================] - 277s - loss: 2.4891   
Out[44]:
<keras.callbacks.History at 0x207d8ded7b8>

In [45]:
print_example()


行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。
入潼关
崤函乘洞律,黛锁显宫前。万壑浮云起,风调轸玉声。郊坰清广转,法乐调和风。
唐大飨拜洛乐章·昭和
烂俎牺荐,羞荐斯陈。黑修之牡,子器斯中。上帝配食,雍于执敬。爰容景福,永于旧章。
郊庙歌辞·祀雨师乐章·迎俎酌献
撰行协序,垂舞递成。鸾鸾凤舞,飘洞发旂。煌煌开御,穆穆雍雍。
唐明堂乐章·徵音
赫赫离精御炎陆,滔滔炽草郊开。画玉交桃养大人,谁教亿乙化殊平。
郊庙歌辞·汉宗庙乐舞辞·忠顺
明庭展赫,文物昭新。敬承茂典,敢择深衷。睟周历庙,载纬鸿休。
郊庙歌辞·梁太庙乐舞辞·登歌
既赫皇考,浚哉帝台。闓华而及,瑟彼飞香。大矣昭德,夙望明年。
郊庙歌辞·周宗庙乐舞辞·肃顺
恭彻祀礼,既以严禋。魏诚内庙,敢择良辰。载启其著,鸿休用职。
郊庙

In [46]:
DEFAULT_modelweights = os.path.join(DATA_FOLDER, 'char_rnn.h5')
model.save_weights(DEFAULT_modelweights )

In [47]:
model.optimizer.lr=0.00001
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=200, epochs=1)


Epoch 1/5
99959/99959 [==============================] - 278s - loss: 2.3189   
Epoch 2/5
99959/99959 [==============================] - 278s - loss: 2.1801   
Epoch 3/5
99959/99959 [==============================] - 277s - loss: 2.0666   
Epoch 4/5
99959/99959 [==============================] - 275s - loss: 1.9676   
Epoch 5/5
99959/99959 [==============================] - 275s - loss: 1.8820   
Out[47]:
<keras.callbacks.History at 0x20817684be0>

In [48]:
print_example()


行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。
入潼汤
崤函重律险,壮夫三十二。浇俗杂良霜,黄泥夜点营。凯胡氛九匣,单帐即归时。横沙何有静,谁能扫虏归。
横吹曲辞·陇头水
借问陇头水,今年陇头家。塞头年不见,明月竟不寐。
横吹曲辞·关山月
胡风月夜长,北客终无远。拂拂东飞曙,花成玉座声。草除新树地,凉吹轸离心。林黄帷阁上,砌影乱花呈。登高思遗老,含毫属昭阳。
上巳日赐裴度之张
端拱乘轩镜,昏开景重长。芳菲分日暮,娇罢在浮云。分枝怜菊蕊,风暖洒檀栊。
咏小山
近谷交萦蕊,遥峰对出莲。径细新苔树,寒山带夕华。坐此攀垂萼,今年那必寻。只辞秋雁曲,还用旧来还。
相和歌辞·阳春曲
妾妒白蘋浦,团扇薄时衣。初花复独好,花似绮蛾身。日暮西施望,春深不忆新。
相和歌辞·婕妤怨
谗谤潜来起不知,携手

In [ ]:


In [ ]:
print_example()

In [51]:
model.save_weights(DEFAULT_modelweights )

用训练好了word2vec代替原来embedding layer, 在速度上有很大的提高,且词句的组织也比较好了

最后在8G的GPU上,再用Bidirectional LSMT来训练,这样的希望能看到一个好的结果


In [ ]: