In [5]:
# -*- coding: utf-8 -*-

import os
import re
import time
import codecs



TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
BASE_FOLDER = os.getcwd() # os.path.abspath(os.path.dirname(__file__))
DATA_FOLDER = os.path.join(BASE_FOLDER, 'data')
DEFAULT_FIN = os.path.join(DATA_FOLDER, '唐诗语料库.txt')
DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')
reg_noisy = re.compile('[^\u3000-\uffee]')
reg_note = re.compile('((.*))') # Cannot deal with () in seperate lines
# 中文及全角标点符号(字符)是\u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee

DEFAULT_Char2Vec = os.path.join(DATA_FOLDER, 'Char2Vec100.bin')

Test the material


In [6]:
os.getcwd()  # test the function and current path


Out[6]:
'C:\\Users\\sethf\\source\\repos\\chinesepoem'

In [7]:
def GetFirstNline(filePath, linesNumber):
    fd = codecs.open(filePath, 'r', 'utf-8')
    for i in range(1,linesNumber):
        print(fd.readline())
    fd.close()

GetFirstNline(DEFAULT_FOUT, 3)


饮马长城窟行

塞外悲风切,交河冰已结。瀚海百重波,阴山千里雪。迥戍危烽火,层峦引高节。悠悠卷旆旌,饮马出长城。寒沙连骑迹,朔吹断边声。胡尘清玉塞,羌笛韵金钲。绝漠干戈戢,车徒振原隰。都尉反龙堆,将军旋马邑。扬麾氛雾静,纪石功名立。荒裔一戎衣,灵台凯歌入。


In [8]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
text = codecs.open(DEFAULT_FOUT, 'r', 'utf-8').read()
print('corpus length:', len(text))
print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-30 20:49:48 START
corpus length: 3364177
2017-10-30 20:49:48 STOP

In [9]:
text = text[:100000]
print('test corpus length:', len(text))


test corpus length: 100000

In [10]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)
print('{} STOP'.format(time.strftime(TIME_FORMAT)))


2017-10-30 20:50:05 START
total chars: 3826
2017-10-30 20:50:05 STOP

In [13]:
from pypinyin import pinyin, Style

In [46]:
char_py = pinyin(chars, style=Style.TONE2) # 拼音

def Pinze(strPy):
    ret = -1                  # 没有拼音
    if '1' in strPy: ret = 1  #平
    if '2' in strPy or '3' in strPy or '4' in strPy: ret = 0 #仄
    return ret


char_pz = [Pinze(pz[0]) for pz in char_py] # 平仄

char_mu = ['']*vocab_size

# 匹配 TONE2 中标识韵母声调的正则表达式
RE_TONE3 = re.compile('^([a-z]+)([1-4])([a-z]*)$')
# 声母表
_INITIALS = 'b,p,m,f,d,t,n,l,g,k,h,j,q,x,zh,ch,sh,r,z,c,s'.split(',')
# 声母表, 把 y, w 也当作声母
_INITIALS_NOT_STRICT = _INITIALS + ['y', 'w']


for i, pz in enumerate(char_pz):
    if pz!= - 1:
        py = re.sub(r'[1-4]', '', char_py[i][0])
        for sm in _INITIALS_NOT_STRICT:
            if py.startswith(sm):
                py = py[len(sm):]
                break
        char_mu[i] = py        
       
        #print(re.sub('^([a-z]+)([1-4])([a-z]*)$', r'\1', char_py[i][0])) 
        #char_mu[i] = re.sub(r'[1-4]', '', RE_TONE3.sub(char_py[i][0]))
        
char_pz[:20], char_mu[:20]

yunmu = sorted(list(set(char_mu)))

In [49]:
yunmu_indices = dict((c, i) for i, c in enumerate(yunmu))
char_yumu = [yunmu_indices[c] for c in char_mu]

In [60]:
char_yumu.insert(0,-1)
char_pz.insert(0,-1)

In [50]:
chars.insert(0, "\0")

In [51]:
''.join(chars[1:200])


Out[51]:
'\n·。一丁七万丈三上下不与丑专且丕世丘业丛东丝丞两严丧个中丰丱临丸丹为主丽举乃久义之乌乍乎乏乐乔乖乘乙九也习乡书买乱乳乾了予争事二于亏云互五井亘亚亟亡交亥亦产亨亩享京亭亮亲人亿仁仆仇今介仍从仑仓他仗付仙仞代令以仪仰仲价任仿伉伊伍伎伏伐休众优会伟传伣伤伦伪伫伯估伴伶伸似但位低住佐佑体何余佛作佞佩佯佳佶佾使侁侈侍侑侔供依侠侣侧侪侬侮侯侵便促俄俎俗俘保俞俟信俦俨俩俪俭修俯俱俾倍倏倒倘候倚借倡倢倦倩值倾'

In [52]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [53]:
idx = [char_indices[c] for c in text]

In [54]:
idx[:10]


Out[54]:
[3648, 3672, 3460, 619, 2357, 2953, 1, 632, 657, 1124]

In [55]:
''.join(indices_char[i] for i in idx[1000:1040])


Out[55]:
'行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。\n入'

import the pre-trained vector


In [56]:
DEFAULT_Char2Vec50 = os.path.join(DATA_FOLDER, 'Char2Vec50.bin')

from gensim.models import word2vec
import numpy as np

model = word2vec.Word2Vec.load(DEFAULT_Char2Vec50)

model[u'行']


C:\Anaconda3\lib\site-packages\gensim\utils.py:862: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
Out[56]:
array([-0.04007597,  1.43749964, -0.50613195,  0.30402976, -0.61863619,
        0.86658812, -0.85327005,  1.0774374 , -1.7388643 , -0.66667581,
        1.41951251,  1.73685801,  0.88174158, -1.08179212,  0.04298396,
       -0.06425346,  0.70207226, -1.53141725,  0.71608716, -0.90501112,
       -0.33681136,  0.14992777, -0.68830115, -0.19420403,  1.46785641,
       -1.46016061, -0.05367612, -0.05942056,  0.43858421, -0.82355237,
        1.08669186,  1.04226518, -1.23372197, -2.04672313,  0.13865796,
        0.07540765,  1.05799055,  0.84494865,  1.86665952,  0.52074122,
       -0.76229119,  0.08235706, -1.87348688, -1.76977718, -1.9432348 ,
       -0.51557565, -3.51499605, -0.06386752, -2.19704294,  2.92216778], dtype=float32)

In [67]:
print ('creating embedding matrix...')
embedding_matrix = np.zeros((vocab_size, 50+2))

for i, c in enumerate(chars):
    #print(c)
    if c in model:
        embedding_matrix[i] = np.insert( model[c],0, (char_pz[i],char_yumu[i]))
        
        #for j, m in model[c]:
        #    embedding_matrix[i][j] = m
        #    embedding_matrix[i][j+1] = char_pz[i]
        #    embedding_matrix[i][j+2] = char_yumu[i]
        
    #    
    # if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
   #     embedding_matrix[i] = embedding_vector

print('Found %s word vectors.' % len(embedding_matrix))


creating embedding matrix...
Found 3826 word vectors.

Preprocess and create model


In [68]:
maxlen = 40
sentences = []
next_chars = []
for i in range(0, len(idx) - maxlen+1):
    sentences.append(idx[i: i + maxlen])
    next_chars.append(idx[i+1: i+maxlen+1])
print('nb sequences:', len(sentences))


nb sequences: 99961

In [69]:
import numpy as np
import keras
from keras.layers import TimeDistributed, Activation
from numpy.random import choice


Using TensorFlow backend.

In [70]:
sentences = np.concatenate([[np.array(o)] for o in sentences[:-2]])
next_chars = np.concatenate([[np.array(o)] for o in next_chars[:-2]])

In [71]:
sentences.shape, next_chars.shape


Out[71]:
((99959, 40), (99959, 40))

In [72]:
n_fac = 50 +2

In [73]:
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.regularizers import l2, l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
#from keras.utils.layer_utils import layer_from_config
from keras.layers import deserialize as layer_from_config

from keras.metrics import categorical_crossentropy, categorical_accuracy
from keras.layers.convolutional import *
from keras.preprocessing import image, sequence
from keras.preprocessing.text import Tokenizer

Import the training word embedding


In [ ]:


In [74]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=maxlen, weights=[embedding_matrix],trainable=False),
        Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2,implementation=2)),
        Dropout(0.2),
        Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2,implementation=2)),
        Dropout(0.2),
        TimeDistributed(Dense(vocab_size)),
        Activation('softmax')
    ])

In [75]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

Training


In [76]:
def print_example():
    seed_string=u'行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。\n入'
    for i in range(320):
        x=np.array([char_indices[c] for c in seed_string[-40:]])[np.newaxis,:]
        preds = model.predict(x, verbose=0)[0][-1]
        preds = preds/np.sum(preds)
        next_char = choice(chars, p=preds)
        seed_string = seed_string + next_char
    print(seed_string)

In [77]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=1)


Epoch 1/1
99959/99959 [==============================] - 544s - loss: 0.8153   
Out[77]:
<keras.callbacks.History at 0x1b2d35c80f0>

In [78]:
print_example()


行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。
入方周辅庙乐章·庆子二县五首
表敬潜福,西恩望徽。兮孙德,孝孝国。灵生累享,湛播惟彰。在百辟人,拖郊抚。敔内佳瑶篚,奏管节齐齐。眇云升沉炎,降矣福膏。庆元皇祀,同流式。逖橛入外,金海红钉。
郊庙歌辞·登太庙乐舞·寿和
茂棣洁无文,祝请先后室。
郊庙歌辞·祀圣行乐章·雍和
乐九三音已,汉帝葳奠奠。发时随万战,来藉百斯宫。吴穆临风静,珠炉想音。
郊庙歌辞·祀感昭懿太庙乐章·送神
蔡俎奕土,珪敷纂列。郁我既兹,奠本既功。功以仪仪,倾主崇虔。黎神产服,簪绅降微。
郊庙歌辞·享太庙乐章·大成
载物合如云,镇镇时斯熙。自乐既,清宫在。拂云则奏,空升凯握。日上沅湘,举时回景。春兰绿芳羞,动清容容。丽景僾望,直微万籁来。宣惟破定,集清兵明声。
郊庙歌

In [44]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=5)


Epoch 1/5
99959/99959 [==============================] - 272s - loss: 4.2686   
Epoch 2/5
99959/99959 [==============================] - 273s - loss: 3.5228   
Epoch 3/5
99959/99959 [==============================] - 277s - loss: 3.0338   
Epoch 4/5
99959/99959 [==============================] - 278s - loss: 2.7145   
Epoch 5/5
99959/99959 [==============================] - 277s - loss: 2.4891   
Out[44]:
<keras.callbacks.History at 0x207d8ded7b8>

In [45]:
print_example()


行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。
入潼关
崤函乘洞律,黛锁显宫前。万壑浮云起,风调轸玉声。郊坰清广转,法乐调和风。
唐大飨拜洛乐章·昭和
烂俎牺荐,羞荐斯陈。黑修之牡,子器斯中。上帝配食,雍于执敬。爰容景福,永于旧章。
郊庙歌辞·祀雨师乐章·迎俎酌献
撰行协序,垂舞递成。鸾鸾凤舞,飘洞发旂。煌煌开御,穆穆雍雍。
唐明堂乐章·徵音
赫赫离精御炎陆,滔滔炽草郊开。画玉交桃养大人,谁教亿乙化殊平。
郊庙歌辞·汉宗庙乐舞辞·忠顺
明庭展赫,文物昭新。敬承茂典,敢择深衷。睟周历庙,载纬鸿休。
郊庙歌辞·梁太庙乐舞辞·登歌
既赫皇考,浚哉帝台。闓华而及,瑟彼飞香。大矣昭德,夙望明年。
郊庙歌辞·周宗庙乐舞辞·肃顺
恭彻祀礼,既以严禋。魏诚内庙,敢择良辰。载启其著,鸿休用职。
郊庙

In [46]:
DEFAULT_modelweights = os.path.join(DATA_FOLDER, 'char_rnn.h5')
model.save_weights(DEFAULT_modelweights )

In [47]:
model.optimizer.lr=0.0001
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=5)


Epoch 1/5
99959/99959 [==============================] - 278s - loss: 2.3189   
Epoch 2/5
99959/99959 [==============================] - 278s - loss: 2.1801   
Epoch 3/5
99959/99959 [==============================] - 277s - loss: 2.0666   
Epoch 4/5
99959/99959 [==============================] - 275s - loss: 1.9676   
Epoch 5/5
99959/99959 [==============================] - 275s - loss: 1.8820   
Out[47]:
<keras.callbacks.History at 0x20817684be0>

In [48]:
print_example()


行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。
入潼汤
崤函重律险,壮夫三十二。浇俗杂良霜,黄泥夜点营。凯胡氛九匣,单帐即归时。横沙何有静,谁能扫虏归。
横吹曲辞·陇头水
借问陇头水,今年陇头家。塞头年不见,明月竟不寐。
横吹曲辞·关山月
胡风月夜长,北客终无远。拂拂东飞曙,花成玉座声。草除新树地,凉吹轸离心。林黄帷阁上,砌影乱花呈。登高思遗老,含毫属昭阳。
上巳日赐裴度之张
端拱乘轩镜,昏开景重长。芳菲分日暮,娇罢在浮云。分枝怜菊蕊,风暖洒檀栊。
咏小山
近谷交萦蕊,遥峰对出莲。径细新苔树,寒山带夕华。坐此攀垂萼,今年那必寻。只辞秋雁曲,还用旧来还。
相和歌辞·阳春曲
妾妒白蘋浦,团扇薄时衣。初花复独好,花似绮蛾身。日暮西施望,春深不忆新。
相和歌辞·婕妤怨
谗谤潜来起不知,携手

In [49]:
model.optimizer.lr=0.00001
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=5)


Epoch 1/5
99959/99959 [==============================] - 278s - loss: 1.8048   
Epoch 2/5
99959/99959 [==============================] - 275s - loss: 1.7379   
Epoch 3/5
99959/99959 [==============================] - 276s - loss: 1.6785   
Epoch 4/5
99959/99959 [==============================] - 273s - loss: 1.6224   
Epoch 5/5
99959/99959 [==============================] - 274s - loss: 1.5712   
Out[49]:
<keras.callbacks.History at 0x208175c6c50>

In [50]:
print_example()


行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。
入潼河
崤函称地险,襟带壮两京。云台先著美,戈门远望新。交河拥已节,寒碛古寒霜。四兵屯九服,汉赏谁封侯。功高无边山,高轼多在张。老日无声影,三春不可穷。空馀片死事,所冀渭滨中。
琴曲歌辞·拘幽操
目掩掩兮其凝其盲,耳肃肃兮听欲闻。天气无为兮为君王圣,厚号万国兮孕八荒。天符既出兮帝业昌,愿临明祀兮降祯祥。
郊庙歌辞·享太庙乐章·景云舞
景云霏烂,告我帝符。噫帝冲德,唯传重圣。格调八簋,时居积庆。懿盛,登遐休。
郊庙歌辞·后唐宗庙乐舞辞·观成舞
穆穆王国,奕奕神猷。毖祀圜配,上帝冲宅。
郊庙歌辞·享章怀太子庙乐章·登歌酌鬯
誉阐元储,寄崇明两。玉裕虽晦,铜楼可想。弦诵辍音,笙歌罢响。币帛言设,灵心不测。
郊庙歌辞·享惠昭太子庙乐章·送神


In [51]:
model.save_weights(DEFAULT_modelweights )

用训练好了word2vec代替原来embedding layer, 在速度上有很大的提高,且词句的组织也比较好了

最后在8G的GPU上,再用Bidirectional LSMT来训练,这样的希望能看到一个好的结果


In [ ]: