In [5]:
# -*- coding: utf-8 -*-
import os
import re
import time
import codecs
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
BASE_FOLDER = os.getcwd() # os.path.abspath(os.path.dirname(__file__))
DATA_FOLDER = os.path.join(BASE_FOLDER, 'data')
DEFAULT_FIN = os.path.join(DATA_FOLDER, '唐诗语料库.txt')
DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')
reg_noisy = re.compile('[^\u3000-\uffee]')  # anything outside the CJK / full-width range counts as noise
reg_note = re.compile(r'\((.*)\)')  # editorial notes in parentheses; cannot handle ( and ) on separate lines
# Chinese characters and full-width punctuation occupy \u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee
DEFAULT_Char2Vec = os.path.join(DATA_FOLDER, 'Char2Vec100.bin')
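To make the two regexes concrete, here is a quick check on a made-up sample line (illustrative only, not from the corpus): reg_noisy strips everything outside the CJK/full-width range, while reg_note strips parenthesized editorial notes.
In [ ]:
sample = '春眠不觉晓abc123(注)'   # made-up line: poem text + ASCII noise + a note
print(reg_noisy.sub('', sample))  # drops the ASCII run and parentheses: 春眠不觉晓注
print(reg_note.sub('', sample))   # drops the parenthesized note: 春眠不觉晓abc123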
In [6]:
os.getcwd() # check the current working directory
Out[6]:
In [7]:
def GetFirstNline(filePath, linesNumber):
    # Print the first `linesNumber` lines of a file
    with codecs.open(filePath, 'r', 'utf-8') as fd:
        for i in range(linesNumber):
            print(fd.readline())
GetFirstNline(DEFAULT_FOUT, 3)
In [8]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
with codecs.open(DEFAULT_FOUT, 'r', 'utf-8') as f:
    text = f.read()
print('corpus length:', len(text))
print('{} STOP'.format(time.strftime(TIME_FORMAT)))
In [9]:
text = text[:100000]
print('test corpus length:', len(text))
In [10]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1  # +1 reserves index 0 for the padding character "\0" inserted below
print('total chars:', vocab_size)
print('{} STOP'.format(time.strftime(TIME_FORMAT)))
In [13]:
from pypinyin import pinyin, Style
In [46]:
char_py = pinyin(chars, style=Style.TONE2)  # pinyin with inline tone digits

def Pinze(strPy):
    # Tone class: 1 = level tone (平), 0 = oblique tone (仄), -1 = no pinyin
    ret = -1
    if '1' in strPy: ret = 1  # level (平)
    if '2' in strPy or '3' in strPy or '4' in strPy: ret = 0  # oblique (仄)
    return ret

char_pz = [Pinze(pz[0]) for pz in char_py]  # level/oblique (平仄) per character
char_mu = [''] * vocab_size  # rhyme final (韵母) per character
# Regex matching the tone digit that the TONE2 style embeds in the final
RE_TONE3 = re.compile('^([a-z]+)([1-4])([a-z]*)$')
# Table of initials (声母)
_INITIALS = 'b,p,m,f,d,t,n,l,g,k,h,j,q,x,zh,ch,sh,r,z,c,s'.split(',')
# Initials table that also treats y and w as initials
_INITIALS_NOT_STRICT = _INITIALS + ['y', 'w']
for i, pz in enumerate(char_pz):
    if pz != -1:
        py = re.sub(r'[1-4]', '', char_py[i][0])  # strip the tone digit
        for sm in _INITIALS_NOT_STRICT:
            if py.startswith(sm):
                py = py[len(sm):]  # strip the initial, leaving the final
                break
        char_mu[i] = py
char_pz[:20], char_mu[:20]
yunmu = sorted(list(set(char_mu)))
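A quick spot check (added here, not in the original) of what the TONE2 style returns and how Pinze reads it; the exact romanization can vary with the pypinyin version.
In [ ]:
# TONE2 embeds the tone digit right after the tone-marked vowel, e.g. 'tia1n'
example_py = pinyin(['天'], style=Style.TONE2)[0][0]
example_py, Pinze(example_py)  # tone 1 -> level tone (平), i.e. 1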
In [49]:
yunmu_indices = dict((c, i) for i, c in enumerate(yunmu))
char_yumu = [yunmu_indices[c] for c in char_mu]
In [60]:
# Index 0 will hold the padding character "\0": no tone, no final
char_yumu.insert(0, -1)
char_pz.insert(0, -1)
In [50]:
chars.insert(0, "\0")
In [51]:
''.join(chars[1:200])
Out[51]:
In [52]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
In [53]:
idx = [char_indices[c] for c in text]
In [54]:
idx[:10]
Out[54]:
In [55]:
''.join(indices_char[i] for i in idx[1000:1040])
Out[55]:
In [56]:
DEFAULT_Char2Vec50 = os.path.join(DATA_FOLDER, 'Char2Vec50.bin')
from gensim.models import word2vec
import numpy as np
model = word2vec.Word2Vec.load(DEFAULT_Char2Vec50)
model[u'行']
Out[56]:
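A quick way to eyeball the embedding quality (an optional check using the same old-style gensim API as the cell above):
In [ ]:
model.most_similar(u'行', topn=5)  # nearest characters in the 50-d embedding space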
In [67]:
print('creating embedding matrix...')
embedding_matrix = np.zeros((vocab_size, 50 + 2))
for i, c in enumerate(chars):
    if c in model:
        # Prepend the tone flag and rhyme-final index to the 50-d word2vec vector;
        # characters missing from the word2vec model keep an all-zero row
        embedding_matrix[i] = np.insert(model[c], 0, (char_pz[i], char_yumu[i]))
print('embedding matrix shape:', embedding_matrix.shape)
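As a sanity check (not in the original notebook), the matrix shape and the first two columns of one row can be inspected; they should be that character's tone flag and rhyme-final index.
In [ ]:
# Row layout: [tone flag (平仄), final index (韵母), 50-d word2vec vector]
embedding_matrix.shape, embedding_matrix[char_indices['行']][:2]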
In [68]:
maxlen = 40
sentences = []
next_chars = []
for i in range(0, len(idx) - maxlen + 1):
    sentences.append(idx[i: i + maxlen])
    next_chars.append(idx[i + 1: i + maxlen + 1])
print('nb sequences:', len(sentences))
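One way to verify the windows line up (a check added here): decoding the first window and its target should give the same text shifted by one character.
In [ ]:
# next_chars[0] should be sentences[0] shifted left by one position
print(''.join(indices_char[i] for i in sentences[0]))
print(''.join(indices_char[i] for i in next_chars[0]))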
In [69]:
import numpy as np
import keras
from keras.layers import TimeDistributed, Activation
from numpy.random import choice
In [70]:
# Drop the last two windows: the final next_chars window is one element short
sentences = np.concatenate([[np.array(o)] for o in sentences[:-2]])
next_chars = np.concatenate([[np.array(o)] for o in next_chars[:-2]])
In [71]:
sentences.shape, next_chars.shape
Out[71]:
In [72]:
n_fac = 50 + 2  # 50-d word2vec vector plus the tone and rhyme-final features
In [73]:
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation, SimpleRNN, GRU
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.regularizers import l2, l1
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
#from keras.utils.layer_utils import layer_from_config
from keras.layers import deserialize as layer_from_config
from keras.metrics import categorical_crossentropy, categorical_accuracy
from keras.layers.convolutional import *
from keras.preprocessing import image, sequence
from keras.preprocessing.text import Tokenizer
In [ ]:
In [74]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=maxlen, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2, implementation=2)),
    Dropout(0.2),
    Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2, implementation=2)),
    Dropout(0.2),
    TimeDistributed(Dense(vocab_size)),
    Activation('softmax')
])
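Inspecting the architecture (an added check): each of the 40 timesteps should end in a distribution over the whole vocabulary, i.e. an output shape of (None, 40, vocab_size).
In [ ]:
model.summary()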
In [75]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())  # integer targets, hence the sparse loss
In [76]:
def print_example():
    seed_string = u'行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。\n入'
    for i in range(320):
        # Feed the last 40 characters, then sample the next one from the softmax output
        x = np.array([char_indices[c] for c in seed_string[-40:]])[np.newaxis, :]
        preds = model.predict(x, verbose=0)[0][-1]
        preds = preds / np.sum(preds)  # renormalize to guard against float drift
        next_char = choice(chars, p=preds)
        seed_string = seed_string + next_char
    print(seed_string)
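print_example draws directly from the raw softmax. A common refinement, sketched here as an optional helper (not part of the original notebook), is temperature sampling: dividing the log-probabilities by a temperature T sharpens the distribution for T < 1 and flattens it for T > 1. Swapping preds = preds / np.sum(preds) for preds = apply_temperature(preds) inside the loop would make the samples more conservative.
In [ ]:
def apply_temperature(preds, temperature=0.8):
    # Hypothetical helper: re-weight a probability vector by a sampling temperature
    preds = np.log(np.maximum(preds, 1e-10)) / temperature
    preds = np.exp(preds)
    return preds / np.sum(preds)  # renormalize back to a valid distribution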
In [77]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=1)
Out[77]:
In [78]:
print_example()
In [44]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=5)
Out[44]:
In [45]:
print_example()
In [46]:
DEFAULT_modelweights = os.path.join(DATA_FOLDER, 'char_rnn.h5')
model.save_weights(DEFAULT_modelweights)
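If the session is restarted, the weights can be restored with the standard Keras load_weights call after re-running the model-definition cell above:
In [ ]:
# model.load_weights(DEFAULT_modelweights)  # restore after rebuilding the same architecture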
In [47]:
K.set_value(model.optimizer.lr, 0.0001)  # assigning a plain float to optimizer.lr would not reach the compiled graph
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=5)
Out[47]:
In [48]:
print_example()
In [49]:
K.set_value(model.optimizer.lr, 0.00001)
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=5)
Out[49]:
In [50]:
print_example()
In [51]:
model.save_weights(DEFAULT_modelweights)
In [ ]: