In [1]:
# -*- coding: utf-8 -*-
import os
import re
import time
import codecs
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
BASE_FOLDER = os.getcwd()  # in a script: os.path.abspath(os.path.dirname(__file__))
DATA_FOLDER = os.path.join(BASE_FOLDER, 'data')
DEFAULT_FIN = os.path.join(DATA_FOLDER, '唐诗语料库.txt')
DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'poem.txt')
reg_noisy = re.compile('[^\u3000-\uffee]')
reg_note = re.compile('((.*))')  # cannot handle notes whose (( and )) fall on separate lines
# Chinese characters and full-width punctuation occupy \u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee
DEFAULT_Char2Vec = os.path.join(DATA_FOLDER, 'Char2Vec100.bin')
print(DEFAULT_Char2Vec)
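reg_noisy and reg_note are defined here but never applied in this transcript; a minimal sketch of the cleaning pass that presumably turned 唐诗语料库.txt into poem.txt (the exact preprocessing is an assumption) could look like:
In [ ]:
# Hypothetical cleaning pass -- an assumption, not recorded in this transcript.
def clean_corpus(fin=DEFAULT_FIN, fout=DEFAULT_FOUT):
    with codecs.open(fin, 'r', 'utf-8') as f_in, \
         codecs.open(fout, 'w', 'utf-8') as f_out:
        for line in f_in:
            line = reg_note.sub('', line)   # strip editorial notes in full-width parentheses
            line = reg_noisy.sub('', line)  # keep only CJK text and full-width punctuation
            if line.strip():
                f_out.write(line.strip() + '\n')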
In [2]:
def GetFirstNline(filePath, linesNumber):
    # Print the first linesNumber lines of a UTF-8 file.
    with codecs.open(filePath, 'r', 'utf-8') as fd:
        for i in range(linesNumber):
            print(fd.readline())
GetFirstNline(DEFAULT_FOUT, 3)
In [3]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
with codecs.open(DEFAULT_FOUT, 'r', 'utf-8') as f:
    text = f.read()
print('corpus length:', len(text))
print('{} STOP'.format(time.strftime(TIME_FORMAT)))
In [4]:
# text = text[:100000]  # uncomment to experiment on a small slice first
print('test corpus length:', len(text))
In [5]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1  # reserve index 0 for the "\0" padding character inserted below
print('total chars:', vocab_size)
print('{} STOP'.format(time.strftime(TIME_FORMAT)))
In [6]:
chars.insert(0, "\0")
In [7]:
''.join(chars[1:200])
Out[7]:
In [8]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
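A quick round-trip check (not part of the original run) confirms the two lookup tables invert each other:
In [ ]:
# Encoding then decoding a sample should reproduce it exactly.
sample = text[:40]
assert ''.join(indices_char[char_indices[c]] for c in sample) == sample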
In [9]:
idx = [char_indices[c] for c in text]
In [10]:
idx[:10]
Out[10]:
In [11]:
''.join(indices_char[i] for i in idx[1000:1080])
Out[11]:
In [12]:
DEFAULT_Char2Vec100 = os.path.join(DATA_FOLDER, 'Char2Vec100.bin')
from gensim.models import word2vec
import numpy as np
model = word2vec.Word2Vec.load(DEFAULT_Char2Vec100)
model[u'行']
Out[12]:
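A quick sanity check on the loaded vectors (not in the original transcript; uses the same older gensim API the rest of the notebook relies on):
In [ ]:
print(model.vector_size)                  # expect 100, matching n_fac below
print(model.most_similar(u'行', topn=5))  # qualitative check of the embedding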
In [13]:
print('creating embedding matrix...')
embedding_matrix = np.zeros((vocab_size, 100))
found = 0
for i, c in enumerate(chars):
    if c in model:  # older gensim API; newer versions use model.wv
        embedding_matrix[i] = model[c]
        found += 1
    # characters missing from the Char2Vec model keep all-zero rows
print('Found %d word vectors for %d characters.' % (found, vocab_size))
In [14]:
maxlen = 80
sentences = []
next_chars = []
# Slide an 80-character window over the corpus; the target for each window
# is the same slice shifted one character to the right.
for i in range(0, len(idx) - maxlen + 1):
    sentences.append(idx[i: i + maxlen])
    next_chars.append(idx[i + 1: i + maxlen + 1])
print('nb sequences:', len(sentences))
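Each target window is the input window shifted one character to the left, which is what lets the per-timestep softmax below learn next-character prediction at every position; a quick look (not in the original run):
In [ ]:
print(''.join(indices_char[i] for i in sentences[0][:10]))   # input window
print(''.join(indices_char[i] for i in next_chars[0][:10]))  # same text shifted by one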
In [15]:
import numpy as np
import keras
from keras.layers import TimeDistributed, Activation
from numpy.random import choice
In [16]:
# The final target window is one character short, so drop the last
# two windows and stack the rest into (n, maxlen) integer arrays.
sentences = np.concatenate([[np.array(o)] for o in sentences[:-2]])
next_chars = np.concatenate([[np.array(o)] for o in next_chars[:-2]])
In [17]:
sentences.shape, next_chars.shape
Out[17]:
In [18]:
n_fac = 100  # embedding width, matching the 100-dimensional Char2Vec vectors
In [19]:
from keras import backend as K
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional
from keras.layers import TimeDistributed, Activation
from keras.layers.core import Dense, Dropout
from keras.optimizers import Adam
In [20]:
model = Sequential([
    # Frozen 100-d character embeddings from the pretrained Char2Vec model.
    Embedding(vocab_size, n_fac, input_length=maxlen, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2, implementation=2)),
    Dropout(0.2),
    Bidirectional(LSTM(512, return_sequences=True, dropout=0.2, recurrent_dropout=0.2, implementation=2)),
    Dropout(0.2),
    # Softmax over the vocabulary at every timestep.
    TimeDistributed(Dense(vocab_size)),
    Activation('softmax')
])
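A model.summary() call (not in the original transcript) is a cheap way to confirm the shapes before training:
In [ ]:
model.summary()  # final output shape should be (None, 80, vocab_size)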
In [21]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())
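sparse_categorical_crossentropy expects integer class ids with a trailing length-1 axis, which is why the targets are wrapped in np.expand_dims(next_chars, -1) below; a minimal shape check (not in the original run):
In [ ]:
print(np.expand_dims(next_chars, -1).shape)  # expect (len(sentences), 80, 1)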
In [22]:
def print_example():
    # Seed with 80 corpus characters, then generate 640 more, one at a time.
    seed_string = u'世岂邀名。星旂纷电举,日羽肃天行。遍野屯万骑,临原驻五营。登山麾武节,背水纵神兵。在昔戎戈动,今来宇宙平。\r\n入潼关\r\n崤函称地险,襟带壮两京。霜峰直临道,冰河'
    for i in range(640):
        x = np.array([char_indices[c] for c in seed_string[-80:]])[np.newaxis, :]
        preds = model.predict(x, verbose=0)[0][-1]  # distribution over the next character
        preds = preds / np.sum(preds)               # renormalize against float rounding
        next_char = choice(chars, p=preds)
        seed_string = seed_string + next_char
    print(seed_string)
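print_example samples from the raw softmax; a common char-RNN refinement, not used in this run, is temperature scaling, which trades diversity against coherence:
In [ ]:
# Hypothetical variant: temperature-scaled sampling (an assumption, not part of this transcript).
def sample_with_temperature(preds, temperature=0.8):
    preds = np.asarray(preds, dtype=np.float64)
    preds = np.log(preds + 1e-8) / temperature  # <1 sharpens, >1 flattens the distribution
    preds = np.exp(preds)
    return choice(chars, p=preds / np.sum(preds))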
In [23]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=200, epochs=1)
Out[23]:
In [24]:
print_example()
DEFAULT_modelweights = os.path.join(DATA_FOLDER, 'char_rnn.h5')
model.save_weights(DEFAULT_modelweights)
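Weights are saved by hand after each fit call here; a ModelCheckpoint callback is a standard alternative (not what this transcript does):
In [ ]:
# Alternative: let Keras save weights automatically after each epoch.
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint(DEFAULT_modelweights, save_weights_only=True)
# model.fit(sentences, np.expand_dims(next_chars, -1),
#           batch_size=200, epochs=1, callbacks=[checkpoint])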
In [ ]:
DEFAULT_modelweights = os.path.join(DATA_FOLDER, 'char_rnn.h5')
model.load_weights(DEFAULT_modelweights)
In [44]:
print('{} START'.format(time.strftime(TIME_FORMAT)))
K.set_value(model.optimizer.lr, 0.0001)  # update the lr variable in place; a bare attribute assignment is ignored by the compiled graph
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=200, epochs=1)
print('{} STOP'.format(time.strftime(TIME_FORMAT)))
Out[44]:
In [45]:
print_example()
In [46]:
DEFAULT_modelweights = os.path.join(DATA_FOLDER, 'char_rnn.h5')
model.save_weights(DEFAULT_modelweights)
In [47]:
K.set_value(model.optimizer.lr, 0.00001)  # as above, update the lr variable in place
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=200, epochs=1)
Out[47]:
In [48]:
print_example()
In [ ]:
print_example()
In [51]:
model.save_weights(DEFAULT_modelweights)