In [68]:
#coding=utf-8
import pickle
import os
import re
def load_text(path, encoding='utf-8'):
    """Read a whole text file and return its contents as a single string.

    Args:
        path: Location of the file (``str``, ``bytes`` or path-like). It is
            used as given; a leading ``~`` is not expanded.
        encoding: Codec used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        The complete decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    # A single-argument os.path.join only normalised the argument through
    # os.fspath; calling fspath directly states that intent.
    input_file = os.fspath(path)
    with open(input_file, 'r', encoding=encoding) as f:
        return f.read()
def preprocess_and_save_data(text, token_lookup, create_lookup_tables):
    """Encode ``text`` character by character and pickle the result.

    Args:
        text: The corpus as one string.
        token_lookup: Zero-argument callable returning a dict; every key
            found in ``text`` is replaced by the formatted value.
        create_lookup_tables: Callable that receives the list of characters
            and returns a ``(vocab_to_int, int_to_vocab)`` pair.

    Returns:
        None. The tuple ``(int_text, vocab_to_int, int_to_vocab, token_dict)``
        is written to ``preprocess.p`` in the current working directory.
    """
    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, '{}'.format(token))

    # list() on a string splits it into single characters, so the vocabulary
    # is built per character rather than per whitespace-separated word.
    text = list(text)

    vocab_to_int, int_to_vocab = create_lookup_tables(text)
    int_text = [vocab_to_int[word] for word in text]

    # Use a context manager so the file is flushed and closed even if
    # pickling raises.
    with open('preprocess.p', 'wb') as out_file:
        pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), out_file)
def load_preprocess():
    """Load and return the object pickled in ``preprocess.p``.

    The file is looked up in the current working directory.
    ``preprocess_and_save_data`` writes it as the tuple
    ``(int_text, vocab_to_int, int_to_vocab, token_dict)``.

    Raises:
        OSError: If ``preprocess.p`` cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook produced itself.
    with open('preprocess.p', mode='rb') as in_file:
        return pickle.load(in_file)
def save_params(params):
    """Pickle ``params`` into ``params.p`` in the current working directory.

    Args:
        params: Any picklable object.

    Returns:
        None.
    """
    # The context manager guarantees the file is flushed and closed.
    with open('params.p', 'wb') as out_file:
        pickle.dump(params, out_file)
def load_params():
    """Load and return the object pickled in ``params.p``.

    The file is looked up in the current working directory; ``save_params``
    is the function in this file that writes it.

    Raises:
        OSError: If ``params.p`` cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook produced itself.
    with open('params.p', mode='rb') as in_file:
        return pickle.load(in_file)
In [69]:
# Directory that holds the corpus, and the name of the raw file inside it.
# NOTE(review): the directory is a hard-coded per-user path; adjust it when
# running elsewhere.
data_dir = '~/Documents/fastai-notes/deeplearning1/rnn/text/'
filename = 'gucheng.txt'
# IPython magic: change the kernel's working directory to data_dir
# ($data_dir is expanded by IPython). The relative file names used below
# (the input here and 'gucheng_compiled.txt' later) resolve against it.
%cd $data_dir
# Read the whole raw corpus into one string.
text = load_text(filename)
In [70]:
# Show the first 500 characters of the freshly loaded text.
text[:500]
Out[70]:
In [71]:
# Remove filler characters, one kind per step.
text = text.replace(' ', '')       # plain spaces
text = text.replace('……', '')      # ellipsis marks
text = text.replace('\u3000', '')  # U+3000 ideographic (full-width) spaces
In [72]:
# Cut every line off at the first '19' (and then at the first '一九'),
# keeping only the line's newline. These look like year stamps, presumably
# composition dates - confirm against the raw file.
text = re.compile('(19.*?)\\n').sub('\n', text)
text = re.compile('(一九.*?)\\n').sub('\n', text)
In [73]:
# Collapse each run of two or more newlines into a single '&' marker, so the
# runs stay distinguishable from the single newlines that remain.
text = re.sub(pattern=r'\n{2,}', repl='&', string=text)
In [74]:
# Show the first 500 characters after the blank-line runs became '&'.
text[:500]
Out[74]:
In [75]:
# Where an '&' marker is followed by a run of Chinese characters/commas, a
# newline and another Chinese character, swap that newline for ':'.
# Presumably the first line after a marker is a poem title - confirm.
# NOTE(review): the class starts at U+4300; the CJK Unified Ideographs block
# usually starts at U+4E00 - confirm the wider range is intended.
text = re.compile('(&[\u4300-\u9fff,]+)\n([\u4300-\u9fff])').sub(r'\1:\2', text)
In [76]:
# Show the first 500 characters after the ':' join step.
text[:500]
Out[76]:
In [77]:
# Turn the numbered section markers (一), (二) and (三) into ':'.
text = (
    text.replace('(一)', ':')
        .replace('(二)', ':')
        .replace('(三)', ':')
)
In [78]:
# Flatten the remaining single newlines into spaces.
text = text.replace('\n', ' ')
# Drop the table-of-contents heading. Every U+3000 was already stripped from
# the text in an earlier cell, so the spaced form '目\u3000录' can no longer
# occur by this point; the bare form '目录' is removed as well so the step
# actually takes effect.
text = text.replace(u'目\u3000录', '').replace('目录', '')
In [79]:
# Show the first 500 characters after newlines were flattened to spaces.
text[:500]
Out[79]:
In [80]:
# Delete every span that starts at '上一页' and runs to the nearest '顾城'
# directly followed by an '&' marker (the marker is removed too). This looks
# like page-navigation residue from the source pages - confirm.
text = re.compile('(上一页.*?顾城)&').sub('', text)
In [81]:
# Turn each '&' marker back into a newline.
text = text.replace('&', '\n')
# A space between two Chinese characters is a flattened line break: make it
# a comma. The neighbours are matched with zero-width lookarounds so they are
# not consumed; a consuming pattern skips every second gap when one-character
# segments are adjacent ("啊 我 的" would become "啊,我的", not "啊,我,的").
text = re.sub('(?<=[\u4300-\u9fff]) (?=[\u4300-\u9fff])', ',', text)
# Any space still left is not between two Chinese characters; drop it.
text = text.replace(' ', '')
In [82]:
# Spot-check a 2000-character slice from the middle of the cleaned text.
print(text[27000:29000])
In [83]:
# Save the cleaned corpus into the current working directory. The encoding
# is pinned to UTF-8 so the Chinese text is written the same way on every
# platform instead of following the locale default.
with open('gucheng_compiled.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(text)
In [ ]: