In [1]:
import os
import numpy as np
In [2]:
# Special tokens that are copied through as single symbols; every other
# word is split into individual characters by the conversion cells below.
no_convert = [
    '<s>',
    '</s>',
    '<eof>',
    '<unk>',
]
In [4]:
# Build the character-level version of the training corpus.
#
# Output format, per input line:
#   * a token listed in `no_convert` is emitted unchanged,
#   * any other token is emitted as its characters separated by spaces,
#   * every token is followed by the marker "<space> ",
#   * the line is terminated with "\n".
#
# NOTE(review): the exported cell lost its indentation; this assumes the
# "<space>" marker follows every token (special tokens included) — confirm
# against the original notebook.
#
# Pieces are collected per line and joined once, instead of growing one huge
# string with repeated `+=`.
train_lines = []
with open('../data/ptb-punct/train.txt') as f:
    for line in f:
        encoded_words = []
        for word in line.split():
            # "abc" -> "a b c"; special tokens stay whole.
            symbol = word if word in no_convert else " ".join(word)
            encoded_words.append(symbol + " <space> ")
        train_lines.append("".join(encoded_words) + "\n")
train = "".join(train_lines)
In [5]:
# Save the character-level training text built in the previous cell.
with open('../data/ptb-punct/train_char.txt', 'w') as out_file:
    out_file.write(train)
In [6]:
# Build the character-level version of the validation corpus.
#
# Output format, per input line:
#   * a token listed in `no_convert` is emitted unchanged,
#   * any other token is emitted as its characters separated by spaces,
#   * every token is followed by the marker "<space> ",
#   * the line is terminated with "\n".
#
# Fixes relative to the previous version of this cell:
#   * separators, "<space>" markers and newlines were appended to `train`
#     instead of `valid`, corrupting both variables;
#   * the cell read train.txt, so the "validation" output was derived from
#     training data.
# NOTE(review): assumes the validation split lives at
# ../data/ptb-punct/valid.txt — confirm the file name.
# NOTE(review): the exported cell lost its indentation; this assumes the
# "<space>" marker follows every token (special tokens included) — confirm.
valid_lines = []
with open('../data/ptb-punct/valid.txt') as f:
    for line in f:
        encoded_words = []
        for word in line.split():
            # "abc" -> "a b c"; special tokens stay whole.
            symbol = word if word in no_convert else " ".join(word)
            encoded_words.append(symbol + " <space> ")
        valid_lines.append("".join(encoded_words) + "\n")
valid = "".join(valid_lines)
In [7]:
# Save the character-level validation text built in the previous cell.
with open('../data/ptb-punct/valid_char.txt', 'w') as out_file:
    out_file.write(valid)
In [ ]: