In [1]:
import os
import numpy as np

In [2]:
# Marker tokens that are copied through the character-level conversion as
# single tokens instead of being spelled out one character at a time.
no_convert = [
    '<s>',
    '</s>',
    '<eof>',
    '<unk>',
]

In [4]:
# Re-encode the training split at character level.
#   * words listed in `no_convert` are kept as a single token
#   * every other word is spelled out one character per token
#   * each word is followed by a "<space>" token
#   * every token is followed by one blank; every input line ends in "\n"
train_lines = []
with open('../data/ptb-punct/train.txt') as src:
    for raw_line in src:
        tokens = []
        for word in raw_line.split():
            # Reserved markers stay whole; anything else becomes its characters.
            tokens.extend([word] if word in no_convert else list(word))
            tokens.append("<space>")
        # Trailing blank after each token (including the last one), then newline.
        train_lines.append("".join(tok + " " for tok in tokens) + "\n")
train = "".join(train_lines)


True

In [5]:
# Persist the character-level training text built in `train`.
with open('../data/ptb-punct/train_char.txt', 'w') as train_out:
    train_out.write(train)

In [6]:
# Re-encode the validation split at character level, in the same token format
# used for the training text:
#   * words listed in `no_convert` are kept as a single token
#   * every other word is spelled out one character per token
#   * each word is followed by a "<space>" token
#   * every token is followed by one blank; every input line ends in "\n"
#
# Everything is accumulated into `valid` only; `train` is not touched here.
# NOTE(review): assumes the validation split is stored next to train.txt as
# valid.txt -- confirm the filename against the data directory.
valid_pieces = []
with open('../data/ptb-punct/valid.txt') as f:
    for l in f:
        for word in l.split():
            if word in no_convert:
                # Reserved marker: emit as one token.
                valid_pieces.append(word + " ")
            else:
                # Ordinary word: one token per character.
                valid_pieces.extend(c + " " for c in word)
            valid_pieces.append("<space> ")
        valid_pieces.append("\n")
# Single join instead of repeated string concatenation.
valid = "".join(valid_pieces)


True

In [7]:
# Persist the character-level validation text built in `valid`.
with open('../data/ptb-punct/valid_char.txt', 'w') as valid_out:
    valid_out.write(valid)

In [ ]: