About Training Data and pre-processing

Data format

Regardless of the language of the training corpus, the data must be in the following format.

If you are using English, you can go back to the previous page and enjoy EscortWithGAN.

Part of Japanese pre-processing

The following is just example code for inserting a space between words.

In [ ]:
import os 
import MeCab
import re
import sys
from util import *

# Raw corpus file; it is read as one entry per line.
# NOTE(review): the file is opened with the platform default encoding —
# confirm whether an explicit encoding should be passed for this corpus.
path = "nov18.txt"
with open(path) as fs:
    sentences = fs.readlines()

# Remove the trailing "\n" from every line. The result is bound back to
# `sentences` (the original assigned it to a misspelled name, so the
# stripped lines were never used by the code that follows).
sentences = [sentence.rstrip("\n") for sentence in sentences]

# Define a function that removes special characters.
def replace_pattern(sentence, patterns=[[r"\n", ""], [r"\.+",""],[r"\!+", "!"], [r"\+", ""], ["\,", ""], ["\*", ""], [r"\\", ""],["\#", ""], ["\%", ""], ["\(", ""], ["\)", ""], ['"', ""], ["/", ""] ], ultimates="<「」》▽々〇《〈〉[]『』【】〔〕()〆〝〰〟〜*〈+\u3000\-[0-9]:;<=>■▲◆⚪◎✳❤➥☆★♪♀`◯●○◇▽□▼?@Ⅱ─③①②\\&Ⅳ⋯^Ⅰ_⁇≧≦−∞↓Ⅴ→~ìúω‥⁇⁉‼※…Ω\‘\“\”\˝е’–‐—―ДтямюбйиыОлНЯβα\'"):
    """Return ``sentence`` with unwanted characters rewritten or removed.

    Args:
        sentence: Text to clean. It is converted with ``str()`` first, so
            non-string values are accepted.
        patterns: Sequence of ``[regex, replacement]`` pairs. Each pair is
            applied in order with ``re.sub``. The default list is only read,
            never mutated.
        ultimates: Iterable of literal strings to delete. When a ``str`` is
            given (the default), every character of it is deleted from the
            sentence.

    Returns:
        The cleaned sentence as a ``str``.
    """
    sentence = str(sentence)

    # Regex-based substitutions, applied in the order given.
    for rule in patterns:
        sentence = re.sub(rule[0], rule[1], sentence)

    # Entries of `ultimates` are removed literally. Passing them to re.sub,
    # as before, raised re.error for characters such as "[", "(", "*", "?"
    # or a lone backslash, and "^" matched nothing instead of the caret.
    # NOTE(review): the default contains the character sequence "[0-9]";
    # taken character by character this removes only "[", "0", "-", "9" and
    # "]". Confirm whether stripping every digit was intended.
    for ultimate in ultimates:
        sentence = sentence.replace(ultimate, "")
    return sentence

# Clean every sentence and collect the results.
# NOTE(review): the original `for` statement had no body (a SyntaxError);
# appending the cleaned sentence is reconstructed from the surrounding
# names — confirm against the intended notebook cell.
sentences_ = []
for sentence in sentences:
    sentences_.append(replace_pattern(sentence))

# Destination for the cleaned text; set this before running.
save_path = ""
# NOTE(review): the original `with` statement also had no body; writing one
# cleaned sentence per line is an assumption — confirm the format that
# `wakati` expects. The file is opened in append mode, as in the original.
with open(save_path, "a") as fs:
    for sentence in sentences_:
        fs.write(sentence + "\n")

## Leave a space between words.
# Destination for the space-separated output; set this before running.
wakati_save_path = ""
# `wakati` is not defined in this file; presumably it comes from
# `from util import *` — verify.
wakati(save_path, wakati_save_path)