About Training Data and pre-processing

Data format

Regardless of the language of the training corpus, the data must use the following format: one sentence per line, each terminated by a newline.

Example Sentence \n
Example Sentence \n
Example Sentence \n

If you are using English, you can go back to the previous page and enjoy EscortWithGAN.

Part of Japanese pre-processing

Below is example code that inserts a space between words.


In [ ]:
import os
import MeCab
import re
import sys

# Make the parent directory importable before pulling in the project helpers.
# (`sys` itself has no `append`; the search path list is `sys.path`.)
sys.path.append("../")
# NOTE(review): presumably this star import supplies `wakati`, which is
# called at the end of this cell — confirm against util.
from util import *

# Raw corpus: one sentence per line.
path = "nov18.txt"
with open(path) as fs:
    # Drop the trailing "\n" of every line while reading, and keep the result
    # under the name `sentences`, which the rest of the cell consumes.
    sentences = [line.rstrip("\n") for line in fs]

# Regex substitutions (pattern, replacement) applied in order before the
# per-character blacklist below.
_DEFAULT_PATTERNS = (
    (r"\n", ""),
    (r"\.+", ""),
    (r"\!+", "!"),
    (r"\+", ""),
    (r"\,", ""),
    (r"\*", ""),
    (r"\\", ""),
    (r"\#", ""),
    (r"\%", ""),
    (r"\(", ""),
    (r"\)", ""),
    (r'"', ""),
    (r"/", ""),
)

# Characters that are deleted outright. The string is consumed one character
# at a time, so every character listed here is removed literally.
# NOTE(review): the "[0-9]" run looks like it was meant as a digit range, but
# since the string is processed per character only "[", "0", "-", "9" and "]"
# are affected (digits 1-8 are kept) — confirm the intent.
_DEFAULT_ULTIMATES = "<「」》▽々〇《〈〉[]『』【】〔〕()〆〝〰〟〜*〈+\u3000\\-[0-9]:;<=>■▲◆⚪◎✳❤➥☆★♪♀`◯●○◇▽□▼?@Ⅱ─③①②\\&Ⅳ⋯^Ⅰ_⁇≧≦−∞↓Ⅴ→~ìúω‥⁇⁉‼※…Ω\\‘\\“\\”\\˝е’–‐—―ДтямюбйиыОлНЯβα'"


def replace_pattern(sentence, patterns=_DEFAULT_PATTERNS, ultimates=_DEFAULT_ULTIMATES):
    """Remove special characters from a sentence.

    Args:
        sentence: The text to clean; it is converted with ``str()`` first.
        patterns: Iterable of ``(regex, replacement)`` pairs, applied in order
            with ``re.sub``.
        ultimates: Iterable of strings (by default the characters of one
            string); every occurrence of each item is deleted literally.

    Returns:
        The cleaned sentence as a ``str``.
    """
    sentence = str(sentence)

    for entry in patterns:
        sentence = re.sub(entry[0], entry[1], sentence)

    # Plain string replacement instead of re.sub: characters such as "[", "?"
    # or "^" are regex metacharacters, so using them as patterns either raised
    # re.error (previously swallowed, leaving the character in place) or
    # matched something other than the character itself.
    for ultimate in ultimates:
        sentence = sentence.replace(ultimate, "")

    return sentence

# Clean every loaded sentence.
sentences_ = [replace_pattern(sentence) for sentence in sentences]

# Destination file for the cleaned corpus; fill this in before running.
save_path = ""
with open(save_path, "a") as fs:
    # Terminate every sentence with "\n" (the data format is one sentence per
    # line). The file is opened in append mode, so without the final newline a
    # later append would fuse its first sentence onto the last line.
    fs.writelines(sentence + "\n" for sentence in sentences_)

## leaving a space between words
# Destination file for the space-separated output; fill this in before running.
wakati_save_path = ""
# NOTE(review): `wakati` is not defined in this cell; presumably it comes from
# `util` and reads save_path / writes wakati_save_path — confirm.
wakati(save_path, wakati_save_path)