Regardless of the language of the training corpus, the data must use the following format (one sentence per line):
Example Sentence \n
Example Sentence \n
Example Sentence \n
If you are using English, you can go back to the previous page and use EscortWithGAN directly.
The code below is just an example of how to insert a space between words.
In [ ]:
import os
import MeCab
import re
import sys

# Extend the module search path with the parent directory before importing
# the project-local `util` module (presumably it lives one level up).
# `sys` itself has no `append`; the search path list is `sys.path`.
sys.path.append("../")
from util import *
# Path of the raw corpus: one sentence per line.
path = "nov18.txt"

# NOTE(review): the file is opened with the platform default encoding;
# confirm that it matches the encoding of the corpus.
with open(path) as fs:
    # Read every line and drop its trailing newline. The result is stored
    # in `sentences`, the name the rest of the script iterates over.
    sentences = [line.rstrip("\n") for line in fs]
# Regex substitutions applied first, in order: (pattern, replacement).
_DEFAULT_PATTERNS = (
    (r"\n", ""),
    (r"\.+", ""),
    (r"\!+", "!"),  # collapse a run of "!" into a single "!"
    (r"\+", ""),
    (r"\,", ""),
    (r"\*", ""),
    (r"\\", ""),
    (r"\#", ""),
    (r"\%", ""),
    (r"\(", ""),
    (r"\)", ""),
    ('"', ""),
    ("/", ""),
)

# Characters that are deleted one by one after the substitutions above.
# NOTE(review): the characters are processed individually, so "[0-9]" below
# is NOT a digit class -- it only contributes "[", "0", "-", "9" and "]".
# Digits 1-8 are therefore kept; confirm whether that is intended.
_DEFAULT_ULTIMATES = (
    "<「」》▽々〇《〈〉[]『』【】〔〕()〆〝〰〟〜*〈+\u3000\\-[0-9]:;<=>"
    "■▲◆⚪◎✳❤➥☆★♪♀`◯●○◇▽□▼?@Ⅱ─③①②\\&Ⅳ⋯^Ⅰ_⁇≧≦−∞↓Ⅴ→~"
    "ìúω‥⁇⁉‼※…Ω\\‘\\“\\”\\˝е’–‐—―ДтямюбйиыОлНЯβα'"
)


def replace_pattern(sentence, patterns=None, ultimates=_DEFAULT_ULTIMATES):
    """Remove special characters from a sentence.

    Args:
        sentence: The text to clean; it is converted with ``str()`` first.
        patterns: Sequence of ``[regex, replacement]`` pairs applied in
            order with ``re.sub``. ``None`` selects the built-in defaults.
        ultimates: What to delete after ``patterns`` has been applied.
            If it is a string, every character in it is removed
            *literally*, so regex metacharacters such as ``[``, ``?`` and
            ``^`` are removed as well. Any other iterable is treated as a
            collection of regular expressions, each replaced by ``""``;
            elements that are not valid regular expressions are skipped.

    Returns:
        The cleaned sentence as a ``str``.
    """
    if patterns is None:
        patterns = _DEFAULT_PATTERNS

    text = str(sentence)
    for entry in patterns:
        text = re.sub(entry[0], entry[1], text)

    if isinstance(ultimates, str):
        # Literal, single-pass deletion. Compiling each character as a
        # regex fails for metacharacters, which then never get removed.
        return text.translate(str.maketrans("", "", ultimates))

    for expression in ultimates:
        try:
            text = re.sub(expression, "", text)
        except re.error:
            # Best effort: an invalid expression is skipped, not fatal.
            continue
    return text
# Output locations -- both are placeholders to be filled in before running.
save_path = ""
wakati_save_path = ""

# Run every sentence of the corpus through the special-character filter.
sentences_ = list(map(replace_pattern, sentences))

# Append the cleaned sentences to `save_path`, separated by newlines.
with open(save_path, "a") as cleaned_file:
    cleaned_file.write("\n".join(sentences_))

## leaving a space between words
# `wakati` is not defined in this cell; presumably it comes from `util`.
wakati(save_path, wakati_save_path)