"data/nine_dreams/ninedreams.txt" IS REQUIRED

SPECIFY FILE ENCODING TYOE IN PYTHON


In [1]:
# -*- coding: utf-8 -*-
print ("UTF-8 ENCODING")


UTF-8 ENCODING

LOAD PACKAGES


In [3]:
import chardet # https://github.com/chardet/chardet
import glob
import codecs
import sys
import os
from TextLoader import *
from Hangulpy import *
print ("PACKAGES LOADED")


PACKAGES LOADED

In [5]:

CONVERT UTF8-ENCODED TXT FILE


In [6]:
def conv_file(fromfile, tofile):
    with open(fromfile, "rb") as f:
        sample_text=f.read(10240)
    pred = chardet.detect(sample_text)
    if not pred['encoding'] in ('EUC-KR', 'UTF-8', 'CP949', 'UTF-16LE'):
        print ("WARNING! Unknown encoding! : %s = %s") % (fromfile, pred['encoding'])
        pred['encoding'] = "CP949" # 못찾으면 기본이 CP949
        formfile = fromfile + ".unknown"
    elif pred['confidence'] < 0.9:
        print ("WARNING! Unsured encofing! : %s = %s / %s")
        % (fromfile, pred['confidence'], pred['encoding'])
        formfile = fromfile + ".notsure"
    with codecs.open(fromfile, "r", encoding=pred['encoding'], errors="ignore") as f:
        with codecs.open(tofile, "w+", encoding="utf8") as t:
            all_text = f.read()
            t.write(all_text)

"data/nine_dreams/ninedreams_utf8.txt" IS GENERATED


In [7]:
# Downloded data
dataname = 'invisible_dragon'
# SOURCE TXT FILE
fromfile = os.path.join("data",dataname,"rawtext.txt")
# TARGET TXT FILE
tofile   = os.path.join("data",dataname,"rawtext_utf8.txt")
conv_file(fromfile, tofile)
print ("UTF8-CONVERTING DONE")
print (" [%s] IS GENERATED" % (tofile))


UTF8-CONVERTING DONE
 [data/invisible_dragon/rawtext_utf8.txt] IS GENERATED

DECOMPOSE HANGUL (THIS PART IS IMPORTANT!)


In [8]:
def dump_file(filename):
    result=u"" # <= UNICODE STRING 
    with codecs.open(filename,"r", encoding="UTF8") as f:
        for line in f.readlines():
            line = tuple(line)
            result = result + decompose_text(line)
    return result
print ("FUNCTION READY")


FUNCTION READY

PYTHON 2 AND 3 COMPATIBILITY


In [9]:
if sys.version_info.major == 2:
    parsed_txt = dump_file(tofile).encode("utf8") 
else:
    parsed_txt = dump_file(tofile) 

print ("Parsing %s done" % (tofile))
# PRINT FIRST 100 CHARACTERS
print (parsed_txt[:100])


Parsing data/invisible_dragon/rawtext_utf8.txt done

ㅇㅕᴥㄹㅓᴥㅂㅡㄴᴥ ㅈㅐᴥㄱㅏᴥ ㄷㅡᴥㄷㅣᴥㅇㅓᴥ ㄱㅡㄹᴥㅇㅡ�

"data/nine_dreams/input.txt" IS GENERATED


In [13]:
with open(os.path.join("data",dataname,"input.txt"), "w") as text_file:
    text_file.write(parsed_txt)
print ("Saved to a txt file")
print (text_file)


Saved to a txt file
<closed file 'data/invisible_dragon/input.txt', mode 'w' at 0x7f7aaaecedb0>

COMPOSE HANGUL CHARACTER FROM PHONEME


In [14]:
data=[u'\u3147', u'\u3157', u'\u1d25', u'\u3134', u'\u3161', u'\u3139', u'\u1d25'
      , u' ', u'\u314f', u'\u3147', u'\u3145', u'\u314f', u'\u1d25', u'\u1d25'
      , u'\u3163', u'\u1d25', u' ', u'\u3147', u'\u1d25', u'\u3155', u'\u1d25'
      , u'\u3134', u'\u314f', u'\u1d25', u'\u3155', u'\u3147', u'\u1d25'
      , u'\u315b', u'\u3131', u'\u1d25', u'\u3147', u'\u3139', u'\u3146'
      , u'\u1d25', u'\u3137', u'\u314f', u'\u314e', u'\u3139', u'\u1d25'
      , u'\u3134', u'\u1d25', u'\u3145', u'\u3163', u'\u1d25', u'\u1d25'
      , u'\u314f', u'\u1d25', u'\u314e', u'\u314f', u'\u3147', u'\u3131'
      , u'\u3157', u'\u3134', u'\u1d25', u'\u1d25', u'\u315b', u'\u1d25'
      , u'\u3148', u'\u3153', u'\u3136', u'\u1d25', u' ', u'\u3145', u'\u3150'
      , u'\u3141', u'\u3136', u'\u3161', u'\u3134', u'\u3163', u'\u1d25', u'.'
      , u'\u3148', u'\u3153', u'\u3134', u'\u314e', u'\u3153', u'\u1d25', u'\u1d25'
      , u'\u3147', u'\u314f', u'\u3134', u'\u3148', u'\u314f', u'\u3139', u'\u315d'
      , u'\u314c', u'\u1d25', u'\u3161', u'\u3134', u'\u3148', u'\u3163', u'\u313a'
      , u'\u1d25', u' ', u'\u3147', u'\u3161', u'\u3146', u'\u1d25', u'?', u'\u3134'
      , u'\u1d25', u'\u314e', u'\u3163', u'\u1d25', u'\u3147', u'\u3148', u'\u314f'
      ]
print automata("".join(data))


오늘 ㅏㅇ사ㅣ ㅇㅕ나ㅕㅇㅛㄱㅇㄹㅆ닿ㄹㄴ시ㅏ항곤ㅛ젆 샘ㄶㅡ니.젆ㅓ앉ㅏ뤝ㅡㄴ짉 읐?ㄴ히ㅇ

GENERATE "vocab.pkl" and "data.npy" in "data/nine_dreams/" FROM "data/nine_dreams/input.txt"


In [15]:
data_dir    = "data/nine_dreams"
batch_size  = 50
seq_length  = 50
data_loader = TextLoader(data_dir, batch_size, seq_length)


loading preprocessed files

DATA_LOADER IS:


In [16]:
print ( "type of 'data_loader' is %s, length is %d" 
       % (type(data_loader.vocab), len(data_loader.vocab)) )


type of 'data_loader' is <type 'dict'>, length is 76

DATA_LOADER.VOCAB IS:


In [17]:
print ("data_loader.vocab looks like \n%s " % (data_loader.vocab,))


data_loader.vocab looks like 
{u'_': 69, u'6': 59, u':': 57, u'\n': 19, u'4': 67, u'5': 63, u'>': 75, u'!': 52, u' ': 1, u'"': 28, u'\u1d25': 0, u"'": 49, u')': 46, u'(': 45, u'-': 65, u',': 27, u'.': 24, u'\u3131': 7, u'0': 73, u'\u3133': 60, u'\u3132': 29, u'\u3135': 50, u'\u3134': 4, u'\u3137': 13, u'\u3136': 44, u'\u3139': 5, u'\u3138': 32, u'\u313b': 55, u'\u313a': 48, u'\u313c': 54, u'?': 41, u'3': 66, u'\u3141': 12, u'\u3140': 51, u'\u3143': 47, u'\u3142': 17, u'\u3145': 10, u'\u3144': 43, u'\u3147': 2, u'\u3146': 22, u'\u3149': 40, u'\u3148': 15, u'\u314b': 42, u'\u314a': 23, u'\u314d': 31, u'\u314c': 30, u'\u314f': 3, u'\u314e': 14, u'\u3151': 34, u'\u3150': 21, u'\u3153': 11, u'\u3152': 74, u'\u3155': 18, u'\u3154': 20, u'\u3157': 9, u'\u3156': 39, u'\u3159': 53, u'\u3158': 26, u'\u315b': 38, u'\u315a': 33, u'\u315d': 36, u'\u315c': 16, u'\u315f': 35, u'\u315e': 61, u'\u3161': 8, u'\u3160': 37, u'\u3163': 6, u'\u3162': 25, u'\x1a': 72, u'9': 64, u'7': 71, u'2': 62, u'1': 58, u'\u313f': 56, u'\u313e': 70, u'8': 68} 

DATA_LOADER.CHARS IS:


In [18]:
print ( "type of 'data_loader.chars' is %s, length is %d" 
       % (type(data_loader.chars), len(data_loader.chars)) )


type of 'data_loader.chars' is <type 'tuple'>, length is 76

CHARS CONVERTS INDEX -> CHAR


In [19]:
print ("data_loader.chars looks like \n%s " % (data_loader.chars,))


data_loader.chars looks like 
(u'\u1d25', u' ', u'\u3147', u'\u314f', u'\u3134', u'\u3139', u'\u3163', u'\u3131', u'\u3161', u'\u3157', u'\u3145', u'\u3153', u'\u3141', u'\u3137', u'\u314e', u'\u3148', u'\u315c', u'\u3142', u'\u3155', u'\n', u'\u3154', u'\u3150', u'\u3146', u'\u314a', u'.', u'\u3162', u'\u3158', u',', u'"', u'\u3132', u'\u314c', u'\u314d', u'\u3138', u'\u315a', u'\u3151', u'\u315f', u'\u315d', u'\u3160', u'\u315b', u'\u3156', u'\u3149', u'?', u'\u314b', u'\u3144', u'\u3136', u'(', u')', u'\u3143', u'\u313a', u"'", u'\u3135', u'\u3140', u'!', u'\u3159', u'\u313c', u'\u313b', u'\u313f', u':', u'1', u'6', u'\u3133', u'\u315e', u'2', u'5', u'9', u'-', u'3', u'4', u'8', u'_', u'\u313e', u'7', u'\x1a', u'0', u'\u3152', u'>') 

In [20]:
for i, char in enumerate(data_loader.chars):
    # GET INDEX OF THE CHARACTER
    idx = data_loader.vocab[char]
    print ("[%02d] %03s (%02d)" 
           % (i, automata("".join(char)), idx))


[00]     (00)
[01]     (01)
[02]     (02)
[03]   ㅏ (03)
[04]     (04)
[05]     (05)
[06]   ㅣ (06)
[07]     (07)
[08]   ㅡ (08)
[09]   ㅗ (09)
[10]     (10)
[11]   ㅓ (11)
[12]     (12)
[13]     (13)
[14]     (14)
[15]     (15)
[16]   ㅜ (16)
[17]     (17)
[18]   ㅕ (18)
[19]   
 (19)
[20]   ㅔ (20)
[21]   ㅐ (21)
[22]     (22)
[23]     (23)
[24]   . (24)
[25]   ㅢ (25)
[26]   ㅘ (26)
[27]   , (27)
[28]   " (28)
[29]     (29)
[30]     (30)
[31]     (31)
[32]     (32)
[33]   ㅚ (33)
[34]   ㅑ (34)
[35]   ㅟ (35)
[36]   ㅝ (36)
[37]   ㅠ (37)
[38]   ㅛ (38)
[39]   ㅖ (39)
[40]     (40)
[41]   ? (41)
[42]     (42)
[43]   ㅄ (43)
[44]   ㄶ (44)
[45]   ( (45)
[46]   ) (46)
[47]     (47)
[48]   ㄺ (48)
[49]   ' (49)
[50]   ㄵ (50)
[51]   ㅀ (51)
[52]   ! (52)
[53]   ㅙ (53)
[54]   ㄼ (54)
[55]   ㄻ (55)
[56]   ㄿ (56)
[57]   : (57)
[58]   1 (58)
[59]   6 (59)
[60]   ㄳ (60)
[61]   ㅞ (61)
[62]   2 (62)
[63]   5 (63)
[64]   9 (64)
[65]   - (65)
[66]   3 (66)
[67]   4 (67)
[68]   8 (68)
[69]   _ (69)
[70]   ㄾ (70)
[71]   7 (71)
[72]    (72)
[73]   0 (73)
[74]   ㅒ (74)
[75]   > (75)

In [ ]:


In [ ]: