In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable  # pre-0.4 wrapper; Variable was merged into Tensor in PyTorch 0.4+
import torch.optim as optim
import torch.nn.functional as F

import nltk
from konlpy.tag import Kkma
kor_tagger = Kkma()

Tokenize


In [2]:
english_data = "Hi, Kyle! Welcome to Retrica"

In [3]:
token = nltk.word_tokenize(english_data)

In [4]:
print(token)


['Hi', ',', 'Kyle', '!', 'Welcome', 'to', 'Retrica']
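
Note: word_tokenize relies on NLTK's punkt model; if the call raises a LookupError, run nltk.download('punkt') once beforehand.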

In [5]:
korea_data = "안녕? 나는 파이토치를 공부하고 있어요."

In [6]:
token = kor_tagger.morphs(korea_data)
print(token)


['안녕', '?', '나', '는', '파이', '토치', '를', '공부', '하', '고', '있', '어요', '.']

In [7]:
# Kkma is a bit slow the first time (it loads its dictionary), but its tagging quality is decent

In [8]:
# Pos
kor_tagger.pos(korea_data)


Out[8]:
[('안녕', 'NNG'),
 ('?', 'SF'),
 ('나', 'NP'),
 ('는', 'JX'),
 ('파이', 'NNG'),
 ('토치', 'NNG'),
 ('를', 'JKO'),
 ('공부', 'NNG'),
 ('하', 'XSV'),
 ('고', 'ECE'),
 ('있', 'VXV'),
 ('어요', 'EFN'),
 ('.', 'SF')]
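
Kkma's tags follow Sejong-style conventions: for example, NNG is a general noun, NP a pronoun, JX an auxiliary particle, JKO an object-case particle, and SF sentence-final punctuation.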

In [9]:
kor_tagger.sentences(korea_data)


Out[9]:
['안녕? 나는 파이 토치를 공부하고 있어요.']

In [10]:
kor_tagger.nouns(korea_data)


Out[10]:
['안녕', '나', '파이', '파이토치', '토치', '공부']

Build Vocab



In [12]:
word2index = {}
for vo in token:
    if vo not in word2index:  # index each morph only on its first occurrence
        word2index[vo] = len(word2index)
print(word2index)


{'안녕': 0, '?': 1, '나': 2, '는': 3, '파이': 4, '토치': 5, '를': 6, '공부': 7, '하': 8, '고': 9, '있': 10, '어요': 11, '.': 12}

One-Hot Encoding


In [13]:
def one_hot_encoding(word, word2index):
    tensor = torch.zeros(len(word2index))  # vocabulary-sized zero vector
    index = word2index[word]
    tensor[index] = 1.                     # flip on the word's slot
    return tensor

In [14]:
torch_vector = one_hot_encoding("토치", word2index)
print(torch_vector)


 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 13]


In [15]:
py_vector = one_hot_encoding("파이", word2index)
py_vector.dot(torch_vector)


Out[15]:
0.0
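
The dot product is zero: distinct words get orthogonal one-hot vectors, so a one-hot encoding carries no notion of similarity between words.

Bag-of-Words Classifier
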

In [16]:
train_data = [["배고프다 밥줘","FOOD"],
                ["뭐 먹을만한거 없냐","FOOD"],
                ["맛집 추천","FOOD"],
                ["이 근처 맛있는 음식점 좀","FOOD"],
                ["밥줘","FOOD"],
                ["뭐 먹지?","FOOD"],
                ["삼겹살 먹고싶어","FOOD"],
                ["영화 보고싶다","MEDIA"],
                ["요즘 볼만한거 있어?","MEDIA"],
                ["영화나 예능 추천","MEDIA"],
                ["재밌는 드라마 보여줘","MEDIA"],
                ["신과 함께 줄거리 좀 알려줘","MEDIA"],
                ["고등랩퍼 다시보기 좀","MEDIA"],
                ["재밌는 영상 하이라이트만 보여줘","MEDIA"]]

test_data = [["쭈꾸미 맛집 좀 찾아줘","FOOD"],
               ["매콤한 떡볶이 먹고싶다","FOOD"],
               ["강남 씨지비 조조 영화 스케줄표 좀","MEDIA"],
               ["효리네 민박 보고싶엉","MEDIA"]]

In [17]:
train_X, train_y = list(zip(*train_data))

In [18]:
train_X


Out[18]:
('배고프다 밥줘',
 '뭐 먹을만한거 없냐',
 '맛집 추천',
 '이 근처 맛있는 음식점 좀',
 '밥줘',
 '뭐 먹지?',
 '삼겹살 먹고싶어',
 '영화 보고싶다',
 '요즘 볼만한거 있어?',
 '영화나 예능 추천',
 '재밌는 드라마 보여줘',
 '신과 함께 줄거리 좀 알려줘',
 '고등랩퍼 다시보기 좀',
 '재밌는 영상 하이라이트만 보여줘')

In [19]:
train_y


Out[19]:
('FOOD',
 'FOOD',
 'FOOD',
 'FOOD',
 'FOOD',
 'FOOD',
 'FOOD',
 'MEDIA',
 'MEDIA',
 'MEDIA',
 'MEDIA',
 'MEDIA',
 'MEDIA',
 'MEDIA')

In [20]:
train_X = [kor_tagger.morphs(x) for x in train_X] # Tokenize
print(train_X)


[['배고프', '다', '밥', '주', '어'], ['뭐', '먹', '을', '만하', 'ㄴ', '거', '없', '냐'], ['맛', '집', '추천'], ['이', '근처', '맛있', '는', '음식', '점', '좀'], ['밥', '주', '어'], ['뭐', '먹', '지', '?'], ['삼겹살', '먹', '고', '싶', '어'], ['영화', '보', '고', '싶', '다'], ['요즘', '볼만', '하', 'ㄴ', '거', '있', '어', '?'], ['영화', '나', '예능', '추천'], ['재밌', '는', '드라마', '보여주', '어'], ['신', '과', '함께', '줄거리', '좀', '알려주', '어'], ['고등', '랩', '푸', '어', '다시', '보', '기', '좀'], ['재밌', '는', '영상', '하이라이트', '만', '보여주', '어']]

In [21]:
word2index = {'<unk>': 0}  # reserve index 0 for out-of-vocabulary tokens
for x in train_X:
    for token in x:
        if token not in word2index:
            word2index[token] = len(word2index)
            
class2index = {'FOOD' : 0, 'MEDIA' : 1}
print(word2index)
print(class2index)


{'<unk>': 0, '배고프': 1, '다': 2, '밥': 3, '주': 4, '어': 5, '뭐': 6, '먹': 7, '을': 8, '만하': 9, 'ㄴ': 10, '거': 11, '없': 12, '냐': 13, '맛': 14, '집': 15, '추천': 16, '이': 17, '근처': 18, '맛있': 19, '는': 20, '음식': 21, '점': 22, '좀': 23, '지': 24, '?': 25, '삼겹살': 26, '고': 27, '싶': 28, '영화': 29, '보': 30, '요즘': 31, '볼만': 32, '하': 33, '있': 34, '나': 35, '예능': 36, '재밌': 37, '드라마': 38, '보여주': 39, '신': 40, '과': 41, '함께': 42, '줄거리': 43, '알려주': 44, '고등': 45, '랩': 46, '푸': 47, '다시': 48, '기': 49, '영상': 50, '하이라이트': 51, '만': 52}
{'FOOD': 0, 'MEDIA': 1}

In [22]:
def make_BoW(seq, word2index):
    tensor = torch.zeros(len(word2index))  # count vector over the vocabulary
    for w in seq:
        index = word2index.get(w)
        if index is None:                  # unseen token -> <unk>
            index = word2index['<unk>']
        tensor[index] += 1.
    return tensor
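
As a quick sanity check (a hypothetical call, not part of the original run), an unseen token such as '몰라' falls back to the <unk> slot:

make_BoW(['밥', '주', '몰라'], word2index)  # counts at the 밥 and 주 indices, plus one at index 0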

In [23]:
train_X = torch.cat([Variable(make_BoW(x, word2index)).view(1, -1) for x in train_X])
train_y = torch.cat([Variable(torch.LongTensor([class2index[y]])) for y in train_y])
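
torch.cat stacks the fourteen 1 x 53 row vectors into a single 14 x 53 matrix, one bag-of-words row per training sentence.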

In [24]:
train_X


Out[24]:
Variable containing:

Columns 0 to 12 
    0     1     1     1     1     1     0     0     0     0     0     0     0
    0     0     0     0     0     0     1     1     1     1     1     1     1
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     1     1     1     0     0     0     0     0     0     0
    0     0     0     0     0     0     1     1     0     0     0     0     0
    0     0     0     0     0     1     0     1     0     0     0     0     0
    0     0     1     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     1     0     0     0     0     1     1     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     1     0     0     0     0     0     0     0
    0     0     0     0     0     1     0     0     0     0     0     0     0
    0     0     0     0     0     1     0     0     0     0     0     0     0
    0     0     0     0     0     1     0     0     0     0     0     0     0

Columns 13 to 25 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    1     0     0     0     0     0     0     0     0     0     0     0     0
    0     1     1     1     0     0     0     0     0     0     0     0     0
    0     0     0     0     1     1     1     1     1     1     1     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     1     1
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     1
    0     0     0     1     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     1     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     1     0     0
    0     0     0     0     0     0     0     0     0     0     1     0     0
    0     0     0     0     0     0     0     1     0     0     0     0     0

Columns 26 to 38 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    1     1     1     0     0     0     0     0     0     0     0     0     0
    0     1     1     1     1     0     0     0     0     0     0     0     0
    0     0     0     0     0     1     1     1     1     0     0     0     0
    0     0     0     1     0     0     0     0     0     1     1     0     0
    0     0     0     0     0     0     0     0     0     0     0     1     1
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     1     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     1     0

Columns 39 to 51 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    1     0     0     0     0     0     0     0     0     0     0     0     0
    0     1     1     1     1     1     0     0     0     0     0     0     0
    0     0     0     0     0     0     1     1     1     1     1     0     0
    1     0     0     0     0     0     0     0     0     0     0     1     1

Columns 52 to 52 
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
    1
[torch.FloatTensor of size 14x53]

In [25]:
train_X.size()


Out[25]:
torch.Size([14, 53])

In [26]:
class BoWClassifier(nn.Module):
    def __init__(self, vocab_size, output_size):
        super(BoWClassifier, self).__init__()
        # a single linear layer: BoW counts -> per-class scores
        self.linear = nn.Linear(vocab_size, output_size)

    def forward(self, inputs):
        return self.linear(inputs)
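
The model is a single affine map from the 53-dimensional BoW space to 2 class scores. forward returns raw logits; no softmax is needed because nn.CrossEntropyLoss applies log-softmax internally.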

In [27]:
STEP = 100
LR = 0.1
model = BoWClassifier(len(word2index), 2)  # |V| inputs, 2 classes
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LR)

In [28]:
for step in range(STEP):
    model.zero_grad()                      # clear accumulated gradients
    preds = model(train_X)                 # full-batch forward pass
    loss = loss_function(preds, train_y)
    if step % 10 == 0:
        print(loss.data[0])                # loss.item() in PyTorch >= 0.4
    loss.backward()
    optimizer.step()


0.7329007387161255
0.5580320954322815
0.4465262293815613
0.36951833963394165
0.31345129013061523
0.2710380554199219
0.2379886358976364
0.21161293983459473
0.19014276564121246
0.17237167060375214
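
The loss drops steadily from about 0.73 to 0.17 over 100 full-batch SGD steps, so the single linear layer separates the two intents on the training set without trouble.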

In [29]:
# Evaluate on the held-out test queries
index2class = {v: k for k, v in class2index.items()}  # invert the label mapping

In [30]:
for test in test_data:
    X = kor_tagger.morphs(test[0])                     # tokenize the query
    X = Variable(make_BoW(X, word2index)).view(1, -1)  # 1 x |V| BoW vector

    pred = model(X)
    pred = pred.max(1)[1].data[0]                      # argmax over class scores
    print("Input : %s" % test[0])
    print("Prediction : %s" % index2class[pred])
    print("Truth : %s" % test[1])
    print("\n")


Input : 쭈꾸미 맛집 좀 찾아줘
Prediction : FOOD
Truth : FOOD


Input : 매콤한 떡볶이 먹고싶다
Prediction : FOOD
Truth : FOOD


Input : 강남 씨지비 조조 영화 스케줄표 좀
Prediction : MEDIA
Truth : MEDIA


Input : 효리네 민박 보고싶엉
Prediction : MEDIA
Truth : MEDIA
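
All four held-out queries are classified correctly even though words like 쭈꾸미 and 민박 never occur in training: novel morphemes are mapped to <unk>, and the decision rides on morphemes shared with the training set, such as 맛, 집, 먹, 영화, and 보.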


