In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import nltk
# nltk.download('punkt')  # run once if the Punkt tokenizer models are missing
from konlpy.tag import Kkma
kor_tagger = Kkma()
In [2]:
english_data = "Hi, Kyle! Welcome to Retrica"
In [3]:
token = nltk.word_tokenize(english_data)
In [4]:
print(token)
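word_tokenize splits punctuation into separate tokens, so this should print something like ['Hi', ',', 'Kyle', '!', 'Welcome', 'to', 'Retrica'].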
In [5]:
korea_data = "안녕? 나는 파이토치를 공부하고 있어요."
In [6]:
token = kor_tagger.morphs(korea_data)
print(token)
In [7]:
# Kkma is a bit slow on its first call, but its tagging quality is decent
In [8]:
# Pos
kor_tagger.pos(korea_data)
Out[8]:
In [9]:
kor_tagger.sentences(korea_data)
Out[9]:
In [10]:
kor_tagger.nouns(korea_data)
Out[10]:
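For reference on the Kkma API used above: morphs() returns a flat list of morphemes, pos() returns (morpheme, tag) pairs, sentences() splits the input into sentences, and nouns() keeps only the nouns. The classifier below only needs morphs().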
In [11]:
from collections import OrderedDict, defaultdict
In [12]:
word2index = defaultdict()
for vo in token:
    if vo not in word2index:           # 0 is a valid index, so test membership, not truthiness
        word2index[vo] = len(word2index)
print(word2index)
In [13]:
def one_hot_encoding(word, word2index):
    tensor = torch.zeros(len(word2index))
    index = word2index[word]
    tensor[index] = 1.
    return tensor
In [14]:
torch_vector = one_hot_encoding("토치", word2index)
print(torch_vector)
In [15]:
py_vector = one_hot_encoding("파이", word2index)
py_vector.dot(torch_vector)
Out[15]:
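The dot product of any two distinct one-hot vectors is 0: every word is orthogonal to every other, so one-hot encoding carries no notion of similarity between words. A bag of words built from these vectors is still enough for the small classifier below.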
In [16]:
train_data = [["배고프다 밥줘","FOOD"],
["뭐 먹을만한거 없냐","FOOD"],
["맛집 추천","FOOD"],
["이 근처 맛있는 음식점 좀","FOOD"],
["밥줘","FOOD"],
["뭐 먹지?","FOOD"],
["삼겹살 먹고싶어","FOOD"],
["영화 보고싶다","MEDIA"],
["요즘 볼만한거 있어?","MEDIA"],
["영화나 예능 추천","MEDIA"],
["재밌는 드라마 보여줘","MEDIA"],
["신과 함께 줄거리 좀 알려줘","MEDIA"],
["고등랩퍼 다시보기 좀","MEDIA"],
["재밌는 영상 하이라이트만 보여줘","MEDIA"]]
test_data = [["쭈꾸미 맛집 좀 찾아줘","FOOD"],
["매콤한 떡볶이 먹고싶다","FOOD"],
["강남 씨지비 조조 영화 스케줄표 좀","MEDIA"],
["효리네 민박 보고싶엉","MEDIA"]]
In [17]:
train_X,train_y = list(zip(*train_data))
In [18]:
train_X
Out[18]:
In [19]:
train_y
Out[19]:
In [20]:
train_X = [kor_tagger.morphs(x) for x in train_X] # Tokenize
print(train_X)
In [21]:
word2index = {'<unk>': 0}   # reserve index 0 for out-of-vocabulary tokens
for x in train_X:
    for token in x:
        if token not in word2index:
            word2index[token] = len(word2index)
class2index = {'FOOD': 0, 'MEDIA': 1}
print(word2index)
print(class2index)
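The vocabulary is built from the training sentences only; reserving index 0 for '<unk>' gives every token that first appears at test time somewhere to go.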
In [22]:
def make_BoW(seq, word2index):
    tensor = torch.zeros(len(word2index))
    for w in seq:
        index = word2index.get(w)
        if index is None:               # unseen token: count it as <unk>
            index = word2index['<unk>']
        tensor[index] += 1.
    return tensor
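As a quick sanity check (a made-up example sentence; the exact split depends on Kkma), any morpheme missing from the training vocabulary should land on the '<unk>' count at index 0:
In [ ]:
# "쭈꾸미" never appears in the training data, so its count should fall into <unk>
bow = make_BoW(kor_tagger.morphs("쭈꾸미 맛집 추천"), word2index)
print(bow[word2index['<unk>']])   # expected to be nonzero
print(bow.sum())                  # total number of morphemes in the sentence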
In [23]:
train_X = torch.cat([make_BoW(x, word2index).view(1, -1) for x in train_X])
train_y = torch.cat([torch.LongTensor([class2index[y]]) for y in train_y])
In [24]:
train_X
Out[24]:
In [25]:
train_X.size()
Out[25]:
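This should come out as torch.Size([14, len(word2index)]): one bag-of-words row per training sentence.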
In [26]:
class BoWClassifier(nn.Module):
    def __init__(self, vocab_size, output_size):
        super(BoWClassifier, self).__init__()
        self.linear = nn.Linear(vocab_size, output_size)   # one linear layer over the BoW counts

    def forward(self, inputs):
        return self.linear(inputs)                         # returns raw logits
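forward() returns raw scores (logits) rather than probabilities on purpose: nn.CrossEntropyLoss combines log-softmax and negative log-likelihood internally, so no activation is needed in the model.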
In [27]:
STEP = 100
LR = 0.1
model = BoWClassifier(len(word2index), 2)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LR)
In [28]:
for step in range(STEP):
    model.zero_grad()
    preds = model(train_X)
    loss = loss_function(preds, train_y)
    if step % 10 == 0:
        print(loss.item())      # report the training loss every 10 steps
    loss.backward()
    optimizer.step()
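Each step is a full-batch update over all 14 training examples, so with LR = 0.1 the loss printed every 10 steps should drop quickly toward zero.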
In [29]:
# test
index2class = {v:k for k,v in class2index.items()}
In [30]:
for test in test_data:
    X = kor_tagger.morphs(test[0])
    X = make_BoW(X, word2index).view(1, -1)
    pred = model(X)
    pred = pred.max(1)[1].item()   # index of the highest-scoring class
    print("Input : %s" % test[0])
    print("Prediction : %s" % index2class[pred])
    print("Truth : %s" % test[1])
    print("\n")
In [ ]: