In [9]:
ls data
In [13]:
!wget https://raw.githubusercontent.com/yunjey/pytorch-tutorial/master/tutorials/02-intermediate/language_model/data/train.txt -P data
In [14]:
less data/train.txt
In [30]:
import os

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        return len(self.word2idx)
In [31]:
d = Dictionary()
d.add_word('Me')
d.add_word('Hello')
print(d.word2idx)
print(d.idx2word)
print(len(d))
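As a quick aside (not a cell from the original notebook), once every word of a tokenized sentence has been added, the dictionary can map the sentence to a list of indices; repeated words reuse the same index. A minimal sketch with made-up tokens:
In [ ]:
# Hypothetical example: encode a toy sentence with Dictionary
# (indices are assigned in order of first appearance)
d2 = Dictionary()
sentence = 'hello world hello <eos>'.split()
for w in sentence:
    d2.add_word(w)
print([d2.word2idx[w] for w in sentence])   # [0, 1, 0, 2]
print(len(d2))                              # 3 unique words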
In [84]:
import torch

class Corpus(object):
    def __init__(self, path='./data'):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        # add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)
        # tokenize the file content
        ids = torch.LongTensor(tokens)
        token = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1
        # truncate so the total number of tokens is divisible by the batch size
        num_batches = ids.size(0) // batch_size
        ids = ids[:num_batches * batch_size]
        return ids.view(batch_size, -1)
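To make the last three lines of get_data concrete, here is a standalone toy sketch (invented numbers, not the PTB stream): a flat run of 11 token ids with a batch size of 2 drops the trailing token and is reshaped into 2 rows of 5, which is exactly the (batch_size, -1) layout the training loop later slices along dim 1.
In [ ]:
# Toy illustration of the truncate-and-reshape step in get_data (assumed values)
import torch
toy_ids = torch.LongTensor(list(range(11)))            # pretend token stream of 11 ids
toy_batch_size = 2
toy_num_batches = toy_ids.size(0) // toy_batch_size    # 5
toy_ids = toy_ids[:toy_num_batches * toy_batch_size]   # keep 10 ids, drop the last one
print(toy_ids.view(toy_batch_size, -1))                # rows: [0..4] and [5..9]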
In [89]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
#from data_utils import Dictionary, Corpus
# hyperparameters
embed_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 5
num_samples = 1000
batch_size = 20
seq_length = 30
learning_rate = 0.002
# Load Penn Treebank Dataset
train_path = './data/train.txt'
sample_path = './sample.txt'
corpus = Corpus()
# the corpus converted to word indices
ids = corpus.get_data(train_path, batch_size) # (20, 46479)
vocab_size = len(corpus.dictionary) # 10000
num_batches = ids.size(1) // seq_length # 1549
print('ids:', ids.size())
print('vocab_size:', vocab_size)
print('num_batches:', num_batches)
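A quick check of the numbers printed above, using the figures from the inline comments: the truncated token stream holds 20 × 46479 tokens, and each of the 20 rows yields 46479 // 30 = 1549 non-overlapping windows of length seq_length.
In [ ]:
# Arithmetic behind the shapes noted in the comments above
print(20 * 46479)     # 929580 tokens kept after truncation
print(46479 // 30)    # 1549 length-30 windows per row (num_batches)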
In [124]:
# RNN based language model
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weight()

    def init_weight(self):
        self.embed.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)
        self.linear.weight.data.uniform_(-0.1, 0.1)

    def forward(self, x, h):  # x: (batch_size, seq_length) = (20, 30)
        print('x:', x.size())
        print('h:', h[0].size())
        print('c:', h[1].size())
        # embed word ids to vectors
        x = self.embed(x)  # (20, 30, 128)
        print('embed:', x.size())
        # forward propagate RNN
        out, h = self.lstm(x, h)  # (20, 30, 1024)
        # flatten to (batch_size * seq_length, hidden_size) so the linear layer
        # scores every time step against the whole vocabulary
        out = out.contiguous().view(out.size(0) * out.size(1), out.size(2))
        out = self.linear(out)  # (600, vocab_size)
        return out, h
In [125]:
model = RNNLM(vocab_size, embed_size, hidden_size, num_layers)
#model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
In [126]:
inputs = Variable(ids[:, 0:30])
targets = Variable(ids[:, 1:31])
states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)),
Variable(torch.zeros(num_layers, batch_size, hidden_size)))
model(inputs, states)
Out[126]:
In [ ]:
# training
def detach(states):
    # truncated backpropagation through time: cut the graph so gradients
    # do not flow back beyond the current sequence chunk
    return [state.detach() for state in states]

for epoch in range(num_epochs):
    # initial hidden and memory states
    states = (Variable(torch.zeros(num_layers, batch_size, hidden_size)),
              Variable(torch.zeros(num_layers, batch_size, hidden_size)))

    for i in range(0, ids.size(1) - seq_length, seq_length):
        # get batch inputs and targets
        # the target sequence is the input sequence shifted by one word
        # in: [0:30], out: [1:31]
        # in: [1:31], out: [2:32]
        inputs = Variable(ids[:, i:i+seq_length])
        targets = Variable(ids[:, (i+1):(i+1)+seq_length])

        model.zero_grad()
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.contiguous().view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
        optimizer.step()
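The hyperparameter cell defines num_samples and sample_path, but this notebook never uses them. Below is a hedged sketch of what a sampling cell could look like under the same old-style Variable API; the random start word and multinomial sampling from the softmax are assumptions, not something shown above, and the debug prints inside forward will fire on every sampled word.
In [ ]:
# Hedged sketch (not from the original notebook): generate num_samples words with the
# trained model and write them to sample_path. Sampling scheme is an assumption.
import torch.nn.functional as F

state = (Variable(torch.zeros(num_layers, 1, hidden_size)),
         Variable(torch.zeros(num_layers, 1, hidden_size)))

# start from a word id drawn uniformly at random
prob = torch.ones(vocab_size)
inp = Variable(torch.multinomial(prob, 1).unsqueeze(1))   # shape (1, 1)

with open(sample_path, 'w') as f:
    for _ in range(num_samples):
        output, state = model(inp, state)                      # output: (1, vocab_size)
        word_weights = F.softmax(output, dim=1).data.squeeze()
        word_id = int(torch.multinomial(word_weights, 1)[0])   # sample the next word id
        inp.data.fill_(word_id)                                # feed it back as the next input
        word = corpus.dictionary.idx2word[word_id]
        f.write('\n' if word == '<eos>' else word + ' ')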