In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (14, 6)
from local_settings import settings, datautils
In [2]:
data = pd.read_csv(settings.ZHNEWS_CSV, names=['label', 'title', 'content'])
data.head()
Out[2]:
In [3]:
sns.countplot(x=data['label'])
Out[3]:
In [4]:
chars = 'abcdefghijklmnopqrstuvwxyz-,;!?:\'\\|_@#$%^&*~`+-=<>()[]{} '
char_to_index = {char:i for i, char in enumerate(chars)}
index_to_char = {i: char for i, char in enumerate(chars)}
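A quick sanity check (added here as a sketch, not part of the original run): the two lookup tables should invert each other. Note that '-' appears twice in chars, so char_to_index holds one entry fewer than len(chars).
In [ ]:
# sketch: confirm the two lookup tables are mutually consistent
assert all(index_to_char[i] == c for c, i in char_to_index.items())
len(chars), len(char_to_index)  # differ by one because '-' is duplicated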
In [5]:
maxlen = int(max(data['title'].apply(len)))
maxlen
Out[5]:
207
In [6]:
def encode_input(title, maxlen=207):
    """One-hot encode a title as a (vocab_size, maxlen) matrix."""
    title = title.lower().strip()
    encoding = np.zeros((len(chars), maxlen), dtype=np.int64)
    for i, char in enumerate(title[:maxlen]):
        index = char_to_index.get(char)
        if index is not None:  # skip characters outside the vocabulary
            encoding[index, i] = 1
    return encoding
In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
In [8]:
encode_input('Brian')
Out[8]:
In [9]:
encode_input('Brian').shape
Out[9]:
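index_to_char is built above but never used later; a minimal decoding sketch (an addition, assuming at most one active row per column) shows how to invert the encoding:
In [ ]:
def decode_input(encoding):
    # read off the active row in each column until padding (an all-zero column)
    out = []
    for col in encoding.T:
        hits = np.flatnonzero(col)
        if len(hits) == 0:  # padding, or a character missing from the vocabulary
            break
        out.append(index_to_char[int(hits[0])])
    return ''.join(out)

decode_input(encode_input('Brian'))  # 'brian', since encode_input lower-cases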
In [10]:
from torch.utils.data import Dataset, DataLoader
In [11]:
class SogouNews(Dataset):
    """Sogou News dataset: titles only, labels shifted from 1-indexed to 0-indexed."""
    def __init__(self, data_path):
        self.data = pd.read_csv(data_path, names=['label', 'title', 'content']).dropna()
        # reset the index so the positional lookups below still work after dropna()
        self.data = self.data.reset_index(drop=True)
        del self.data['content']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        content = torch.from_numpy(encode_input(self.data['title'][index])).float()
        label = self.data['label'][index] - 1  # labels are 1..5 in the CSV
        sample = {'X': content, 'y': label}
        return sample
In [12]:
sogou_dataset = SogouNews(settings.ZHNEWS_CSV)
In [13]:
dataloader = DataLoader(sogou_dataset, batch_size=32, shuffle=True, num_workers=0)
In [14]:
test_batch = next(iter(dataloader))
test_batch['X'][0]
Out[14]:
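nn.Conv1d expects input shaped (batch, channels, length), which is exactly why encode_input puts the vocabulary on the first axis of each matrix. A quick shape check (added as a sketch):
In [ ]:
test_batch['X'].shape, test_batch['y'].shape  # (batch_size, len(chars), maxlen) and (batch_size,)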
In [15]:
class CharCNN(nn.Module):
    def __init__(self, n_classes, vocab_size, max_seq_length, channel_size=128, pool_size=5):
        super(CharCNN, self).__init__()
        self.conv_stack = nn.ModuleList([
            nn.Conv1d(vocab_size, channel_size, 7),
            nn.ReLU(),
            nn.BatchNorm1d(num_features=channel_size),
            nn.MaxPool1d(pool_size),
            nn.Conv1d(channel_size, channel_size, 3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(num_features=channel_size),
            nn.MaxPool1d(pool_size)
        ])
        self.dropout1 = nn.Dropout(p=0.5)
        # with maxlen=207: conv(k=7) -> 201, pool(5) -> 40, conv(k=3, pad=1) -> 40,
        # pool(5) -> 8, so the flattened size is 128 channels * 8 = 1024
        self.output = nn.Linear(1024, n_classes)

    def forward(self, x):
        for op in self.conv_stack:
            x = op(x)
        x = x.view(x.size(0), -1)
        x = self.dropout1(x)
        x = self.output(x)
        return x
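Before training, a forward pass on random input is a cheap way to verify the hard-coded Linear(1024, ...) size: with maxlen=207 the conv/pool stack leaves 8 positions of 128 channels, i.e. 1024 features. This cell is a sketch added for illustration:
In [ ]:
# sketch: push a random batch of 4 "titles" through an untrained model
m = CharCNN(n_classes=5, vocab_size=len(chars), max_seq_length=207)
m.eval()  # use BatchNorm running stats so the check is deterministic
dummy = torch.randn(4, len(chars), 207)
m(dummy).shape  # expected: torch.Size([4, 5])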
In [16]:
criterion = nn.CrossEntropyLoss()
In [17]:
from tqdm.notebook import tqdm
In [18]:
def train(model, dataloader, num_epochs, loss_history):
    cuda = torch.cuda.is_available()
    if cuda:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters())
    bar = tqdm(total=len(dataloader) * num_epochs)
    for i in range(num_epochs):
        per_epoch_losses = []
        for batch in dataloader:
            X = batch['X']
            y = batch['y']
            if cuda:
                X = X.cuda()
                y = y.cuda()
            model.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            per_epoch_losses.append(loss.item())
            bar.set_postfix(loss=loss.item())
            bar.update(1)
        epoch_loss = np.mean(per_epoch_losses)
        loss_history.append(epoch_loss)
        print('epoch[%d] loss: %.4f' % (i, epoch_loss))
    return loss_history
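The notebook trains but never measures accuracy; below is a minimal evaluation sketch (an addition; it reuses the training dataloader, since no separate test split is loaded here):
In [ ]:
def evaluate(model, dataloader):
    # sketch: fraction of correct argmax predictions over a dataloader
    cuda = torch.cuda.is_available()
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for batch in dataloader:
            X, y = batch['X'], batch['y']
            if cuda:
                X, y = X.cuda(), y.cuda()
            preds = model(X).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    model.train()
    return correct / total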
In [19]:
charcnn = CharCNN(n_classes=5, vocab_size=len(chars), max_seq_length=maxlen)
In [20]:
loss_history = []
In [21]:
try:
    train(charcnn, dataloader, 100, loss_history)
except KeyboardInterrupt:
    print("...")
In [22]:
plt.plot(loss_history);
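Finally, a small single-title helper (a sketch, not in the original). It returns the 0-indexed class, matching the label - 1 shift applied in SogouNews:
In [ ]:
def predict(model, title):
    # sketch: encode one title and take the argmax class
    model.eval()
    x = torch.from_numpy(encode_input(title)).float().unsqueeze(0)  # (1, vocab, maxlen)
    if next(model.parameters()).is_cuda:
        x = x.cuda()
    with torch.no_grad():
        return model(x).argmax(dim=1).item()

predict(charcnn, data['title'][0])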