Chinese Document Classification


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (14, 6)

from local_settings import settings, datautils

Overview of Task

The goal is to classify Sogou news titles (romanized as pinyin) into one of five categories with a character-level CNN, working through the steps below.

1. Load data


In [2]:
data = pd.read_csv(settings.ZHNEWS_CSV, names=['label', 'title', 'content'])
data.head()


Out[2]:
label title content
0 1 ti3 ca1o shi4 jie4 be1i : che2ng fe1i na2 pi2... su4 du4 : ( shuo1 mi2ng : dia3n ji1 zi4 do4ng ...
1 4 da3o ha2ng du2 jia1 ti2 go1ng me3i ri4 ba4o jia4 \n re4 ...
2 1 wa3ng yi4 ti3 yu4 gu3n do4ng tu2 ji2 \n be3n tu2 ji2 go4ng 7 zh...
3 3 zi1 lia4o tu2 pia4n : dia4n shi4 ju4 < fu2 gu... wa3ng ye4 \n bu4 zhi1 chi2 Flash\n xi1n la4n...
4 2 niu3 yua2n dui4 me3i yua2n : ku4 lu2n jia3ng ... xi1n xi1 la2n ca2i cha2ng ku4 lu2n fa1 bia3o j...

In [3]:
sns.countplot(data['label'])


Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8cd8ae3828>

2. Build vocab


In [4]:
chars = 'abcdefghijklmnopqrstuvwxyz-,;!?:\'\\|_@#$%^&*~`+-=<>()[]{} '
char_to_index = {char:i for i, char in enumerate(chars)}
index_to_char = {i: char for i, char in enumerate(chars)}
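
A quick sanity check (a sketch, not part of the original flow): the mapping is purely positional, and any character outside chars, including the pinyin tone digits that appear in the titles, has no index, so the encoding step below simply skips it.

len(chars)                  # 57 distinct characters
char_to_index['a'], char_to_index[' ']   # -> (0, 56)
'3' in char_to_index        # False: tone digits are not in the vocabulary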

3. Find max sequence length


In [5]:
maxlen = int(max(data['title'].apply(len)))
maxlen


Out[5]:
207

4. Convert sequences to Tensors


In [6]:
def encode_input(title, maxlen=207):
    title = title.lower().strip()
    encoding = np.zeros((len(chars), maxlen), dtype=np.int64)
    for i, char in enumerate(title[:maxlen]):
        index = char_to_index.get(char)
        if index is not None:   # characters outside the vocabulary are skipped
            encoding[index, i] = 1
    return encoding

In [7]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

In [8]:
encode_input('Brian')


Out[8]:
array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
encode_input('Brian').shape


Out[9]:
(57, 207)
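
To confirm the one-hot layout (rows index characters, columns index positions), the nonzero rows can be decoded back through index_to_char. A small sketch:

enc = encode_input('Brian')
rows, cols = np.nonzero(enc)
[index_to_char[int(r)] for r in rows[np.argsort(cols)]]   # -> ['b', 'r', 'i', 'a', 'n']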

5. Build PyTorch Dataset and DataLoader


In [10]:
from torch.utils.data import Dataset, DataLoader

In [11]:
class SogouNews(Dataset):
    """Sogou News dataset"""
    
    def __init__(self, data_path):
        # dropna() leaves gaps in the index; reset it so positional lookups in __getitem__ stay valid
        self.data = (pd.read_csv(data_path, names=['label', 'title', 'content'])
                       .dropna()
                       .reset_index(drop=True))
        del self.data['content']
        self.X = self.data['title']
        self.y = self.data['label']
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        content = torch.from_numpy(encode_input(self.data['title'][index])).float()
        label = self.data['label'][index] - 1   # CSV labels are 1..5; shift to 0..4 for CrossEntropyLoss
        sample = {'X': content, 'y': label}
        return sample

In [12]:
sogou_dataset = SogouNews(settings.ZHNEWS_CSV)

In [13]:
dataloader = DataLoader(sogou_dataset, batch_size=32, shuffle=True, num_workers=0)

In [14]:
test_batch = next(iter(dataloader))
test_batch['X'][0]


Out[14]:
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 57x207]
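
A quick shape check on the batch (sketch): with batch_size=32, a 57-character vocabulary, and maxlen 207, the inputs should be 32 x 57 x 207 and the labels a vector of 32 class indices in 0..4.

test_batch['X'].size()   # torch.Size([32, 57, 207])
test_batch['y'].size()   # torch.Size([32])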

Define Model


In [15]:
class CharCNN(nn.Module):    
    def __init__(self, n_classes, vocab_size, max_seq_length, channel_size=128, pool_size=5):
        
        super(CharCNN, self).__init__()
    
        self.conv_stack = nn.ModuleList([nn.Conv1d(vocab_size, channel_size, 7), 
                                         nn.ReLU(),
                                         nn.BatchNorm1d(num_features=channel_size),
                                         nn.MaxPool1d(pool_size),
                                         nn.Conv1d(channel_size, channel_size, 3, padding=1),
                                         nn.ReLU(),
                                         nn.BatchNorm1d(num_features=channel_size),
                                         nn.MaxPool1d(pool_size)])
        self.dropout1 = nn.Dropout(p=0.5)
        self.output = nn.Linear(1024, n_classes)   # 128 channels * 8 positions left after the conv/pool stack
        
        
    def forward(self, x):
        for op in self.conv_stack:
            x = op(x)
        
        x = x.view(x.size(0),-1)
        x = self.dropout1(x)
        x = self.output(x)
        return x
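
Why nn.Linear(1024, n_classes): with maxlen = 207, the stack maps 207 -> 201 (conv, kernel 7) -> 40 (max-pool 5) -> 40 (conv, kernel 3, padding 1) -> 8 (max-pool 5), so the flattened features are 128 * 8 = 1024. A throwaway forward pass through the conv stack on a dummy zero batch (a sketch, using a temporary model instance m) confirms this:

m = CharCNN(n_classes=5, vocab_size=len(chars), max_seq_length=maxlen)
feats = Variable(torch.zeros(2, len(chars), maxlen))
for op in m.conv_stack:
    feats = op(feats)
feats.size()   # torch.Size([2, 128, 8]) -> 128 * 8 = 1024 after flattening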

Define loss


In [16]:
criterion = nn.CrossEntropyLoss()

In [17]:
from tqdm import tqdm_notebook

In [18]:
def train(model, dataloader, num_epochs, loss_history):
    cuda = torch.cuda.is_available()
    if cuda:
        model.cuda()
    optimizer = torch.optim.Adam(model.parameters())
    bar = tqdm_notebook(total=num_epochs * len(dataloader))
    for i in range(num_epochs):
        per_epoch_losses = []
        for batch in dataloader:
            X = Variable(batch['X'])
            y = Variable(batch['y'])
            if cuda:
                X = X.cuda()
                y = y.cuda()
            model.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            per_epoch_losses.append(loss.data[0])
            bar.set_postfix(loss=loss.data[0])
            bar.update(1)
        loss_history.append(np.mean(per_epoch_losses))
        print('epoch[%d] loss: %.4f' % (i, loss.data[0]))
    return loss_history

In [19]:
charcnn = CharCNN(n_classes=5, vocab_size=len(chars), max_seq_length=maxlen)

In [20]:
loss_history = []

In [21]:
try:
    train(charcnn, dataloader, 100, loss_history)
except KeyboardInterrupt:
    print("...")


epoch[0] loss: 1.0125
epoch[1] loss: 0.6316
epoch[2] loss: 0.3102
epoch[3] loss: 0.3798
epoch[4] loss: 0.4475
epoch[5] loss: 0.4318
epoch[6] loss: 0.4616
epoch[7] loss: 0.2947
epoch[8] loss: 0.3853
epoch[9] loss: 0.5907
epoch[10] loss: 0.2385
epoch[11] loss: 0.3859
epoch[12] loss: 0.5880
epoch[13] loss: 0.0821
epoch[14] loss: 0.1574
epoch[15] loss: 0.2979
epoch[16] loss: 0.1919
epoch[17] loss: 0.0421
epoch[18] loss: 0.2789
epoch[19] loss: 0.0759
epoch[20] loss: 0.2457
epoch[21] loss: 0.1539
epoch[22] loss: 0.2495
epoch[23] loss: 0.2738
epoch[24] loss: 0.1787
epoch[25] loss: 0.1818
epoch[26] loss: 0.1908
epoch[27] loss: 0.1909
epoch[28] loss: 0.1800
epoch[29] loss: 0.2165
epoch[30] loss: 0.1303
epoch[31] loss: 0.0821
epoch[32] loss: 0.1988
epoch[33] loss: 0.0734
epoch[34] loss: 0.0995
epoch[35] loss: 0.3799
epoch[36] loss: 0.1245
epoch[37] loss: 0.2139
epoch[38] loss: 0.3832
epoch[39] loss: 0.0434
epoch[40] loss: 0.1631
epoch[41] loss: 0.1988
epoch[42] loss: 0.0680
epoch[43] loss: 0.1036
epoch[44] loss: 0.1608
epoch[45] loss: 0.0424
epoch[46] loss: 0.1204
epoch[47] loss: 0.0638
epoch[48] loss: 0.0178
epoch[49] loss: 0.0133
epoch[50] loss: 0.0336
epoch[51] loss: 0.2700
epoch[52] loss: 0.2089
epoch[53] loss: 0.2930
epoch[54] loss: 0.0989
epoch[55] loss: 0.0273
epoch[56] loss: 0.0835
epoch[57] loss: 0.2447
epoch[58] loss: 0.1637
epoch[59] loss: 0.1404
epoch[60] loss: 0.2227
epoch[61] loss: 0.0680
epoch[62] loss: 0.2013
epoch[63] loss: 0.1672
epoch[64] loss: 0.1086
epoch[65] loss: 0.1863
epoch[66] loss: 0.2671
epoch[67] loss: 0.0799
epoch[68] loss: 0.1660
epoch[69] loss: 0.0298
epoch[70] loss: 0.0633
epoch[71] loss: 0.0100
epoch[72] loss: 0.1455
epoch[73] loss: 0.0962
epoch[74] loss: 0.1906
epoch[75] loss: 0.0872
epoch[76] loss: 0.1409
epoch[77] loss: 0.0295
epoch[78] loss: 0.0506
epoch[79] loss: 0.2864
epoch[80] loss: 0.0231
epoch[81] loss: 0.0760
epoch[82] loss: 0.0249
epoch[83] loss: 0.0694
epoch[84] loss: 0.3208
epoch[85] loss: 0.4952
epoch[86] loss: 0.1414
epoch[87] loss: 0.1518
epoch[88] loss: 0.0397
epoch[89] loss: 0.0941
epoch[90] loss: 0.1205
epoch[91] loss: 0.0815
epoch[92] loss: 0.0767
epoch[93] loss: 0.0212
epoch[94] loss: 0.0082
epoch[95] loss: 0.0798
epoch[96] loss: 0.1038
epoch[97] loss: 0.1353
epoch[98] loss: 0.0666
epoch[99] loss: 0.0283
187500it [24:21, 150.17it/s, loss=0.0283]

In [22]:
plt.plot(loss_history);
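
As a rough sanity check only (this reuses a training batch rather than a held-out split, so it says nothing about generalization), a sketch of batch accuracy:

charcnn.eval()   # disable dropout / use batch-norm running stats for inference
batch = next(iter(dataloader))
X = Variable(batch['X'])
if torch.cuda.is_available():
    X = X.cuda()
preds = charcnn(X).max(1)[1].view(-1).cpu().data.numpy()
(preds == batch['y'].numpy()).mean()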


