Imports and logging setup


In [ ]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from sklearn import metrics
import pandas

import tensorflow as tf
from tensorflow.contrib import learn

import chainer.functions as F
import chainer.links as L
from chainer import optimizers, Chain
from commonml.skchainer import ChainerEstimator, SoftmaxCrossEntropyClassifier, RnnEstimator
from commonml.text import VocabularyTransformer

import logging
# Configure root logging at INFO with a compact "LEVEL : message" format.
# logging.basicConfig() does nothing when the root logger already has handlers
# (which may be the case inside a notebook kernel), so the level is also set
# explicitly. Logger.setLevel() is used instead of assigning `.level` directly
# so the level is validated and any cached isEnabledFor() results are dropped.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

Download, unpack, and read the DBpedia dataset.


In [ ]:
# Load the DBpedia dataset through the TF Learn dataset helper.
dbpedia = learn.datasets.load_dataset('dbpedia')

# Inputs are taken from column 1 of each split's data table (presumably the
# document text -- TODO confirm against the loader); targets become Series.
X_train = pandas.DataFrame(dbpedia.train.data)[1]
y_train = pandas.Series(dbpedia.train.target)
X_test = pandas.DataFrame(dbpedia.test.data)[1]
y_test = pandas.Series(dbpedia.test.target)

Process vocabulary


In [ ]:
# Value handed to VocabularyTransformer; presumably the number of tokens kept
# per document -- TODO confirm against commonml.text.
MAX_DOCUMENT_LENGTH = 10


def _as_index_array(rows):
    """Materialise an iterable of per-document rows into one NumPy array."""
    return np.array(list(rows))


vocab_transformer = VocabularyTransformer(MAX_DOCUMENT_LENGTH)

# Fit on the training texts, then reuse the fitted transformer on the test texts.
X_train_idx = _as_index_array(vocab_transformer.fit_transform(X_train))
X_test_idx = _as_index_array(vocab_transformer.transform(X_test))

# Vocabulary size, used below as the embedding table's input dimension.
n_words = len(vocab_transformer.get_feature_names())
print('Total words: %d' % n_words)

Model definition, training, and evaluation


In [ ]:
# Width of the word embedding; also used as the LSTM hidden size when the
# model is instantiated below.
EMBEDDING_SIZE = 50


class RNNModel(Chain):
    """Embedding -> LSTM -> linear network built from Chainer links.

    Args:
        n_words: Size of the embedding table (number of distinct word ids).
        n_units: Embedding width, also used as the LSTM input and output size.
        n_classes: Number of output units produced by the final linear layer.
    """

    def __init__(self, n_words, n_units, n_classes):
        # EmbedID is a parameterised link, so it is taken from chainer.links
        # like the other two layers; the chainer.functions name was only a
        # deprecated alias and is not available in newer Chainer releases.
        super(RNNModel, self).__init__(embed=L.EmbedID(n_words, n_units),
                                       l1=L.LSTM(n_units, n_units),
                                       l2=L.Linear(n_units, n_classes),
                                      )

    def __call__(self, x):
        """Run one forward step and return the output of the linear layer.

        `x` is passed straight to the embedding link, so it is presumably an
        array of integer word ids for a single time step -- TODO confirm
        against how RnnEstimator feeds the model.
        """
        h0 = self.embed(x)
        h1 = self.l1(h0)
        h2 = self.l2(h1)
        return h2

    def reset_state(self):
        """Reset the LSTM link's recurrent state."""
        self.l1.reset_state()

# Build the network and wrap it in the softmax cross-entropy classifier.
# NOTE(review): the 15 output units presumably cover 1-based DBpedia labels
# (14 classes plus an unused index 0) -- confirm against the loaded targets.
rnn_classifier_model = SoftmaxCrossEntropyClassifier(
    RNNModel(n_words, EMBEDDING_SIZE, 15))

# NOTE(review): device=0 presumably selects the first GPU, following the usual
# Chainer convention -- confirm against RnnEstimator.
classifier = RnnEstimator(
    model=rnn_classifier_model,
    optimizer=optimizers.Adam(),
    batch_size=1000,
    device=0,
    stop_trigger=(100, 'epoch'),
)
classifier.fit(X_train_idx, y_train)

# Score the fitted estimator on the test split.
y_pred = classifier.predict(X_test_idx)
score = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {0:f}'.format(score))