Genre Classification

This is an example of document genre classification using the Brown corpus (provided by NLTK).

The 5000 most common words are taken from the Brown corpus. Each document is then encoded as a 5000-dimensional bag-of-words vector, which is fed into a neural network that predicts the genre.

The input for the network is a 5000-dimensional vector. Each position corresponds to one of the most common words. The value at a position is set to 1 if that word appears in the document, and to 0 otherwise.
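
As a toy illustration of this encoding (a hypothetical three-word vocabulary, unrelated to the real vocabulary built below):

In [ ]:
# Toy version of the binary bag-of-words encoding
vocabulary = ['economy', 'love', 'senate']
document = ['the', 'senate', 'votes', 'on', 'the', 'economy']
bow = [1 if word in document else 0 for word in vocabulary]
print bow  # prints [1, 0, 1]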

Reading the corpus

Reads in the corpus and creates a bag-of-words representation for each document.


In [1]:
import nltk
import nltk.corpus
import random

from nltk.corpus import brown
from nltk.stem.porter import PorterStemmer
import numpy as np


np.random.seed(0)

num_max_words = 5000

# Lowercased English stopwords, kept in a set for fast membership tests
stopwords = set(w.lower() for w in nltk.corpus.stopwords.words('english'))


def preprocessDocument(words):
    # Lowercase, drop short words and stopwords, then stem
    stemmer = PorterStemmer()
    return [stemmer.stem(w.lower()) for w in words if len(w) >= 3 and w.lower() not in stopwords]

# Frequency distribution over the whole (preprocessed) Brown corpus
brown_words = brown.words()
fdist = nltk.FreqDist(preprocessDocument(brown_words))


# The num_max_words most frequent stems, sorted alphabetically
max_words = sorted(word for word, freq in fdist.most_common(num_max_words))

# Position of each word in the bag-of-words vector
max_words_idx = {word: idx for idx, word in enumerate(max_words)}



def getBoW(words):
    # Binary bag-of-words vector: 1 if a vocabulary word occurs in the document, else 0
    outputvector = [0] * len(max_words)

    preprocessed = preprocessDocument(words)

    for word in preprocessed:
        if word in max_words_idx:
            outputvector[max_words_idx[word]] = 1

    return outputvector


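As a quick sanity check (a minimal sketch; the word list is made up), getBoW can be applied to a short list of tokens. How many entries end up non-zero depends on which stems are among the 5000 most common words:

In [ ]:
vec = getBoW(['The', 'government', 'said', 'the', 'economy', 'grows'])
print "Vector length:", len(vec)
print "Non-zero entries:", sum(vec)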

Train / Test Set

This shuffles the 500 Brown documents and splits them into a training set (300 documents) and a test set (the remaining 200).


In [7]:
# Map each genre to a class index
category2Idx = {cat: idx for idx, cat in enumerate(brown.categories())}

file_ids = sorted(brown.fileids())
print "File IDs:",",".join(file_ids[0:10])

random.seed(4)
random.shuffle(file_ids)

train_file_ids, test_file_ids = file_ids[0:300], file_ids[300:]

print "Train File IDs:",",".join(train_file_ids[0:10])
print "Test File IDs:",",".join(test_file_ids[0:10])

train_x = []
train_y = []

test_x = []
test_y = []

for fileid in train_file_ids:
    category = brown.categories(fileid)[0]  # each Brown file belongs to exactly one genre
    bow = getBoW(brown.words(fileid))

    train_x.append(bow)
    train_y.append(category2Idx[category])

for fileid in test_file_ids:
    category = brown.categories(fileid)[0]
    bow = getBoW(brown.words(fileid))

    test_x.append(bow)
    test_y.append(category2Idx[category])
    

train_x = np.asarray(train_x, dtype='int32')
train_y = np.asarray(train_y, dtype='int32')
test_x = np.asarray(test_x, dtype='int32')
test_y = np.asarray(test_y, dtype='int32')


File IDs: ca01,ca02,ca03,ca04,ca05,ca06,ca07,ca08,ca09,ca10
Train File IDs: ce29,ce05,ck08,cg07,ck26,cj05,cf07,cg65,cj70,cj11
Test File IDs: cn11,cg73,cp18,cf40,ca11,ca09,cf21,cj67,cg51,ch13
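
Since the split above is a plain random shuffle rather than a stratified one, it can be useful to check how the 15 genres are distributed over the training set. A minimal sketch:

In [ ]:
from collections import Counter

# Count the training documents per genre
idx2category = {idx: cat for cat, idx in category2Idx.items()}
for idx, count in sorted(Counter(train_y).items()):
    print idx2category[idx], count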

Neural Network

Given the train and test sets, we now define a feed-forward network. We use a 500-dimensional hidden layer (tanh activation) with dropout of 0.5.

Feel free to try different hidden layer sizes and numbers of hidden layers.
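
For example, a variant with a second hidden layer could look like the sketch below (it reuses the imports and variables from the next cell, so run that cell first; the layer size 250 is an arbitrary choice):

In [ ]:
# Sketch of a deeper variant: two tanh hidden layers with dropout
deeper_model = Sequential()
deeper_model.add(Dense(500, input_dim=num_max_words, activation='tanh'))
deeper_model.add(Dropout(0.5))
deeper_model.add(Dense(250, activation='tanh'))  # hypothetical second hidden layer
deeper_model.add(Dropout(0.5))
deeper_model.add(Dense(nb_classes, activation='softmax'))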


In [19]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.utils import np_utils

batch_size = 30
nb_epoch = 50
nb_classes = len(category2Idx)

model = Sequential()
model.add(Dense(500, input_dim=num_max_words, activation='tanh'))  # hidden layer
model.add(Dropout(0.5))
model.add(Dense(nb_classes, activation='softmax'))  # one output per genre

train_y_cat = np_utils.to_categorical(train_y, nb_classes)
test_y_cat = np_utils.to_categorical(test_y, nb_classes)


model.compile(loss='categorical_crossentropy', optimizer='Adam')
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
print('Test score before training:', score[0])
print('Test accuracy before training:', score[1])
model.fit(train_x, train_y_cat, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, validation_data=(test_x, test_y_cat))
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
print('Test score after training:', score[0])
print('Test accuracy after training:', score[1])


('Test score before training:', 2.7576393604278566)
('Test accuracy before training:', 0.070000000000000007)
Train on 300 samples, validate on 200 samples
Epoch 1/50
300/300 [==============================] - 0s - loss: 2.3795 - acc: 0.2833 - val_loss: 2.0335 - val_acc: 0.3100
Epoch 2/50
300/300 [==============================] - 0s - loss: 0.8440 - acc: 0.8200 - val_loss: 1.6403 - val_acc: 0.4200
Epoch 3/50
300/300 [==============================] - 0s - loss: 0.3087 - acc: 0.9833 - val_loss: 1.4993 - val_acc: 0.4400
Epoch 4/50
300/300 [==============================] - 0s - loss: 0.1097 - acc: 1.0000 - val_loss: 1.4861 - val_acc: 0.4500
Epoch 5/50
300/300 [==============================] - 0s - loss: 0.0573 - acc: 1.0000 - val_loss: 1.4593 - val_acc: 0.4750
Epoch 6/50
300/300 [==============================] - 0s - loss: 0.0313 - acc: 1.0000 - val_loss: 1.4691 - val_acc: 0.4450
Epoch 7/50
300/300 [==============================] - 0s - loss: 0.0243 - acc: 1.0000 - val_loss: 1.4730 - val_acc: 0.4400
Epoch 8/50
300/300 [==============================] - 0s - loss: 0.0176 - acc: 1.0000 - val_loss: 1.4788 - val_acc: 0.4450
Epoch 9/50
300/300 [==============================] - 0s - loss: 0.0155 - acc: 1.0000 - val_loss: 1.4782 - val_acc: 0.4750
Epoch 10/50
300/300 [==============================] - 0s - loss: 0.0133 - acc: 1.0000 - val_loss: 1.4798 - val_acc: 0.4650
Epoch 11/50
300/300 [==============================] - 0s - loss: 0.0118 - acc: 1.0000 - val_loss: 1.4776 - val_acc: 0.4700
Epoch 12/50
300/300 [==============================] - 0s - loss: 0.0113 - acc: 1.0000 - val_loss: 1.4777 - val_acc: 0.4700
Epoch 13/50
300/300 [==============================] - 0s - loss: 0.0088 - acc: 1.0000 - val_loss: 1.4795 - val_acc: 0.4750
Epoch 14/50
300/300 [==============================] - 0s - loss: 0.0085 - acc: 1.0000 - val_loss: 1.4851 - val_acc: 0.4650
Epoch 15/50
300/300 [==============================] - 0s - loss: 0.0077 - acc: 1.0000 - val_loss: 1.4893 - val_acc: 0.4600
Epoch 16/50
300/300 [==============================] - 0s - loss: 0.0069 - acc: 1.0000 - val_loss: 1.4878 - val_acc: 0.4700
Epoch 17/50
270/300 [==========================>...] - ETA: 0s - loss: 0.0065 - acc: 1.0000
KeyboardInterrupt: training was interrupted manually during epoch 17, so the evaluation after training was never printed.
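
Even though training was stopped early, the partially trained model can still be inspected. A minimal sketch that maps predictions back to genre names (predict_classes is part of the Keras 0.2.0 Sequential API):

In [ ]:
# Predict the genre of a few test documents and compare with the gold label
idx2category = {idx: cat for cat, idx in category2Idx.items()}
predictions = model.predict_classes(test_x[:5], verbose=0)
for fileid, pred in zip(test_file_ids[:5], predictions):
    print fileid, "->", idx2category[pred], "(gold:", brown.categories(fileid)[0] + ")"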
