This is an example of document genre classification using the Brown corpus (provided by NLTK).
The 5000 most common words are taken from the Brown Corpus. Then we create a 5000 dimensional bag of words and input this to a neural network. The network predicts the genre.
The input for the network is a 5000-dimensional vector. Each position corresponds to one of the most common words. The value is set to 1 if the word appears in a document, otherwise to 0.
In [1]:
import nltk
import gensim
import nltk.corpus
import random
from nltk.corpus import brown
from nltk.stem.porter import *
import numpy as np
np.random.seed(0)

# Size of the bag-of-words vocabulary (most frequent stems kept).
num_max_words = 5000

# Lower-cased English stopwords as a lookup table for O(1) membership tests.
stopwords = {sw.lower(): True for sw in nltk.corpus.stopwords.words('english')}
def preprocessDocument(words):
    """Lower-case *words*, drop short tokens and stopwords, Porter-stem the rest.

    A token is kept only if its original form has length >= 3 and its
    lower-cased form is not in the module-level ``stopwords`` table.
    """
    stemmer = PorterStemmer()
    tokens = []
    for w in words:
        if len(w) < 3:
            continue
        lowered = w.lower()
        if lowered in stopwords:
            continue
        tokens.append(stemmer.stem(lowered))
    return tokens
# Frequency distribution over the preprocessed (stemmed, stopword-free)
# Brown corpus.
brown_words = brown.words()
fdist = nltk.FreqDist(preprocessDocument(brown_words))

# Vocabulary: the num_max_words most frequent stems, sorted so that the
# bag-of-words dimension order is deterministic across runs.
max_words = sorted(word for word, freq in fdist.most_common(num_max_words))

# Map each vocabulary word to its position in the BoW vector
# (enumerate replaces the hand-rolled idx counter of the original).
max_words_idx = {word: idx for idx, word in enumerate(max_words)}
def getBoW(words):
    """Return a binary bag-of-words vector for *words*.

    The vector has one slot per vocabulary entry in ``max_words``; a slot is
    1 if the corresponding stem occurs in the document, otherwise 0.
    """
    bow = [0] * len(max_words)
    for token in preprocessDocument(words):
        position = max_words_idx.get(token)
        if position is not None:
            bow[position] = 1
    return bow
In [7]:
category2Idx = {}
idx = 0
for cat in brown.categories():
category2Idx[cat] = idx
idx += 1
file_ids = sorted(brown.fileids())
print "File IDs:",",".join(file_ids[0:10])
random.seed(4)
random.shuffle(file_ids)
train_file_ids, test_file_ids = file_ids[0:300],file_ids[300:]
print "Train File IDs:",",".join(train_file_ids[0:10])
print "Test File IDs:",",".join(test_file_ids[0:10])
def _buildDataset(fileids):
    """Return (bow_vectors, class_indices) lists for the given Brown file ids.

    Each document is labelled with the first of its Brown categories.
    """
    x, y = [], []
    for fileid in fileids:
        category = brown.categories(fileid)[0]
        x.append(getBoW(brown.words(fileid)))
        y.append(category2Idx[category])
    return x, y


# The original duplicated this loop verbatim for train and test;
# a single helper removes the duplication without changing the arrays.
train_x, train_y = _buildDataset(train_file_ids)
test_x, test_y = _buildDataset(test_file_ids)

train_x = np.asarray(train_x, dtype='int32')
train_y = np.asarray(train_y, dtype='int32')
test_x = np.asarray(test_x, dtype='int32')
test_y = np.asarray(test_y, dtype='int32')
In [19]:
from keras.layers import containers
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Flatten, AutoEncoder, Dropout
from keras.optimizers import SGD
from keras.utils import np_utils
batch_size = 30
nb_epoch = 50
nb_classes = len(category2Idx)

# Simple MLP: 5000-dim BoW input -> 500 tanh units (with dropout) ->
# softmax over the genre classes.
model = Sequential()
model.add(Dense(500, input_dim=num_max_words, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes, activation='softmax'))

# One-hot encode the integer class labels for categorical cross-entropy.
train_y_cat = np_utils.to_categorical(train_y, nb_classes)
test_y_cat = np_utils.to_categorical(test_y, nb_classes)

model.compile(loss='categorical_crossentropy', optimizer='Adam')

# Baseline score of the untrained (randomly initialized) network.
# NOTE: the original messages said "fine turning" — a typo, and there is no
# pretraining step here, so the messages now say "training".
score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
print('Test score before training:', score[0])
print('Test accuracy before training:', score[1])

model.fit(train_x, train_y_cat, batch_size=batch_size, nb_epoch=nb_epoch,
          show_accuracy=True, validation_data=(test_x, test_y_cat))

score = model.evaluate(test_x, test_y_cat, show_accuracy=True, verbose=0)
print('Test score after training:', score[0])
print('Test accuracy after training:', score[1])
In [ ]:
In [ ]:
In [ ]: