In [1]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(123)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Embedding, Convolution1D, MaxPooling1D, Flatten
from keras.layers import Input, Merge, Dense
from keras.layers import Dropout
from keras.optimizers import SGD
from keras.callbacks import ModelCheckpoint
databaseConnectionServer = 'srn02.cs.cityu.edu.hk'
documentTable = 'document'
BASE_DIR = '../../'
GLOVE_DIR = BASE_DIR + 'glove/'
EMBEDDING_DIM = 200
embedfile = 'glove.6B.' + str(EMBEDDING_DIM) + 'd.txt'
chunk_size = 1000
samples = 3200
MAX_NB_WORDS = 40000
VALIDATION_SPLIT = 0.2
CONVOLUTION_FEATURE = 256
BORDER_MODE = 'valid'
DENSE_FEATURE = 256
DROP_OUT = 0.5
LEARNING_RATE = 0.01
MOMENTUM = 0.9
nb_epoch = 10
batch_size = 16
doc_id = 85
author_id = 44
authorList = [64,44,82,100]
MAX_SEQUENCE_LENGTH = chunk_size
classes = len(authorList)
In [2]:
print('Level = Word')
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, embedfile))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('File used: %s' % (embedfile))
print('Found %s word vectors.' % (len(embeddings_index)))
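A quick sanity check of the loaded embeddings (a minimal sketch; 'the' is just an arbitrary probe word assumed to be in the GloVe vocabulary):
# Probe one common word and confirm the vector length matches EMBEDDING_DIM
probe = embeddings_index.get('the')
if probe is not None:
    print('Dimension of a sample vector: %s' % (probe.shape[0]))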
In [3]:
texts = [] # list of text samples
labels_index = {} # dictionary mapping label name to numeric id
labels = [] # list of label ids
import DatabaseQuery
from sshtunnel import SSHTunnelForwarder
PORT=5432
with SSHTunnelForwarder((databaseConnectionServer, 22),
                        ssh_username='stylometry',
                        ssh_password='stylometry',
                        remote_bind_address=('localhost', 5432),
                        local_bind_address=('localhost', 5400)):
    textToUse = DatabaseQuery.getWordAuthData(5400, authorList, doc_id,
                                              documentTable=documentTable,
                                              chunk_size=chunk_size)
labels = []
texts = []
size = []
authorList = textToUse.author_id.unique()
for auth in authorList:
    current = textToUse.loc[textToUse['author_id'] == auth]
    size.append(current.shape[0])
    print("Author: %5s Size: %5s" % (auth, current.shape[0]))
print("Min: %s" % (min(size)))
print("Max: %s" % (max(size)))
authorList = authorList.tolist()
for auth in authorList:
    current = textToUse.loc[textToUse['author_id'] == auth]
    if samples > min(size):
        samples = min(size)
    current = current.sample(n=samples)
    textlist = current.doc_content.tolist()
    texts = texts + textlist
    labels = labels + [authorList.index(a) for a in current.author_id.tolist()]
labels_index = {}
for i, auth in enumerate(authorList):
    labels_index[i] = auth
del textToUse
print('Authors %s.' % (str(authorList)))
print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels))
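A quick check that the sampled chunks are balanced across authors (at this point labels holds integer class indices, one per text chunk):
from collections import Counter
print(Counter(labels))  # chunks per class index; should be roughly equal after sampling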
In [4]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=chunk_size)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# split the data into a training set and a validation set
from sklearn.model_selection import train_test_split
trainX, valX, trainY, valY = train_test_split(data, labels, test_size=VALIDATION_SPLIT)
del data, labels
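To see what the vectorization step produces, a minimal sketch (the sample sentence is made up; it only illustrates how the fitted tokenizer maps words to indices and how pad_sequences pads to chunk_size):
example = tokenizer.texts_to_sequences(['the quick brown fox'])
print(example)                                           # word indices for the known words
print(pad_sequences(example, maxlen=chunk_size).shape)   # (1, chunk_size)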
In [5]:
print('Preparing embedding matrix.')
# prepare embedding matrix
nb_words = MAX_NB_WORDS
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index are left as all zeros
        embedding_matrix[i] = embedding_vector
del embeddings_index
print('Training model.')
In [6]:
print(nb_words)
print(MAX_NB_WORDS)
print(len(word_index))
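As a rough check of how much of the vocabulary is covered by GloVe, a small sketch (assumes embedding_matrix has already been filled above; rows that stayed all zeros correspond to out-of-vocabulary words):
nonzero_rows = np.count_nonzero(np.abs(embedding_matrix).sum(axis=1))
print('Rows with a pretrained vector: %s / %s' % (nonzero_rows, embedding_matrix.shape[0]))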
In [19]:
ngram_filters = [3, 4]                      # n-gram sizes for the parallel convolutions: 3-gram and 4-gram
convs = []
graph_in = Input(shape=(chunk_size, EMBEDDING_DIM))
for n_gram in ngram_filters:
    conv = Convolution1D(                   # Convolution layer, features: 256, kernel size: n_gram
        nb_filter=CONVOLUTION_FEATURE,      # Number of filters (kernels) to generate
        filter_length=n_gram,               # Kernel size, i.e. the n-gram length
        activation='relu')(graph_in)        # Activation function to use
    pool = MaxPooling1D(                    # Max pooling layer, pool size: 3
        pool_length=3)(conv)                # Size of the pooling window
    flat = Flatten()(pool)
    convs.append(flat)
model = Sequential()
model.add(Embedding(                        # Layer 0, start
    input_dim=nb_words + 1,                 # Size of the vocabulary, has to be nb_words + 1
    output_dim=EMBEDDING_DIM,               # Dimension of the embedding vectors
    weights=[embedding_matrix],             # Initialize with the pretrained GloVe weights
    input_length=chunk_size,                # Length of the input sequences
    trainable=False))                       # Freeze the embedding weights during training
model.add(Dropout(0.25)) # Dropout 25%
out = Merge(mode='concat')(convs)           # Layer 1, concatenate the n-gram feature maps
graph = Model(input=graph_in, output=out)   # Model wrapping the parallel n-gram convolutions
model.add(graph)                            # Add the n-gram convolution block to the sequential model
model.add(Dropout(DROP_OUT)) # Dropout 50%
model.add(Dense(                            # Layer 3, output size: 256
    output_dim=DENSE_FEATURE,               # Output dimension
    activation='sigmoid'))                  # Activation function to use
model.add(Dense(                            # Layer 4, output size: number of authors, final
    output_dim=classes,                     # Output dimension
    activation='softmax'))                  # Activation function to use
sgd = SGD(lr=LEARNING_RATE, momentum=MOMENTUM, nesterov=True)
filepath="author-cnn-ngrams-word.hdf5"
model.load_weights(filepath)
model.compile(loss='categorical_crossentropy', optimizer=sgd,
              metrics=['accuracy'])
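To inspect the assembled architecture, an optional check using the standard Keras summary method:
model.summary()   # prints the layer stack, output shapes and parameter counts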
In [20]:
filepath="author-cnn-ngrams-word.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
# Fit the model; commented out here because the pretrained weights are loaded
# from the saved checkpoint below instead of retraining.
# history = model.fit(trainX, trainY, validation_data=(valX, valY),
#                     nb_epoch=nb_epoch, batch_size=batch_size,
#                     callbacks=callbacks_list)
# load weights from the best checkpoint
model.load_weights(filepath)
# Compile model again (required to make predictions)
model.compile(loss='categorical_crossentropy', optimizer=sgd,
              metrics=['accuracy'])
train_acc = (model.evaluate(trainX, trainY))[1]
print("\n\nFinal Train Accuracy: %.2f" % (train_acc * 100))
val_acc = (model.evaluate(valX, valY))[1]
print("\nFinal Test Accuracy: %.2f" % (val_acc * 100))
import cPickle as pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
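For later reuse, a minimal sketch of loading the pickled tokenizer and mapping a prediction back to an author id (new_text is a placeholder for any unseen document; labels_index maps class indices back to author ids as built above):
with open('tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)
new_text = 'some unseen document text'      # placeholder input
seq = pad_sequences(loaded_tokenizer.texts_to_sequences([new_text]), maxlen=chunk_size)
probs = model.predict(seq)                  # class probabilities from the full model
print('Predicted author id: %s' % (labels_index[np.argmax(probs[0])]))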
In [21]:
model.pop()
# feature_model = model
# # Compile model again (required to make predictions)
# model.compile(loss='categorical_crossentropy', optimizer=sgd,
#               metrics=['accuracy'])
In [22]:
pred = model.predict(trainX)
In [23]:
print(pred.shape)
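After model.pop() removes the softmax layer, pred holds the 256-dimensional activations of the preceding dense layer rather than class probabilities; a small sketch persisting them for later use (the file name features-word.npy is just an illustrative choice):
assert pred.shape[1] == DENSE_FEATURE       # one 256-dim feature vector per training chunk
np.save('features-word.npy', pred)          # save the extracted features to disk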
In [ ]:
print(pred[0])
In [ ]:
print(pred[4141])