In [1]:
# coding: utf-8
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

import pandas as pd
import theano
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Flatten
from keras.layers import Convolution1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dropout
from keras.optimizers import SGD, Adadelta
from keras.models import Sequential
import sys
import string

MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
CONVOLUTION_FEATURE = 256
DENSE_FEATURE = 1024
DROP_OUT = 0.5
LEARNING_RATE = 0.001
MOMENTUM = 0.9
EPOCH = 25
BATCH_SIZE = 128
embed = 100
embedfile = 'glove.6B.100d.txt'
authorList = [551, 2703, 2971, 8303, 7679]
doc_id = 1210
chunk_size = 1000
nb_epoch = 30
EPOCH = nb_epoch


Using gpu device 0: GeForce GTX 950 (CNMeM is disabled, cuDNN 5005)
Using Theano backend.

In [2]:
# first, build the character vocabulary: an index mapping each character
# in the alphabet to an integer id (and back via reverse_vocab)

# This alphabet has 69 chars vs. the 70 reported in the paper, because the paper's
# list counts the '-' character twice. See https://github.com/zhangxiangxiao/Crepe#issues.

print('Indexing char vectors.')

alphabet = (list(string.ascii_lowercase) + list(string.digits) + 
            list(string.punctuation) + ['\n'])
vocab_size = len(alphabet)
check = set(alphabet)

vocab = {}
reverse_vocab = {}
for ix, t in enumerate(alphabet):
    vocab[t] = ix
    reverse_vocab[ix] = t

MAX_NB_WORDS = vocab_size

print('Found %s char vectors.' % str(MAX_NB_WORDS))

# second, prepare text samples and their labels
print('Processing text dataset')


Indexing char vectors.
Found 69 char vectors.
Processing text dataset
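
As a quick sanity check of the character vocabulary (a minimal sketch, not part of the original run), the mapping can be round-tripped for a few characters:

# Sketch: verify the char -> index -> char round trip for a few alphabet members.
for ch in ['a', 'z', '0', '!', '\n']:
    assert reverse_vocab[vocab[ch]] == ch
print(vocab['a'], vocab['z'], vocab['0'])     # 0 25 26: lowercase letters first, then digits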

In [3]:
texts = []  # list of text samples
labels_index = {}  # dictionary mapping numeric label id to author id
labels = []  # list of label ids
import DatabaseQuery
# textToUse = pd.read_csv("suffle_4_6000.csv", names=["author_id", "doc_content"], dtype={'author_id': int})
from sshtunnel import SSHTunnelForwarder
PORT=5432
with SSHTunnelForwarder(('srn01.cs.cityu.edu.hk', 22),
                        ssh_username='stylometry',
                        ssh_password='stylometry',
                        remote_bind_address=('localhost', 5432),
                        local_bind_address=('localhost', 5400)):
    textToUse = DatabaseQuery.getCharAuthData(5400, authors = authorList, doc = doc_id, 
                                              vocab_size = vocab_size, chunk_size = chunk_size)


chunk_size 1449.
Execution completed
Read completed
Number of rows: 113
author_id       int64
doc_content    object
dtype: object
Data Frame created: Shape: (30266, 2)
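
If the SSH tunnel or database is not reachable, the commented-out line above suggests the same frame can be loaded from a pre-exported CSV. A hedged sketch of that fallback (the file name comes from that comment and is not shipped with this notebook):

# Hypothetical offline fallback: load pre-chunked rows from the CSV named in the
# commented-out line above instead of querying the database over SSH.
if os.path.isfile("suffle_4_6000.csv"):
    textToUse = pd.read_csv("suffle_4_6000.csv",
                            names=["author_id", "doc_content"],
                            dtype={'author_id': int})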

In [4]:
labels = []
texts = []
size = []
authorList = textToUse.author_id.unique()
for auth in authorList:
    current = textToUse.loc[textToUse['author_id'] == auth]
    size.append(current.shape[0])
print("Mean: %s" % (sum(size) / len(size)))
print("Min: %s" % (min(size)))
print("Max: %s" % (max(size)))

authorList = authorList.tolist()


Mean: 6053
Min: 1045
Max: 18665
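
The same per-author chunk counts can be read directly from pandas; a small sketch equivalent to the loop above:

# Sketch: per-author chunk counts and their summary statistics via pandas.
counts = textToUse.author_id.value_counts()
print(counts)
print(counts.describe())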

In [5]:
labels = []
texts = []
maxRows = 1000
for auth in authorList:
    current = textToUse.loc[textToUse['author_id'] == auth]
    current = current.sample(n = maxRows)
    textlist = current.doc_content.tolist()
    texts = texts + textlist
    labels = labels + [authorList.index(author_id) for author_id in current.author_id.tolist()]
labels_index = {}                             # numeric label id -> author id
for i, auth in enumerate(authorList):
    labels_index[i] = auth

del textToUse

print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels))


Found 5000 texts.
Found 5000 labels.
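
A quick check (sketch) that the subsample is balanced, i.e. maxRows chunks per author:

# Sketch: count label ids before one-hot encoding; each of the 5 ids should appear 1000 times.
from collections import Counter
print(Counter(labels))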

In [6]:
# def encode_data(x, MAX_SEQUENCE_LENGTH, vocab, vocab_size, check):
# Iterate over the loaded data and create a matrix of size MAX_SEQUENCE_LENGTH x vocab_size.
# Here that is 1449 x 69 (the Crepe paper uses a fixed length of 1014). The matrices are
# stacked into a 3D tensor of size data_samples x MAX_SEQUENCE_LENGTH x vocab_size. Each
# character is encoded as a one-hot vector; characters not in the vocab become an all-zero vector.

MAX_SEQUENCE_LENGTH = int((100 * chunk_size) / vocab_size)

data = np.zeros((len(texts), MAX_SEQUENCE_LENGTH, vocab_size))
for dix, sent in enumerate(texts):
    counter = 0
    sent_array = np.zeros((MAX_SEQUENCE_LENGTH, vocab_size))
    chars = list(sent.lower().replace(' ', ''))   # lowercase and drop spaces (space is not in the alphabet)
    for c in chars:
        if counter >= MAX_SEQUENCE_LENGTH:
            break                                 # truncate chunks longer than the window
        else:
            char_array = np.zeros(vocab_size, dtype=np.int)
            if c in check:
                ix = vocab[c]
                char_array[ix] = 1
            sent_array[counter, :] = char_array
            counter += 1
    data[dix, :, :] = sent_array

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


Shape of data tensor: (5000, 1449, 69)
Shape of label tensor: (5000, 5)
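
To verify the encoding, a chunk can be decoded back to text (a minimal sketch; all-zero rows, i.e. padding and out-of-vocabulary characters, are skipped, and spaces were already stripped before encoding):

# Sketch: decode the first encoded chunk back into characters via reverse_vocab.
row = data[0]
decoded = ''.join(reverse_vocab[int(np.argmax(r))] for r in row if r.sum() > 0)
print(decoded[:80])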

In [7]:
# split the data into a training set and a validation set
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(data, labels, test_size=0.3, random_state=42)

print('Training model.')

del data, labels


Training model.

In [8]:
print(y_train[4])
print(len(x_train[4]))
print(type(x_train[4]))


[ 0.  0.  1.  0.  0.]
1449
<type 'numpy.ndarray'>

In [9]:
model = Sequential()

# model.add(Embedding(                          # Layer 0, Start
#     input_dim=nb_words + 1,                   # Size of the dictionary, has to be vocabulary size + 1
#     output_dim=EMBEDDING_DIM,                 # Dimension of the embeddings to generate
#     weights=[embedding_matrix],               # Initialize word weights
#     input_length=MAX_SEQUENCE_LENGTH))        # Length of input sequences, needed in the first layer

model.add(Convolution1D(                      # Layer 1,   Features: 256, Kernel Size: 7
    nb_filter=CONVOLUTION_FEATURE,            # Number of convolution filters (kernels) to learn
    filter_length=7,                          # Size of each kernel
    border_mode='valid',                      # 'valid' border: no padding, so the output length shrinks
    activation='relu',                        # Activation function to use
    input_shape=(MAX_SEQUENCE_LENGTH, vocab_size)))

model.add(MaxPooling1D(                       # Layer 1a,  Max Pooling: 3
    pool_length=3))                           # Size of the pooling window

model.add(Convolution1D(                      # Layer 2,   Features: 256, Kernel Size: 7
    nb_filter=CONVOLUTION_FEATURE,            # Number of convolution filters (kernels) to learn
    filter_length=7,                          # Size of each kernel
    border_mode='valid',                      # 'valid' border: no padding, so the output length shrinks
    activation='relu'))                       # Activation function to use

model.add(MaxPooling1D(                       # Layer 2a,  Max Pooling: 3
    pool_length=3))                           # Size of the pooling window

model.add(Convolution1D(                      # Layer 3,   Features: 256, Kernel Size: 3
    nb_filter=CONVOLUTION_FEATURE,            # Number of convolution filters (kernels) to learn
    filter_length=3,                          # Size of each kernel
    border_mode='valid',                      # 'valid' border: no padding, so the output length shrinks
    activation='relu'))                       # Activation function to use

model.add(Convolution1D(                      # Layer 4,   Features: 256, Kernel Size: 3
    nb_filter=CONVOLUTION_FEATURE,            # Number of convolution filters (kernels) to learn
    filter_length=3,                          # Size of each kernel
    border_mode='valid',                      # 'valid' border: no padding, so the output length shrinks
    activation='relu'))                       # Activation function to use

model.add(Convolution1D(                      # Layer 5,   Features: 256, Kernel Size: 3
    nb_filter=CONVOLUTION_FEATURE,            # Number of convolution filters (kernels) to learn
    filter_length=3,                          # Size of each kernel
    border_mode='valid',                      # 'valid' border: no padding, so the output length shrinks
    activation='relu'))                       # Activation function to use

model.add(Convolution1D(                      # Layer 6,   Features: 256, Kernel Size: 5
    nb_filter=CONVOLUTION_FEATURE,            # Number of convolution filters (kernels) to learn
    filter_length=5,                          # Size of each kernel
    border_mode='valid',                      # 'valid' border: no padding, so the output length shrinks
    activation='relu'))                       # Activation function to use

model.add(MaxPooling1D(                       # Layer 6a,  Max Pooling: 3
    pool_length=3))                           # Size of the pooling window

model.add(Flatten())                          # Layer 7

model.add(Dense(                              # Layer 7a,  Output Size: 1024
    output_dim=DENSE_FEATURE,                 # Output dimension
    activation='relu'))                       # Activation function to use

model.add(Dropout(DROP_OUT))

model.add(Dense(                              # Layer 8,   Output Size: 1024
    output_dim=DENSE_FEATURE,                 # Output dimension
    activation='relu'))                       # Activation function to use

model.add(Dropout(DROP_OUT))

model.add(Dense(                              # Layer 9,   Output Size: number of unique labels, Final
    output_dim=len(labels_index),             # Output dimension
    activation='softmax'))                    # Activation function to use

# model = Model(start, end)

sgd = SGD(lr=LEARNING_RATE, momentum=MOMENTUM, nesterov=True)

adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)   # alternative optimizer (not used in the compile below)

model.compile(loss='categorical_crossentropy', optimizer=sgd,
              metrics=['accuracy'])

print("Done compiling.")


Done compiling.
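
The layer arguments above use the Keras 1 API (nb_filter, filter_length, border_mode, pool_length). For readers on Keras 2, a minimal sketch of how the first convolution/pooling pair would be renamed (an assumption about the reader's environment, not the code that produced the results below):

# Keras 2 sketch (not executed here): nb_filter -> filters, filter_length -> kernel_size,
# border_mode -> padding, pool_length -> pool_size.
from keras.layers import Conv1D

model_k2 = Sequential()
model_k2.add(Conv1D(filters=CONVOLUTION_FEATURE, kernel_size=7, padding='valid',
                    activation='relu',
                    input_shape=(MAX_SEQUENCE_LENGTH, vocab_size)))
model_k2.add(MaxPooling1D(pool_size=3))
# ... the remaining Convolution1D / Dense layers follow the same renaming.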

In [10]:
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=EPOCH, batch_size=BATCH_SIZE)


Train on 3500 samples, validate on 1500 samples
Epoch 1/30
3500/3500 [==============================] - 30s - loss: 1.6098 - acc: 0.1957 - val_loss: 1.6096 - val_acc: 0.1973
Epoch 2/30
3500/3500 [==============================] - 30s - loss: 1.6092 - acc: 0.2083 - val_loss: 1.6096 - val_acc: 0.1927
Epoch 3/30
3500/3500 [==============================] - 30s - loss: 1.6096 - acc: 0.1997 - val_loss: 1.6097 - val_acc: 0.1907
Epoch 4/30
3500/3500 [==============================] - 30s - loss: 1.6092 - acc: 0.1986 - val_loss: 1.6097 - val_acc: 0.1920
Epoch 5/30
3500/3500 [==============================] - 30s - loss: 1.6096 - acc: 0.1966 - val_loss: 1.6098 - val_acc: 0.1920
Epoch 6/30
3500/3500 [==============================] - 30s - loss: 1.6092 - acc: 0.1949 - val_loss: 1.6098 - val_acc: 0.1900
Epoch 7/30
3500/3500 [==============================] - 30s - loss: 1.6091 - acc: 0.2114 - val_loss: 1.6099 - val_acc: 0.1893
Epoch 8/30
3500/3500 [==============================] - 30s - loss: 1.6088 - acc: 0.2131 - val_loss: 1.6100 - val_acc: 0.1887
Epoch 9/30
3500/3500 [==============================] - 30s - loss: 1.6087 - acc: 0.2083 - val_loss: 1.6100 - val_acc: 0.1893
Epoch 10/30
3500/3500 [==============================] - 30s - loss: 1.6088 - acc: 0.2103 - val_loss: 1.6100 - val_acc: 0.1907
Epoch 11/30
3500/3500 [==============================] - 30s - loss: 1.6090 - acc: 0.2109 - val_loss: 1.6100 - val_acc: 0.1907
Epoch 12/30
3500/3500 [==============================] - 30s - loss: 1.6092 - acc: 0.2037 - val_loss: 1.6100 - val_acc: 0.1900
Epoch 13/30
3500/3500 [==============================] - 30s - loss: 1.6088 - acc: 0.2097 - val_loss: 1.6100 - val_acc: 0.1887
Epoch 14/30
3500/3500 [==============================] - 30s - loss: 1.6091 - acc: 0.2091 - val_loss: 1.6098 - val_acc: 0.1893
Epoch 15/30
3500/3500 [==============================] - 30s - loss: 1.6089 - acc: 0.2086 - val_loss: 1.6099 - val_acc: 0.1893
Epoch 16/30
3500/3500 [==============================] - 30s - loss: 1.6085 - acc: 0.2151 - val_loss: 1.6098 - val_acc: 0.1927
Epoch 17/30
3500/3500 [==============================] - 30s - loss: 1.6087 - acc: 0.2037 - val_loss: 1.6098 - val_acc: 0.1893
Epoch 18/30
3500/3500 [==============================] - 30s - loss: 1.6085 - acc: 0.2140 - val_loss: 1.6098 - val_acc: 0.1893
Epoch 19/30
3500/3500 [==============================] - 30s - loss: 1.6089 - acc: 0.2140 - val_loss: 1.6097 - val_acc: 0.1893
Epoch 20/30
3500/3500 [==============================] - 30s - loss: 1.6085 - acc: 0.2123 - val_loss: 1.6097 - val_acc: 0.1893
Epoch 21/30
3500/3500 [==============================] - 30s - loss: 1.6086 - acc: 0.2109 - val_loss: 1.6097 - val_acc: 0.1893
Epoch 22/30
3500/3500 [==============================] - 30s - loss: 1.6085 - acc: 0.2114 - val_loss: 1.6099 - val_acc: 0.1953
Epoch 23/30
3500/3500 [==============================] - 30s - loss: 1.6085 - acc: 0.2180 - val_loss: 1.6098 - val_acc: 0.1940
Epoch 24/30
3500/3500 [==============================] - 30s - loss: 1.6082 - acc: 0.2091 - val_loss: 1.6099 - val_acc: 0.1887
Epoch 25/30
3500/3500 [==============================] - 30s - loss: 1.6086 - acc: 0.2066 - val_loss: 1.6098 - val_acc: 0.1893
Epoch 26/30
3500/3500 [==============================] - 30s - loss: 1.6089 - acc: 0.2034 - val_loss: 1.6098 - val_acc: 0.1893
Epoch 27/30
3500/3500 [==============================] - 30s - loss: 1.6084 - acc: 0.2011 - val_loss: 1.6098 - val_acc: 0.1893
Epoch 28/30
3500/3500 [==============================] - 30s - loss: 1.6089 - acc: 0.2086 - val_loss: 1.6096 - val_acc: 0.1893
Epoch 29/30
3500/3500 [==============================] - 30s - loss: 1.6082 - acc: 0.2126 - val_loss: 1.6096 - val_acc: 0.1920
Epoch 30/30
3500/3500 [==============================] - 30s - loss: 1.6081 - acc: 0.2180 - val_loss: 1.6096 - val_acc: 0.1893
Out[10]:
<keras.callbacks.History at 0x7ff66f196550>
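
The fit call above discards its return value. A small sketch that keeps the History object and persists the trained weights (the HDF5 file name is illustrative; note that actually executing this would train for another EPOCH epochs from the current weights):

# Sketch: capture the training history and save the weights for later reuse.
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                    nb_epoch=EPOCH, batch_size=BATCH_SIZE)
print(history.history['val_acc'][-1])         # final validation accuracy
model.save_weights('char_cnn_author_weights.h5')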

In [11]:
texts = []  # list of text samples
labels = []  # list of label ids
import DatabaseQuery
# textToUse = pd.read_csv("suffle_4_6000.csv", names=["author_id", "doc_content"], dtype={'author_id': int})
from sshtunnel import SSHTunnelForwarder
PORT=5432
with SSHTunnelForwarder(('srn01.cs.cityu.edu.hk', 22),
                        ssh_username='stylometry',
                        ssh_password='stylometry',
                        remote_bind_address=('localhost', 5432),
                        local_bind_address=('localhost', 5400)):
    textToUse = DatabaseQuery.getCharDocData(5400, doc_id, chunk_size = chunk_size)
labels = []
texts = []
for index, row in textToUse.iterrows():
    labels.append(authorList.index(row.author_id))
    texts.append(row.doc_content)

print('Found %s texts.' % len(texts))

del textToUse


Execution completed
Read completed
Number of rows: 1
author_id       int64
doc_content    object
dtype: object
Data Frame created: Shape: (396, 2)
Found 396 texts.

In [12]:
# def encode_data(x, MAX_SEQUENCE_LENGTH, vocab, vocab_size, check):
# Same character-level one-hot encoding as in In [6], applied here to the chunks of
# the held-out document: a 1449 x 69 matrix per chunk, stacked into a 3D tensor.

MAX_SEQUENCE_LENGTH = int((100 * chunk_size) / vocab_size)

data = np.zeros((len(texts), MAX_SEQUENCE_LENGTH, vocab_size))
for dix, sent in enumerate(texts):
    counter = 0
    sent_array = np.zeros((MAX_SEQUENCE_LENGTH, vocab_size))
    chars = list(sent.lower().replace(' ', ''))   # lowercase and drop spaces (space is not in the alphabet)
    for c in chars:
        if counter >= MAX_SEQUENCE_LENGTH:
            break                                 # truncate chunks longer than the window
        else:
            char_array = np.zeros(vocab_size, dtype=np.int)
            if c in check:
                ix = vocab[c]
                char_array[ix] = 1
            sent_array[counter, :] = char_array
            counter += 1
    data[dix, :, :] = sent_array

print('Shape of data tensor:', data.shape)

testX = data[:]

print('Shape of test tensor:', testX.shape)


Shape of data tensor: (396, 1449, 69)
Shape of test tensor: (396, 1449, 69)

In [13]:
# Predict per-chunk author probabilities, compute the Shannon entropy of each
# prediction, keep the 90% most confident (lowest-entropy) chunks, and average
# those into a single document-level prediction.
predY = np.array(model.predict(testX, batch_size=128))
predYList = predY[:]
entro = []
import math
for row in predY:
    entroval = 0
    for i in row:
        entroval += (i * (math.log(i, 2)))    # assumes the softmax outputs are strictly positive
    entroval = -1 * entroval
    entro.append(entroval)
yx = zip(entro, predY)
yx = sorted(yx, key=lambda t: t[0])           # sort chunks by ascending entropy (most confident first)
newPredY = [x for y, x in yx]
predYEntroList = newPredY[:int(len(newPredY) * 0.9)]
predY = np.mean(predYEntroList, axis=0)       # average the retained chunk predictions
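
The aggregated vector predY can be turned into a concrete attribution by taking its argmax and mapping it through labels_index (a small sketch):

# Sketch: pick the most probable label and map it back to an author id.
predicted_label = int(np.argmax(predY))
print('Predicted author id: %s (label %s)' % (labels_index[predicted_label], predicted_label))
print('True author id:      %s (label %s)' % (labels_index[labels[0]], labels[0]))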

In [14]:
print(labels_index)


{0: 551, 1: 2703, 2: 2971, 3: 7679, 4: 8303}

In [15]:
print(predY[labels[0]])   # averaged probability the model assigns to the document's true author


0.195403

In [ ]: