In [1]:
# coding: utf-8
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)
import pandas as pd
import theano
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Flatten
from keras.layers import Convolution1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dropout
from keras.optimizers import SGD, Adadelta
from keras.models import Sequential
import sys
import string
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
CONVOLUTION_FEATURE = 256
DENSE_FEATURE = 1024
DROP_OUT = 0.5
LEARNING_RATE = 0.001
MOMENTUM = 0.9
BATCH_SIZE = 128
embed = 100
embedfile = 'glove.6B.100d.txt'
authorList = [551, 2703, 2971, 8303, 7679]
doc_id = 1210
chunk_size = 1000
nb_epoch = 30
EPOCH = nb_epoch
In [2]:
# first, build index mapping words in the embeddings set
# to their embedding vector
# This alphabet has 69 chars vs. the 70 reported in the paper, since the paper's
# list includes the '-' character twice. See https://github.com/zhangxiangxiao/Crepe#issues.
print('Indexing char vectors.')
alphabet = (list(string.ascii_lowercase) + list(string.digits) +
            list(string.punctuation) + ['\n'])
vocab_size = len(alphabet)
check = set(alphabet)
# Map each character to an integer index and back.
vocab = {}
reverse_vocab = {}
for ix, t in enumerate(alphabet):
    vocab[t] = ix
    reverse_vocab[ix] = t
MAX_NB_WORDS = vocab_size
print('Found %s char vectors.' % str(MAX_NB_WORDS))
# second, prepare text samples and their labels
print('Processing text dataset')
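In [ ]:
# Optional sanity check (not part of the original run): confirm the character-to-index
# mapping round-trips through reverse_vocab. Purely illustrative; it only uses the
# vocab and reverse_vocab dicts built above.
for sample_char in ['a', '7', '!', '\n']:
    ix = vocab[sample_char]
    assert reverse_vocab[ix] == sample_char
    print(repr(sample_char), '->', ix)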
In [3]:
texts = [] # list of text samples
labels_index = {} # dictionary mapping label name to numeric id
labels = [] # list of label ids
import DatabaseQuery
# textToUse = pd.read_csv("suffle_4_6000.csv", names=["author_id", "doc_content"], dtype={'author_id': int})
from sshtunnel import SSHTunnelForwarder
PORT = 5432
with SSHTunnelForwarder(('srn01.cs.cityu.edu.hk', 22),
                        ssh_username='stylometry',
                        ssh_password='stylometry',
                        remote_bind_address=('localhost', 5432),
                        local_bind_address=('localhost', 5400)):
    textToUse = DatabaseQuery.getCharAuthData(5400, authors=authorList, doc=doc_id,
                                              vocab_size=vocab_size, chunk_size=chunk_size)
In [4]:
labels = []
texts = []
size = []
authorList = textToUse.author_id.unique()
for auth in authorList:
    current = textToUse.loc[textToUse['author_id'] == auth]
    size.append(current.shape[0])
print("Mean: %s" % (sum(size) / len(size)))
print("Min: %s" % (min(size)))
print("Max: %s" % (max(size)))
authorList = authorList.tolist()
In [5]:
labels = []
texts = []
maxRows = 1000
for auth in authorList:
    current = textToUse.loc[textToUse['author_id'] == auth]
    current = current.sample(n=maxRows)
    textlist = current.doc_content.tolist()
    texts = texts + textlist
    labels = labels + [authorList.index(author_id) for author_id in current.author_id.tolist()]
labels_index = {}
for i, auth in enumerate(authorList):
    labels_index[i] = auth
del textToUse
print('Found %s texts.' % len(texts))
print('Found %s labels.' % len(labels))
In [6]:
# def encode_data(x, MAX_SEQUENCE_LENGTH, vocab, vocab_size, check):
# Iterate over the loaded data and create a matrix of size MAX_SEQUENCE_LENGTH x vocab_size
# for each sample (1449 x 69 with the settings above, since
# MAX_SEQUENCE_LENGTH = int(100 * chunk_size / vocab_size)). The samples are stacked into a
# 3D tensor of shape data_samples x MAX_SEQUENCE_LENGTH x vocab_size. Each character is
# encoded as a one-hot vector; chars not in the vocab become all-zero vectors.
MAX_SEQUENCE_LENGTH = int((100 * chunk_size) / vocab_size)
data = np.zeros((len(texts), MAX_SEQUENCE_LENGTH, vocab_size))
for dix, sent in enumerate(texts):
    counter = 0
    sent_array = np.zeros((MAX_SEQUENCE_LENGTH, vocab_size))
    chars = list(sent.lower().replace(' ', ''))
    for c in chars:
        if counter >= MAX_SEQUENCE_LENGTH:
            break                          # truncate chunks longer than MAX_SEQUENCE_LENGTH
        char_array = np.zeros(vocab_size, dtype=np.int)
        if c in check:
            ix = vocab[c]
            char_array[ix] = 1
        sent_array[counter, :] = char_array
        counter += 1
    data[dix, :, :] = sent_array
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
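In [ ]:
# Optional sanity check (not part of the original run): decode the first encoded sample
# back to characters via reverse_vocab and print it next to the start of the raw text
# (lowercased, spaces removed) for a visual comparison. Uses only objects defined above.
row = data[0]
decoded = ''.join(reverse_vocab[int(np.argmax(step))] for step in row if step.any())
print(decoded[:80])
print(texts[0].lower().replace(' ', '')[:80])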
In [7]:
# split the data into a training set and a validation set
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(data, labels, test_size=0.3, random_state=42)
print('Training model.')
del data, labels
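In [ ]:
# Optional sanity check (not part of the original run): count samples per class in each
# split to confirm the random split kept the five authors roughly balanced.
print('Train samples per class:', y_train.sum(axis=0))
print('Val samples per class:  ', y_val.sum(axis=0))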
In [8]:
print(y_train[4])
print(len(x_train[4]))
print(type(x_train[4]))
In [9]:
model = Sequential()
# model.add(Embedding(                     # Layer 0, Start
#     input_dim=nb_words + 1,              # Size of the dictionary, has to be vocabulary + 1
#     output_dim=EMBEDDING_DIM,            # Dimensions to generate
#     weights=[embedding_matrix],          # Initialize word weights
#     input_length=MAX_SEQUENCE_LENGTH))   # Length of input sequences, required in the first layer
model.add(Convolution1D(                   # Layer 1, Features: 256, Kernel Size: 7
    nb_filter=CONVOLUTION_FEATURE,         # Number of filters (kernels) to learn
    filter_length=7,                       # Size of each kernel
    border_mode='valid',                   # 'valid' padding, so each convolution shrinks the length dimension
    activation='relu',                     # Activation function
    input_shape=(MAX_SEQUENCE_LENGTH, vocab_size)))
model.add(MaxPooling1D(                    # Layer 1a, Max Pooling: 3
    pool_length=3))                        # Size of the pooling window
model.add(Convolution1D(                   # Layer 2, Features: 256, Kernel Size: 7
    nb_filter=CONVOLUTION_FEATURE,
    filter_length=7,
    border_mode='valid',
    activation='relu'))
model.add(MaxPooling1D(                    # Layer 2a, Max Pooling: 3
    pool_length=3))
model.add(Convolution1D(                   # Layer 3, Features: 256, Kernel Size: 3
    nb_filter=CONVOLUTION_FEATURE,
    filter_length=3,
    border_mode='valid',
    activation='relu'))
model.add(Convolution1D(                   # Layer 4, Features: 256, Kernel Size: 3
    nb_filter=CONVOLUTION_FEATURE,
    filter_length=3,
    border_mode='valid',
    activation='relu'))
model.add(Convolution1D(                   # Layer 5, Features: 256, Kernel Size: 3
    nb_filter=CONVOLUTION_FEATURE,
    filter_length=3,
    border_mode='valid',
    activation='relu'))
model.add(Convolution1D(                   # Layer 6, Features: 256, Kernel Size: 5
    nb_filter=CONVOLUTION_FEATURE,
    filter_length=5,                       # Note: the original Crepe architecture uses 3 here
    border_mode='valid',
    activation='relu'))
model.add(MaxPooling1D(                    # Layer 6a, Max Pooling: 3
    pool_length=3))
model.add(Flatten())                       # Layer 7
model.add(Dense(                           # Layer 7a, Output Size: 1024
    output_dim=DENSE_FEATURE,
    activation='relu'))
model.add(Dropout(DROP_OUT))
model.add(Dense(                           # Layer 8, Output Size: 1024
    output_dim=DENSE_FEATURE,
    activation='relu'))
model.add(Dropout(DROP_OUT))
model.add(Dense(                           # Layer 9, Output Size: number of unique labels, Final
    output_dim=len(labels_index),
    activation='softmax'))
# model = Model(start, end)                # leftover from the functional-API version of this model
sgd = SGD(lr=LEARNING_RATE, momentum=MOMENTUM, nesterov=True)
adadelta = Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)   # alternative optimizer, not used below
model.compile(loss='categorical_crossentropy', optimizer=sgd,
              metrics=['accuracy'])
print("Done compiling.")
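In [ ]:
# Optional (not part of the original run): print a layer-by-layer summary to check that
# the six convolutions and three max-poolings leave a sensible sequence length before Flatten.
model.summary()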
In [10]:
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=EPOCH, batch_size=BATCH_SIZE)
Out[10]:
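In [ ]:
# Optional (not part of the original run): persist the trained weights so the evaluation
# cells below can be re-run without retraining. The filename is an arbitrary placeholder.
model.save_weights('char_cnn_weights.h5', overwrite=True)
# model.load_weights('char_cnn_weights.h5')   # to restore later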
In [11]:
texts = [] # list of text samples
labels = [] # list of label ids
import DatabaseQuery
# textToUse = pd.read_csv("suffle_4_6000.csv", names=["author_id", "doc_content"], dtype={'author_id': int})
from sshtunnel import SSHTunnelForwarder
PORT = 5432
with SSHTunnelForwarder(('srn01.cs.cityu.edu.hk', 22),
                        ssh_username='stylometry',
                        ssh_password='stylometry',
                        remote_bind_address=('localhost', 5432),
                        local_bind_address=('localhost', 5400)):
    textToUse = DatabaseQuery.getCharDocData(5400, doc_id, chunk_size=chunk_size)
labels = []
texts = []
for index, row in textToUse.iterrows():
    labels.append(authorList.index(row.author_id))
    texts.append(row.doc_content)
print('Found %s texts.' % len(texts))
del textToUse
In [12]:
# Encode the held-out document's chunks exactly as the training data above:
# a MAX_SEQUENCE_LENGTH x vocab_size one-hot matrix per chunk, stacked into a 3D tensor
# of shape data_samples x MAX_SEQUENCE_LENGTH x vocab_size. Chars not in the vocab
# become all-zero vectors.
MAX_SEQUENCE_LENGTH = int((100 * chunk_size) / vocab_size)
data = np.zeros((len(texts), MAX_SEQUENCE_LENGTH, vocab_size))
for dix, sent in enumerate(texts):
    counter = 0
    sent_array = np.zeros((MAX_SEQUENCE_LENGTH, vocab_size))
    chars = list(sent.lower().replace(' ', ''))
    for c in chars:
        if counter >= MAX_SEQUENCE_LENGTH:
            break                          # truncate chunks longer than MAX_SEQUENCE_LENGTH
        char_array = np.zeros(vocab_size, dtype=np.int)
        if c in check:
            ix = vocab[c]
            char_array[ix] = 1
        sent_array[counter, :] = char_array
        counter += 1
    data[dix, :, :] = sent_array
print('Shape of data tensor:', data.shape)
testX = data[:]
print('Shape of test tensor:', testX.shape)
In [13]:
# Predict an author distribution for every chunk of the held-out document, rank the
# chunks by the entropy of their predicted distribution, keep the most confident 90%,
# and average those distributions into a single prediction for the document.
import math
predY = np.array(model.predict(testX, batch_size=128))
predYList = predY[:]
entro = []
for row in predY:
    entroval = 0
    for i in row:
        if i > 0:                          # skip zero probabilities; log(0) is undefined
            entroval += (i * (math.log(i, 2)))
    entroval = -1 * entroval
    entro.append(entroval)
yx = zip(entro, predY)
yx = sorted(yx, key=lambda t: t[0])
newPredY = [x for y, x in yx]
predYEntroList = newPredY[:int(len(newPredY) * 0.9)]
predY = np.mean(predYEntroList, axis=0)
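In [ ]:
# Optional (not part of the original run): turn the averaged distribution into a concrete
# attribution by taking the most probable class and mapping it back to an author id via
# labels_index. labels[0] is the true author's index for this document.
pred_index = int(np.argmax(predY))
print('Predicted author id:', labels_index[pred_index])
print('True author id:     ', labels_index[labels[0]])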
In [14]:
print(labels_index)
In [15]:
print(predY[labels[0]])                    # averaged probability assigned to the document's true author
In [ ]: