Model template

- Preprocess data.
- Prepare sequences for the model.
- Build model.
- Validate it.

Preprocess data


In [ ]:
# Header
from __future__ import print_function

import numpy as np
import tensorflow as tf
print('TensorFlow version:', tf.__version__)
import time

# Show images
import matplotlib.pyplot as plt
%matplotlib inline
# plt configuration
plt.rcParams['figure.figsize'] = (10, 10)        # size of images
plt.rcParams['image.interpolation'] = 'nearest'  # show exact image
#plt.rcParams['image.cmap'] = 'gray'  # use grayscale 


# GPU devices visible to Python
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [ ]:
# Import train and test data
data_path='/home/ubuntu/data/training/text/sentiment/aclImdb/'

X_train = np.load(data_path + 'X_train.npy')
y_train = np.load(data_path + 'y_train.npy')
X_test  = np.load(data_path + 'X_test.npy')
y_test  = np.load(data_path + 'y_test.npy')

print(X_train.shape, y_train.shape)
print(X_test.shape,  y_test.shape)
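
A quick sanity check on the labels (optional; this assumes the preprocessing produced binary 0/1 labels):

In [ ]:
# Labels should be binary: 0 = negative review, 1 = positive review
print('Train labels:', np.unique(y_train))
print('Test labels: ', np.unique(y_test))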

Prepare sequences for the model


In [ ]:
max_features = 20000  # Keep only the 20,000 most frequent words; recode the less frequent ones to 0
max_len = 100  # Truncate texts after this number of words (among the top max_features words)


# Keep word ids below max_features; recode the others to 0
def remove_features(x):
    return [[0 if w >= max_features else w for w in sen] for sen in x]

X_train = remove_features(X_train)
X_test  = remove_features(X_test)


# Truncate or pad the sentences to length = max_len
from tensorflow.contrib.keras import preprocessing

print("Pad sequences (samples x time)")
X_train = preprocessing.sequence.pad_sequences(X_train, maxlen=max_len)
X_test = preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
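
As a quick illustration of how pad_sequences behaves (toy input, not the IMDB data): with the default settings, sequences shorter than maxlen are left-padded with 0, and longer ones are truncated from the front.

In [ ]:
# Toy illustration of pad_sequences (hypothetical input, not the real data)
demo = preprocessing.sequence.pad_sequences([[1, 2, 3], [1, 2, 3, 4, 5, 6]], maxlen=4)
print(demo)  # -> [[0 1 2 3] [3 4 5 6]]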

Build model


In [ ]:
# Model
dim_embedings = 128   # Dimension of the embedding vectors
num_hidden_rnn = 128  # Number of units in the recurrent network

from tensorflow.contrib.keras import layers, models, optimizers

print('Build model 1 - Basic model...')

# LAYER 1: inputs
seq_prev_input = layers.Input(shape=(max_len, ), dtype='int32') 

# LAYER 2: Create embeddings
embeds = layers.Embedding(max_features, dim_embedings, input_length=max_len)(seq_prev_input)

# LAYER 3: Model layers (this block must define `rnn_out`)
#------------------------
# PUT YOUR MODEL HERE!!!
#------------------------
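# Example completion (a minimal sketch, one possible choice among many):
# a single LSTM over the embeddings, keeping only its final hidden state.
rnn_out = layers.LSTM(num_hidden_rnn)(embeds)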


# LAYER 4: Dense layer to outputs - softmax activation
output = layers.Dense(2, activation='softmax')(rnn_out)

# Model Architecture defined
model_1 = models.Model(inputs=seq_prev_input, outputs=output)
model_1.summary()

# Compile model and select optimizer
rms_optimizer = optimizers.RMSprop(lr=0.001)
model_1.compile(loss='sparse_categorical_crossentropy', optimizer=rms_optimizer, metrics=['accuracy'])

In [ ]:
#Plot the model graph
from tensorflow.contrib.keras import utils

# Create model image
utils.plot_model(model_1, '/tmp/model1.png')

# Show image
plt.imshow(plt.imread('/tmp/model1.png'))

In [ ]:
# Train
batch_size = 128

print("Train...")
history = model_1.fit(X_train, y_train, batch_size=batch_size, epochs=10,
                      validation_data=(X_test, y_test))

In [ ]:
# Plot train and validation accuracy over the epochs
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

Validate it


In [ ]:
# Score the test set and obtain class probabilities
pred_test = model_1.predict(X_test)
print(pred_test.shape)

In [ ]:
# Import metrics
from sklearn.metrics import roc_curve, auc, accuracy_score

# Accuracy at a 0.5 threshold on the positive-class probability
print(accuracy_score(y_test, (pred_test[:, 1] > 0.5).astype(int)))

# Calculate ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, pred_test[:, 1])
print('AUC:', auc(fpr, tpr))

# Plot ROC curve (false positive rate vs. true positive rate)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')

In [ ]:
# Score new text

# Load the word -> id dictionary built during preprocessing
import pickle
with open(data_path + 'worddict.pickle', 'rb') as pfile:
    worddict = pickle.load(pfile)

    
def tokenize(sentences):
    # Tokenize each sentence into a list of words with NLTK
    from nltk import word_tokenize
    tokens = []
    for sentence in sentences:
        tokens += [word_tokenize(sentence)]
    return tokens


def generate_sequence(sentences, dictionary):
    # Map each token to its id; unknown words are mapped to 1
    seqs = [None] * len(sentences)
    for idx, ss in enumerate(sentences):
        seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in ss]
    return seqs


def remove_features(x):
    # Same recoding as in the preprocessing step above
    return [[0 if w >= max_features else w for w in sen] for sen in x]


def score_new_text(text):
    # Preprocess a raw text and return the positive-class probability
    seq = generate_sequence(tokenize([text]), worddict)
    seq = remove_features(seq)
    seq = preprocessing.sequence.pad_sequences(seq, maxlen=max_len)
    pred_test = model_1.predict(seq, batch_size=1)
    return float(pred_test[:,1])

In [ ]:
# Evaluate one negative review

text = "You have to start worrying when you see that Michael Madsen is leading the Cast of any movie. I wont go through the list of shame that is his movie career.<br /><br />I watched 45 minutes and still was not sure what really was going on. The movie consisted of a love hate relationship between Madsen and Argento, Which basically was Madsen insulting her, threatening violence and generally treating her like dirt. She on the other hand loves him, then shes doesn't, then she does, the she desires him, then she loves him again......whats wrong with you woman !!!! <br /><br />The Script is awful, lousy soundtrack and pointless aggressive and crude sexuality which i believe was added to entice some viewers as the movie has little else to offer. I would have given the movie a 1 but it just about managed a 2 with a little excitement in the last 20 minutes. It did actually answer one question in the final few minutes but i am not going to share that, i will make you suffer for the full movie like i did."
print(text)
print('Positive score:', score_new_text(text))

In [ ]:
# Evaluate one positive review

text = "The distribution was good, the subject could have been interessant and comic. whereas, he described the wandering of an old non credible communist looking for loving sensations. Instead of this, the atmosphere is nor lively nor heavy."
print(text)
print('Positive score:', score_new_text(text))
