In [1]:
# Header
from __future__ import print_function
import numpy as np
import tensorflow as tf
print('TensorFlow version: ', tf.__version__)
import time
# Show images
import matplotlib.pyplot as plt
%matplotlib inline
# plt configuration
plt.rcParams['figure.figsize'] = (10, 10)        # size of images
plt.rcParams['image.interpolation'] = 'nearest'  # show exact image
#plt.rcParams['image.cmap'] = 'gray'             # use grayscale
# GPU devices visible to Python
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
data_path = '/home/ubuntu/data/training/text/sentiment/'
In [2]:
# Import train and test data
X_train = np.load(data_path + 'aclImdb/X_train.npy')
y_train = np.load(data_path + 'aclImdb/y_train.npy')
X_test = np.load(data_path + 'aclImdb/X_test.npy')
y_test = np.load(data_path + 'aclImdb/y_test.npy')
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
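In [ ]:
# Optional peek at the raw data: each row of X_train is expected to be a
# sequence of integer word ids (mapped via the worddict loaded below), and
# y holds the 0/1 sentiment labels.
print(X_train[0][:10])
print('Label values:', np.unique(y_train))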
In [3]:
#Load embeddings
import pandas as pd
import csv
import pickle
# Load worddict
with open(data_path + 'worddict.pickle', 'rb') as pfile:
    worddict = pickle.load(pfile)
embed_dim = 300
df_glove = pd.read_csv(data_path + "glove.6B." + str(embed_dim) + "d.txt", index_col=0, sep=' ',
                       header=None, quoting=csv.QUOTE_NONE, encoding='utf-8')
# Merge with the dictionary of the current texts: inner join, keeping only words that appear both in the corpus and in GloVe.
df_glove = df_glove.merge(pd.DataFrame.from_dict(worddict, orient='index'), left_index=True, right_index=True)
print('Merged words: ', df_glove.shape[0])
# Create dictionary: word_number_id --> GloVe vector
glove = {}
for i, r in df_glove.iterrows():
    glove[int(r[0])] = [r[j] for j in range(1, embed_dim + 1)]
print('Dictionary length: ', len(glove))
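In [ ]:
# Optional sanity check: every value in the glove dict should be a vector
# of length embed_dim (300 here).
some_id = next(iter(glove))
print('Sample word id:', some_id, '- vector length:', len(glove[some_id]))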
In [4]:
# Create embeddings 3D tensors: one (max_len, embed_dim) matrix per review
max_len = 100

def embedd(x):
    """Map a sequence of word ids to a (max_len, embed_dim) matrix.
    Words without a GloVe vector are skipped; the remaining vectors are
    right-aligned, with zero padding on the left."""
    r = np.zeros((max_len, embed_dim))
    pos = max_len - 1  # fill the matrix backwards, starting from the last word
    for i in range(len(x), 0, -1):
        try:
            v = np.array(glove[x[i-1]])
        except KeyError:
            continue  # word id not in the GloVe dictionary
        if pos >= 0:
            r[pos, :] = v
            pos -= 1
    return r
X_train = np.array([embedd(s) for s in X_train], dtype=np.float32)
print('Train shape:', X_train.shape)
X_test = np.array([embedd(s) for s in X_test], dtype=np.float32)
print('Test shape:', X_test.shape)
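In [ ]:
# Optional check of the padding behaviour on a toy sequence: known word ids
# are right-aligned and the leading rows stay zero. Assumes -1 is not a
# valid word id, so embedd skips it.
some_id = next(iter(glove))
toy = embedd([some_id, -1, some_id])
print('Non-zero rows:', np.nonzero(toy.any(axis=1))[0])  # expect the last two rows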
In [5]:
# Save data in HDF5 to use with a batch generator
import h5py
with h5py.File(data_path + 'sentiment_glove_data.h5', 'w') as hdf5_f:
    hdf5_f.create_dataset('X_train', data=X_train)
    hdf5_f.create_dataset('y_train', data=y_train)
    hdf5_f.create_dataset('X_test',  data=X_test)
    hdf5_f.create_dataset('y_test',  data=y_test)
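In [ ]:
# Sketch of a batch generator over the saved HDF5 file. The training below
# feeds the in-memory arrays directly; this shows how the file could be
# consumed instead, reading one batch at a time from disk.
def hdf5_batch_generator(path, dataset='train', batch_size=128):
    f = h5py.File(path, 'r')
    X, y = f['X_' + dataset], f['y_' + dataset]
    while True:  # loop forever, one epoch per outer pass
        for start in range(0, X.shape[0], batch_size):
            yield X[start:start + batch_size], y[start:start + batch_size]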
In [6]:
# Model
num_hidden_rnn = 128  # Number of units in each recurrent layer
from tensorflow.contrib.keras import layers, models
print('Build model 1 - Basic model...')
# LAYER 1: inputs - sequences of precomputed GloVe vectors
seq_prev_input = layers.Input(shape=(max_len, embed_dim), dtype='float32')
# LAYER 2: embeddings - not needed here, the inputs are already GloVe vectors
#embeds = layers.Embedding(max_features, dim_embedings, input_length=max_len)(seq_prev_input)
# LAYER 3: RNN - two stacked forward LSTMs with dropout
forward = layers.LSTM(num_hidden_rnn, return_sequences=True,
                      dropout=0.3, recurrent_dropout=0.3, name='Forward1')(seq_prev_input)
rnn_out = layers.LSTM(num_hidden_rnn, return_sequences=False,
                      dropout=0.3, recurrent_dropout=0.3, name='Forward2')(forward)
# LAYER 4: dense output layer with softmax activation
output = layers.Dense(2, activation='softmax')(rnn_out)
# Model architecture defined
model_1 = models.Model(inputs=seq_prev_input, outputs=output)
model_1.summary()
# Compile model and select optimizer
model_1.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
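In [ ]:
# Quick check: sparse_categorical_crossentropy expects integer class ids
# (0/1 here), not one-hot encoded targets.
print('y_train dtype:', y_train.dtype, '- classes:', np.unique(y_train))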
In [7]:
#Plot the model graph
from tensorflow.contrib.keras import utils
# Create model image
utils.plot_model(model_1, '/tmp/model1.png')
# Show image
plt.imshow(plt.imread('/tmp/model1.png'))
Out[7]:
In [8]:
# Train
batch_size = 128
print("Train...")
history = model_1.fit(X_train, y_train, batch_size=batch_size, epochs=20,
                      validation_data=(X_test, y_test))
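In [ ]:
# Optional variant of the fit above with early stopping, in case 20 epochs
# overfit; EarlyStopping comes from the same tensorflow.contrib.keras API.
from tensorflow.contrib.keras import callbacks
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=2)
# model_1.fit(X_train, y_train, batch_size=batch_size, epochs=20,
#             validation_data=(X_test, y_test), callbacks=[early_stop])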
In [9]:
# Plot train and validation accuracy per epoch
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='validation')
plt.legend()
plt.show()
In [10]:
# Score and obtain probabilities
pred_test = model_1.predict(X_test)
print(pred_test.shape)
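In [ ]:
# The two output columns are softmax probabilities, so each row sums to 1;
# column 1 (positive class) is the score used for the metrics below.
print(pred_test[:3])
print('Row sums:', pred_test[:3].sum(axis=1))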
In [11]:
# Import metrics
from sklearn.metrics import roc_curve, auc, accuracy_score
# Accuracy with sklearn, thresholding the positive-class probability at 0.5
print('Accuracy: ', accuracy_score(y_test, [1 if p > 0.5 else 0 for p in pred_test[:, 1]]))
# Calculate ROC curve
fpr, tpr, _ = roc_curve(y_test, pred_test[:, 1])
print('AUC: ', auc(fpr, tpr))
# Plot ROC curve
plt.plot(fpr, tpr)
Out[11]:
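In [ ]:
# Optional: confusion matrix for the same 0.5-threshold predictions.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, (pred_test[:, 1] > 0.5).astype(int)))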