In [1]:
import numpy as np
from theano.tensor.shared_randomstreams import RandomStreams
from matplotlib import pyplot
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dropout
from keras.models import load_model
a) Load the full dataset
In [2]:
from keras.datasets import imdb
np.random.seed(3)
srng = RandomStreams(8)
(X_train, y_train), (X_test, y_test) = imdb.load_data(seed=15)
b) Number of words in the dataset, and boxplot of the review-length distribution
In [3]:
# Concatenate the training and test sets
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
# Count the distinct words and plot the distribution of review lengths
print("Distinct words in the dataset: ", len(np.unique(np.hstack(X))))
result = [len(x) for x in X]
pyplot.boxplot(result)
pyplot.show()
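The boxplot shows the spread visually; a few summary statistics make the 500-word cap used in the next part easier to justify. A minimal sketch reusing the result list computed above:

# Summary statistics of review lengths (words per review)
lengths = np.array(result)
print("Mean length: ", lengths.mean())
print("Median length: ", np.median(lengths))
print("Max length: ", lengths.max())
print("Reviews longer than 500 words: ", np.sum(lengths > 500))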
c) Load the most frequent words and cap review length
In [4]:
# Load the 3000 most frequent words
top_words = 3000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
# Cap each review at a maximum of 500 words
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
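By default pad_sequences pre-pads short sequences with zeros and pre-truncates long ones, so every row ends up with exactly 500 entries. A toy illustration (the two short sequences are made up, not IMDB data):

# Hypothetical sequences, only to show the padding/truncation behaviour
toy = [[1, 2, 3], [4, 5, 6, 7, 8, 9]]
print(sequence.pad_sequences(toy, maxlen=5))
# [[0 0 1 2 3]
#  [5 6 7 8 9]]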
d) Training an LSTM network with an embedding layer
In [5]:
# Size of the vector produced by the embedding layer
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train the model on a GPU server and save it for later evaluation
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-32.h5')
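As a quick sanity check on the summary above: the embedding layer holds top_words x embedding_vector_length weights, the LSTM holds 4 x (input_dim x units + units^2 + units), and the dense layer holds units + 1. A short sketch of that arithmetic for this configuration:

# Expected parameter counts for the 3000-word, 32-dim, 100-unit model
emb_params = top_words * embedding_vector_length                      # 3000 * 32 = 96000
lstm_params = 4 * (embedding_vector_length * 100 + 100 * 100 + 100)   # 53200
dense_params = 100 * 1 + 1                                            # 101
print("Total parameters: ", emb_params + lstm_params + dense_params)  # 149301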
In [6]:
model = load_model('Pregunta2/LSTM-32.h5')
scores = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: ', scores[1])
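Besides the aggregate accuracy, the loaded model can score individual reviews: predict() returns the sigmoid output, and values above 0.5 are read as positive. A small sketch on the first test review:

# Probability that the first test review is positive
prob = model.predict(X_test[:1])[0][0]
print('Positive probability: ', prob)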
e) Training the LSTM network with different embedding vector sizes; sizes 16 and 64 are tested in addition to the 32 from the previous part
In [7]:
# Embedding vector size 16
embedding_vector_length = 16
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train the model on a GPU server and save it for later evaluation
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-16.h5')
In [8]:
# Embedding vector size 64
embedding_vector_length = 64
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train the model on a GPU server and save it for later evaluation
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-64.h5')
In [9]:
# Load the trained models
model16 = load_model('Pregunta2/LSTM-16.h5')
model32 = load_model('Pregunta2/LSTM-32.h5')
model64 = load_model('Pregunta2/LSTM-64.h5')
# Evaluate the accuracy of each model
scores16 = model16.evaluate(X_test, y_test, verbose=0)
scores32 = model32.evaluate(X_test, y_test, verbose=0)
scores64 = model64.evaluate(X_test, y_test, verbose=0)
print('Accuracy, embedding size 16: ', scores16[1])
print('Accuracy, embedding size 32: ', scores32[1])
print('Accuracy, embedding size 64: ', scores64[1])
print('Loss, embedding size 16: ', scores16[0])
print('Loss, embedding size 32: ', scores32[0])
print('Loss, embedding size 64: ', scores64[0])
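The three training cells above differ only in embedding_vector_length; if the runs were repeated, a single loop keeps them consistent. A minimal sketch under the same Keras 1.x API used in this notebook, saving to the LSTM-&lt;size&gt;.h5 pattern already in use:

for size in (16, 32, 64):
    m = Sequential()
    m.add(Embedding(top_words, size, input_length=500))
    m.add(LSTM(100))
    m.add(Dense(1, activation='sigmoid'))
    m.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    m.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=3, batch_size=64)
    m.save('Pregunta2/LSTM-%d.h5' % size)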
f) Training while varying the size of the selected vocabulary
Vocabularies of the 3000, 5000, and 8000 most frequent words will be tested
In [10]:
# Load the 3000 most frequent words
top_words = 3000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
# Cap each review at a maximum of 500 words
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
# Embedding vector size 32
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train the model on a GPU server and save it for later evaluation
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-words-3000.h5')
In [11]:
# Load the 5000 most frequent words
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
# Cap each review at a maximum of 500 words
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
# Embedding vector size 32
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train the model on a GPU server and save it for later evaluation
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-words-5000.h5')
In [12]:
# Load the 8000 most frequent words
top_words = 8000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
# Cap each review at a maximum of 500 words
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
# Embedding vector size 32
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train the model on a GPU server and save it for later evaluation
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-words-8000.h5')
In [15]:
# Load the trained models
model_3000 = load_model('Pregunta2/LSTM-words-3000.h5')
model_5000 = load_model('Pregunta2/LSTM-words-5000.h5')
model_8000 = load_model('Pregunta2/LSTM-words-8000.h5')
# Evaluate each model on test data rebuilt with its matching vocabulary size
# Reload the data with the 3000 most frequent words
top_words = 3000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
scores3000 = model_3000.evaluate(X_test, y_test, verbose=0)
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
scores5000 = model_5000.evaluate(X_test, y_test, verbose=0)
top_words = 8000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
scores8000 = model_8000.evaluate(X_test, y_test, verbose=0)
print('Accuracy, 3000-word vocabulary: ', scores3000[1])
print('Accuracy, 5000-word vocabulary: ', scores5000[1])
print('Accuracy, 8000-word vocabulary: ', scores8000[1])
print('Loss, 3000-word vocabulary: ', scores3000[0])
print('Loss, 5000-word vocabulary: ', scores5000[0])
print('Loss, 8000-word vocabulary: ', scores8000[0])
g) Training with Dropout
In [16]:
# Reload the data with a 5000-word vocabulary so the word indices match the embedding size
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-dropout.h5')
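Note that besides standalone Dropout layers, the LSTM layer itself accepts dropout on its input and recurrent connections (dropout_W and dropout_U in the Keras 1.x API this notebook uses; Keras 2 renames them dropout and recurrent_dropout). A sketch of the same architecture with dropout moved inside the recurrent layer:

# Alternative: dropout applied inside the LSTM layer (Keras 1.x argument names)
alt = Sequential()
alt.add(Embedding(top_words, embedding_vector_length, input_length=500))
alt.add(LSTM(100, dropout_W=0.2, dropout_U=0.2))
alt.add(Dense(1, activation='sigmoid'))
alt.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])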
In [19]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
model = load_model('Pregunta2/LSTM-dropout.h5')
scores = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: ', scores[1])
print('Loss: ', scores[0])
h) Proposed new model
In [4]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
# Proposed model: 64-dim embedding, dropout 0.2
embedding_vector_length = 64
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-dropout-64.h5')
In [5]:
model = load_model('Pregunta2/LSTM-dropout-64.h5')
scores = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: ', scores[1])
print('Loss: ', scores[0])
In [6]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
# Second variant: 64-dim embedding, dropout reduced to 0.1
embedding_vector_length = 64
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(Dropout(0.1))
model.add(LSTM(100))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-dropout-64-2.h5')
In [8]:
model = load_model('Pregunta2/LSTM-dropout-64-2.h5')
scores = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: ', scores[1])
print('Loss: ', scores[0])
In [3]:
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, seed=15)
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)
# Third variant: 32-dim embedding, dropout 0.1
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=500))
model.add(Dropout(0.1))
model.add(LSTM(100))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, nb_epoch=3, batch_size=64)
model.save('Pregunta2/LSTM-dropout-64-3.h5')
In [4]:
model = load_model('Pregunta2/LSTM-dropout-64-3.h5')
scores = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: ', scores[1])
print('Loss: ', scores[0])