In [1]:
# Based on
# https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/6.1-using-word-embeddings.ipynb
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
In [2]:
import warnings
warnings.filterwarnings('ignore')
In [3]:
%matplotlib inline
%pylab inline
In [4]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)
In [5]:
import os
imdb_dir = 'C:/Users/olive/Development/data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding='UTF-8')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
In [6]:
len(texts)
Out[6]:
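As a quick sanity check it can help to peek at one raw review together with its label; an illustrative cell (at this point the data is still ordered, all negative reviews first, as noted below before shuffling):
In [ ]:
# Illustrative sanity check: first review and its label.
# The lists were filled in order ('neg' first), so labels[0] should be 0.
print(labels[0], texts[0][:200])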
In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
maxlen = 500  # We will cut reviews after 500 words
training_samples = 15000  # We will be training on 15,000 samples
validation_samples = 10000  # We will be validating on 10,000 samples
max_words = 10000  # We will only consider the top 10,000 words in the dataset
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# But first, shuffle the data, since we started from data
# where samples are ordered (all negative first, then all positive).
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
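To verify the tokenization, a padded sequence can be mapped back to words via the reverse of word_index; a small sketch (Keras' Tokenizer reserves index 0 for padding, so it has no dictionary entry):
In [ ]:
# Illustrative: decode the first padded sequence back into words.
reverse_word_index = {i: w for w, i in word_index.items()}
decoded = ' '.join(reverse_word_index.get(i, '?') for i in data[0] if i != 0)
print(decoded[:200])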
In [8]:
from sklearn.model_selection import train_test_split
In [9]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42, stratify=labels)
In [10]:
x_train.shape
Out[10]:
In [11]:
glove_dir = 'C:/Users/olive/Development/data/glove.6B'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='UTF-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))
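As a plausibility check on the loaded vectors, cosine similarity between related words should come out higher than between opposed ones; a sketch assuming the common words below are present in glove.6B.100d (they are for frequent English words):
In [ ]:
# Illustrative: related words should have higher cosine similarity.
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

print(cosine(embeddings_index['good'], embeddings_index['great']))
print(cosine(embeddings_index['good'], embeddings_index['terrible']))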
In [12]:
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in the embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
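It is worth knowing how many of the top max_words tokens actually received a pretrained vector, since rows left at zero are effectively out-of-vocabulary; a small sketch using the variables above:
In [ ]:
# Illustrative: GloVe coverage of the top max_words tokens.
# Row 0 is the reserved padding index and never receives a vector.
covered = np.count_nonzero(np.any(embedding_matrix != 0, axis=1))
print('%d of %d rows have a pretrained vector' % (covered, max_words))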
In [13]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
In [14]:
model.layers[0].set_weights([embedding_matrix])  # load the pretrained GloVe weights
model.layers[0].trainable = False  # freeze the embedding layer before compiling
In [15]:
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['acc'])
model.summary()
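The parameter counts in the summary can be verified by hand: the Embedding layer holds max_words × embedding_dim = 10,000 × 100 = 1,000,000 weights (frozen here), Flatten produces maxlen × embedding_dim = 50,000 features, so Dense(32) has 50,000 × 32 + 32 = 1,600,032 parameters, and Dense(1) adds 32 + 1 = 33.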
In [18]:
batch_size=1000
model.fit(x_train, y_train,
epochs=10,
batch_size=batch_size,
validation_split=0.2)
Out[18]:
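model.fit returns a History object; if the run above is captured, e.g. history = model.fit(...), the curves can be plotted to judge over- or underfitting. A sketch assuming that assignment and the 'acc' metric name compiled above:
In [ ]:
# Illustrative: plot training curves, assuming history = model.fit(...) above.
import matplotlib.pyplot as plt
plt.plot(history.history['acc'], label='train acc')
plt.plot(history.history['val_acc'], label='val acc')
plt.xlabel('epoch')
plt.legend()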
In [16]:
model.layers[0].trainable = True  # unfreeze the embeddings; takes effect only after recompiling
In [17]:
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['acc'])
model.summary()
In [19]:
batch_size=1000
model.fit(x_train, y_train,
epochs=20,
batch_size=batch_size,
validation_split=0.2)
Out[19]:
In [20]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['acc'])
model.summary()
In [21]:
batch_size=1000
model.fit(x_train, y_train,
epochs=10,
batch_size=batch_size,
validation_split=0.2)
Out[21]:
In [23]:
train_loss, train_accuracy = model.evaluate(x_train, y_train, batch_size=batch_size)
train_accuracy
Out[23]:
In [24]:
test_loss, test_accuracy = model.evaluate(x_test, y_test, batch_size=batch_size)
test_accuracy
Out[24]:
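Finally, any new text must go through the same tokenizer and padding before prediction; a hedged sketch scoring a made-up review:
In [ ]:
# Illustrative: score an unseen review (the text is made up for demonstration).
sample = ["this movie was a wonderful surprise, great acting and a moving story"]
seq = pad_sequences(tokenizer.texts_to_sequences(sample), maxlen=maxlen)
print(model.predict(seq))  # output near 1.0 means 'positive'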
In [ ]: