In [ ]:
import os
import re
import tarfile
from tqdm import tqdm
import requests
from pugnlp.futil import path_status, find_files
import numpy as np # Keras takes care of most of this but it likes to see Numpy arrays
from keras.preprocessing import sequence # A helper module to handle padding input
from keras.models import Sequential # The base keras Neural Network model
from keras.layers import Dense, Dropout, Activation # The layer objects we will pile into the model
from keras.layers import Conv1D, GlobalMaxPooling1D
In [2]:
# From the nlpia package for downloading data too big for the repo
BIG_URLS = {
    'w2v': (
        'https://www.dropbox.com/s/965dir4dje0hfi4/GoogleNews-vectors-negative300.bin.gz?dl=1',
        1647046227,
    ),
    'slang': (
        'https://www.dropbox.com/s/43c22018fbfzypd/slang.csv.gz?dl=1',
        117633024,
    ),
    'tweets': (
        'https://www.dropbox.com/s/5gpb43c494mc8p0/tweets.csv.gz?dl=1',
        311725313,
    ),
    'lsa_tweets': (
        'https://www.dropbox.com/s/rpjt0d060t4n1mr/lsa_tweets_5589798_2003588x200.tar.gz?dl=1',
        3112841563,  # 3112841312,
    ),
    'imdb': (
        'https://www.dropbox.com/s/yviic64qv84x73j/aclImdb_v1.tar.gz?dl=1',
        3112841563,  # 3112841312,
    ),
}
In [3]:
# These functions are part of the nlpia package which can be pip installed and run from there.
def dropbox_basename(url):
    filename = os.path.basename(url)
    match = re.findall(r'\?dl=[0-9]$', filename)
    if match:
        return filename[:-len(match[0])]
    return filename
def no_tqdm(it, total=1, **kwargs):
    """No-op stand-in for tqdm (also from nlpia), used when verbose=False"""
    return it

def download_file(url, data_path='.', filename=None, size=None, chunk_size=4096, verbose=True):
    """Uses stream=True and a reasonable chunk size to be able to download large (GB) files over https"""
    if filename is None:
        filename = dropbox_basename(url)
    file_path = os.path.join(data_path, filename)
    if url.endswith('?dl=0'):
        url = url[:-1] + '1'  # noninteractive download
    if verbose:
        tqdm_prog = tqdm
        print('requesting URL: {}'.format(url))
    else:
        tqdm_prog = no_tqdm
    r = requests.get(url, stream=True, allow_redirects=True)
    size = r.headers.get('Content-Length', None) if size is None else size
    size = int(size) if size is not None else None  # Content-Length arrives as a string
    print('remote size: {}'.format(size))
    stat = path_status(file_path)
    print('local size: {}'.format(stat.get('size', None)))
    if stat['type'] == 'file' and stat['size'] == size:  # TODO: check md5 or get the right size of remote file
        r.close()
        return file_path
    print('Downloading to {}'.format(file_path))
    with open(file_path, 'wb') as f:
        for chunk in tqdm_prog(r.iter_content(chunk_size=chunk_size), total=(size or 0) // chunk_size):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
    r.close()
    return file_path
def untar(fname):
    if fname.endswith("tar.gz"):
        with tarfile.open(fname) as tf:
            tf.extractall()
    else:
        print("Not a tar.gz file: {}".format(fname))
In [ ]:
download_file(BIG_URLS['w2v'][0])
In [ ]:
untar(download_file(BIG_URLS['imdb'][0]))
In [ ]:
import glob
import os
from random import shuffle
def pre_process_data(filepath):
    """
    This is dependent on your training data source but we will try to generalize it as best as possible.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((pos_label, f.read()))
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))
    shuffle(dataset)
    return dataset
dataset = pre_process_data('./aclImdb/train')
print(dataset[0])
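A quick sanity check (not part of the original listing) confirms the data loaded and shuffled as expected; the counts below assume the standard aclImdb train split of 25,000 labeled reviews, half positive and half negative.
In [ ]:
# Sanity check: total review count and the fraction of positive labels (should be ~0.5)
print(len(dataset))
print(sum(label for label, text in dataset) / len(dataset))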
In [ ]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass  # No matching token in the Google w2v vocab
        vectorized_data.append(sample_vecs)
    return vectorized_data
In [ ]:
def collect_expected(dataset):
    """ Peel off the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected
In [ ]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)
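Before choosing the maxlen value used below, it helps to glance at how long the vectorized reviews actually are; this is a small sketch, not part of the original listing.
In [ ]:
# Distribution of review lengths in tokens, to sanity-check the maxlen=400 choice below
lengths = [len(sample) for sample in vectorized_data]
print(min(lengths), max(lengths), sum(lengths) / len(lengths))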
In [ ]:
split_point = int(len(vectorized_data)*.8)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]
In [ ]:
maxlen = 400
batch_size = 32 # How many samples to show the net before backpropagating the error and updating the weights
embedding_dims = 300 # Length of the token vectors we will create for passing into the Convnet
filters = 250 # Number of filters we will train
kernel_size = 3 # The width of the filters; each filter will be a matrix of weights of size embedding_dims x kernel_size, or 300 x 3 in our case
hidden_dims = 250 # Number of neurons in the plain feed forward net at the end of the chain
epochs = 2 # Number of times we will pass the entire training dataset through the network
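As a worked check of what these hyperparameters imply for the network built below (a sketch; compare against model.summary() after the model is constructed):
In [ ]:
# Rough parameter count for the three trainable layers defined below
conv_params = kernel_size * embedding_dims * filters + filters  # 3*300*250 + 250 = 225,250
dense_params = filters * hidden_dims + hidden_dims              # 250*250 + 250 = 62,750
output_params = hidden_dims * 1 + 1                             # 250*1 + 1 = 251
print(conv_params, dense_params, output_params, conv_params + dense_params + output_params)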
In [ ]:
# Must manually pad/truncate
def pad_trunc(data, maxlen):
    """ For a given dataset pad with zero vectors or truncate to maxlen """
    new_data = []
    # Create a vector of 0's the length of our word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)
    for sample in data:
        if len(sample) > maxlen:
            # Too long: keep only the first maxlen token vectors
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            # Too short: append zero vectors until the sample reaches maxlen
            temp = sample
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
    return new_data
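A tiny synthetic example (not in the original notebook) shows what pad_trunc does: short samples get padded with zero vectors, long ones get truncated, and everything comes back at exactly maxlen.
In [ ]:
# Toy check of pad_trunc with 2-dimensional "word vectors" and maxlen=4
toy = [[[1.0, 1.0], [2.0, 2.0]],   # too short -> padded with zero vectors
       [[1.0, 1.0]] * 6]           # too long  -> truncated
print([len(sample) for sample in pad_trunc(toy, 4)])  # [4, 4]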
In [ ]:
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)
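After padding and reshaping, each input split should be a 3-D tensor of shape (samples, maxlen, embedding_dims); printing the shapes is a cheap way to catch a mismatch before training (optional check, not in the original listing).
In [ ]:
# Expect (N, 400, 300) for the inputs and (N,) for the labels
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)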
In [ ]:
print('Build model...')
model = Sequential()
# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 input_shape=(maxlen, embedding_dims)))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("cnn_weights.h5")
print('Model saved.')
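If you want a single held-out number after training, Keras's evaluate method returns the loss and any metrics passed to compile; this is an optional check, not part of the original listing.
In [ ]:
# Report held-out loss and accuracy for the trained model
loss, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test loss: {:.4f}  Test accuracy: {:.4f}'.format(loss, acc))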
In [ ]:
from keras.models import model_from_json
with open("cnn_model.json", "r") as json_file:
json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights('cnn_weights.h5')
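A model rebuilt from JSON plus weights can call predict immediately, but it has no loss or optimizer attached; if you also want to call evaluate or continue training, recompile it first (optional step, not in the original listing).
In [ ]:
# Recompile the reloaded model before evaluate() or fit(); predict() does not require this
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])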
In [ ]:
sample_1 = "I'm hate that the dismal weather that had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."
In [ ]:
# We pass a dummy value in the first element of the tuple because our helper expects it from the way we processed the initial data. That value never reaches the network, so it can be anything.
vec_list = tokenize_and_vectorize([(1, sample_1)])
# tokenize_and_vectorize returns a list of samples (length 1 here)
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
model.predict(test_vec)
In [ ]:
model.predict_classes(test_vec)
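predict returns the raw sigmoid output, while predict_classes (available on older Keras Sequential models) simply thresholds that probability at 0.5; if your Keras version lacks predict_classes, the same result can be obtained directly.
In [ ]:
# Equivalent to predict_classes for a single sigmoid output unit: threshold at 0.5
(model.predict(test_vec) > 0.5).astype(int)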