In [1]:
import tensorflow as tf
import numpy as np
import random, json, string, pickle
import keras
import keras.layers
from keras.layers.core import Flatten, Dense, Dropout
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
import keras.models
from keras.models import Sequential
import keras.optimizers
import keras.callbacks
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
import sqlite3
%matplotlib inline
import keras.applications.vgg16 as vgg16
Bokeh is used later to make an attractive plot of training time per epoch, along with training and validation loss.
In [2]:
from bokeh.charts import Line
from bokeh.plotting import figure, show
from bokeh.models import Range1d
from bokeh.io import output_notebook
import bokeh
output_notebook()
import time

class LossTimeHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.losses = []
        self.durations = []
        self.val_loss = []

    def on_epoch_begin(self, epoch, logs={}):
        self.start_time = time.time()

    def on_epoch_end(self, epoch, logs={}):
        self.losses.append(logs.get('loss'))
        self.val_loss.append(logs.get('val_loss'))
        self.durations.append(time.time() - self.start_time)
The "Pantry" is what we call the ingredients we'd ultimately like to predict on.
In [3]:
size_pantry = 500
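To make the label format concrete, here's a toy sketch with a made-up three-item pantry (not the real top-500): each recipe becomes a multi-hot vector with one slot per pantry ingredient.
In [ ]:
# Hypothetical three-ingredient pantry, just to illustrate the encoding
toy_pantry = ['salt', 'butter', 'flour']
toy_food2id = {food: i for i, food in enumerate(toy_pantry)}
toy_recipe = ['flour', 'salt']
toy_label = [0] * len(toy_pantry)
for ing in toy_recipe:
    toy_label[toy_food2id[ing]] = 1
print(toy_label)  # [1, 0, 1] -- one sigmoid output per pantry slot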
In [4]:
# get the pretrained VGG16 model (minus the fully connected layers)
original_model = vgg16.VGG16(weights='imagenet', include_top=False,
                             input_tensor=keras.layers.Input(shape=(224, 224, 3)))
# bolt a multi-label head onto the flattened convolutional features
deep_features = Flatten()(original_model.output)
ingredients_guessing = Dense(size_pantry, activation='sigmoid')(deep_features)
m = keras.models.Model(inputs=original_model.input, outputs=ingredients_guessing)
print(m.summary())
#optimizer = keras.optimizers.SGD(lr=0.0001, momentum=0.9, decay=0, nesterov=True)
optimizer = keras.optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-08, decay=0.0)
m.compile(optimizer, loss='binary_crossentropy')
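Sigmoid outputs with binary cross-entropy treat each pantry slot as an independent "is this ingredient present?" question, which is what multi-label prediction needs (softmax would force the 500 outputs to compete). A quick hand computation of the loss on made-up numbers:
In [ ]:
# Binary cross-entropy on a toy 3-slot target, averaged over slots
y_true = np.array([1.0, 0.0, 1.0])
y_pred = np.array([0.9, 0.2, 0.6])
bce = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
print(bce)  # ~0.28, matching Keras' binary_crossentropy up to clipping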
Creating the Pantry
In [ ]:
conn = sqlite3.connect("yummly.db")
recipes = conn.execute("SELECT ID, CleanIngredients FROM Recipe WHERE ImageAvailable=1 AND English=1;").fetchall()
conn.close()
In [ ]:
def prepIngredients(ingredients_string):
    return [x.strip() for x in ingredients_string.split(";")]

recipes = [(x[0], prepIngredients(x[1])) for x in recipes]
In [ ]:
# rank every ingredient by how often it appears, then keep the top size_pantry
all_ingredients = np.array([item for sublist in recipes for item in sublist[1]])
unique_ing = np.unique(all_ingredients, return_counts=True)
argsort_results = np.argsort(unique_ing[1])
sorted_ing = unique_ing[0][argsort_results][::-1]
id2food = sorted_ing[0:size_pantry]
food2id = {food: idx[0] for idx, food in np.ndenumerate(id2food)}
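A quick consistency check on the two lookup tables (a sketch; assumes the cell above has run): id2food maps frequency rank to ingredient, and food2id inverts it.
In [ ]:
# the k-th most common ingredient should map back to index k
assert all(food2id[id2food[k]] == k for k in range(size_pantry))
print(id2food[0], '->', food2id[id2food[0]])  # most common ingredient -> 0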
In [ ]:
# free up some memory
all_ingredients = None
argsort_results = None
In [ ]:
ids = []
recipe_ingredients = [x[1] for x in recipes]
labels = []
class_frequency = [0] * size_pantry
# Build the training data, skipping any recipe that uses an
# ingredient outside the pantry.
for i in range(0, len(recipes)):
    num_misses = 0
    current_recipe = [0] * size_pantry
    for j in range(0, len(recipe_ingredients[i])):
        food_id = food2id.get(recipe_ingredients[i][j])
        if food_id is not None:
            current_recipe[food_id] = 1
            class_frequency[food_id] += 1
        else:
            num_misses += 1
    if num_misses == 0 and sum(current_recipe) > 0:
        labels.append(current_recipe)
        ids.append(recipes[i][0])
In [ ]:
recipe_ingredients = None
recipes = None
In [ ]:
# inverse-frequency class weights, so rare ingredients aren't drowned out by staples
class_weights = {index: len(ids) / x / 100 for index, x in enumerate(class_frequency)}
class_weights2 = {index: np.power(5000 / x, 2) for index, x in enumerate(class_frequency)}
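Keras' class_weight argument expects a single dict mapping class index to weight, which is why the cell above builds dicts. A toy illustration of the squared inverse-frequency scheme (the counts here are made up):
In [ ]:
toy_freq = {0: 40000, 1: 5000, 2: 250}  # hypothetical counts: a staple, a mid-tier, a rarity
toy_w2 = {k: (5000 / v) ** 2 for k, v in toy_freq.items()}
print(toy_w2)  # {0: 0.015625, 1: 1.0, 2: 400.0} -- rare classes get large weights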
In [ ]:
#pickle.dump( [ids, labels, food2id, id2food, class_weights, class_weights2], open( "ids_labels_nomissing_500.p", "wb" ) )
In [5]:
[ids, labels, food2id, id2food, class_weights, class_weights2] = pickle.load( open( "ids_labels_nomissing_500.p", "rb" ) )
In [6]:
four_fifths = (len(labels) // 5) * 4  # 80/20 train/test split
train_ids = ids[0:four_fifths]
train_labels = labels[0:four_fifths]
test_ids = ids[four_fifths:]
test_labels = labels[four_fifths:]
In [7]:
ids = None
labels = None
In [9]:
m.load_weights('model_weights_w_cnn.hdf5')  # resume from an earlier checkpoint
In [10]:
filepath = "model_weights_w_cnn.hdf5"
checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss',
                                             verbose=1, save_best_only=True, mode='min')
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,
                                           patience=150, verbose=1, mode='min')
history = LossTimeHistory()
In [ ]:
import random

def DataGenerator(imageIds, imageLabels, batch_size):
    batch = np.zeros((batch_size, 224, 224, 3))
    labels = np.zeros((batch_size, size_pantry))
    while True:
        batch_bad = False
        for i in range(0, batch_size):
            index = random.randint(0, len(imageIds) - 1)
            img_path = 'resized_thumbs/' + imageIds[index] + ".jpg"
            try:
                # turns out a small number of our images are corrupted
                img = image.load_img(img_path, target_size=(224, 224))
            except (IOError, OSError):
                # discard the whole batch rather than feed a stale slot
                batch_bad = True
                continue
            batch[i, :, :, :] = image.img_to_array(img)
            labels[i, :] = imageLabels[index]
        batch = vgg16.preprocess_input(batch)
        if not batch_bad:
            yield batch, labels
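# Optional sanity check before kicking off training (assumes resized_thumbs/
# and the pickled ids/labels are in place); uncomment to pull one batch:
# xb, yb = next(DataGenerator(train_ids, train_labels, 4))
# print(xb.shape, yb.shape)  # expect (4, 224, 224, 3) (4, 500)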
m.fit_generator(DataGenerator(train_ids, train_labels, 64),
                steps_per_epoch=500, epochs=22000,
                verbose=2,
                validation_data=DataGenerator(test_ids, test_labels, 64),
                validation_steps=500,
                workers=16,
                max_q_size=8,
                pickle_safe=True,
                class_weight=class_weights2,
                callbacks=[history, checkpoint, early_stop])
In [ ]:
show(bokeh.models.layouts.Row(
    Line(data=history.losses, title='Loss Per Epoch', plot_width=390, plot_height=390, legend=None, ylabel='BCE', xlabel='Epoch'),
    Line(data=history.val_loss, title='Validation Loss Per Epoch', plot_width=390, plot_height=390, legend=None, ylabel='BCE', xlabel='Epoch'),
    Line(data=history.durations, title='Seconds Per Epoch', plot_width=390, plot_height=390, legend=None, ylabel='Seconds', xlabel='Epoch')))
In [21]:
def foodPredict(m, img_path=None):
    if img_path is None:
        # pick a random test image and show its ground-truth ingredients
        j = random.randint(0, len(test_ids) - 1)
        print(test_ids[j])
        img_path = 'resized_thumbs/' + test_ids[j] + '.jpg'
        conn = sqlite3.connect("yummly.db")
        print(conn.execute("SELECT Title FROM Recipe WHERE ID=?;", (test_ids[j],)).fetchone())
        conn.close()
        print('\nGround truth')
        for i in range(0, size_pantry):
            if test_labels[j][i] == 1:
                print(id2food[i])
    img = image.load_img(img_path, target_size=(224, 224))
    img_arr = image.img_to_array(img)
    x = np.expand_dims(img_arr, axis=0)  # the model only accepts batches, so add a dummy dimension
    x = vgg16.preprocess_input(x)  # the preprocessing must match what was used during training
    predictions = m.predict(x)
    argsort_results = np.argsort(-predictions)[0]
    plt.imshow(np.asarray(img))
    print("\nTop 10 Predictions")
    for i in range(0, 10):
        print(id2food[argsort_results[i]] + ": " + str(predictions[0][argsort_results[i]]))
In [25]:
foodPredict(m)
In [ ]:
test_labels_np = np.array(test_labels)  # matrix form, needed by the sklearn metrics below
test_predictions = np.zeros((len(test_ids), size_pantry))
for j in range(0, len(test_ids)):
    img_path = 'resized_thumbs/' + test_ids[j] + '.jpg'
    img = image.load_img(img_path, target_size=(224, 224))
    x = np.expand_dims(image.img_to_array(img), axis=0)  # dummy batch dimension
    x = vgg16.preprocess_input(x)  # same preprocessing as during training
    test_predictions[j] = m.predict(x)
    if j % 10000 == 0:
        print(str(j) + "...")
In [ ]:
from sklearn.metrics import average_precision_score
# per-class average precision over the test set
for i in range(0, size_pantry):
    y_true = np.array(test_labels_np[:, i])
    y_scores = np.array(test_predictions[:, i])
    print(id2food[i] + ": " + str(average_precision_score(y_true, y_scores)))
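Printing 500 per-class scores is hard to eyeball, so a one-number summary helps. This sketch simply averages the per-class scores into a macro mean average precision:
In [ ]:
ap_per_class = [average_precision_score(test_labels_np[:, i], test_predictions[:, i])
                for i in range(size_pantry)]
print("macro mAP: " + str(np.mean(ap_per_class)))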
In [ ]:
#pickle.dump( [test_labels_np, test_predictions], open( "test_labels_predictions.p", "wb" ) )
In [ ]:
[test_labels_np, test_predictions] = pickle.load( open( "test_labels_predictions.p", "rb" ) )
In [ ]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Compute per-class precision-recall curves and average precision
precision = dict()
recall = dict()
average_precision = dict()
for i in range(size_pantry):
    precision[i], recall[i], _ = precision_recall_curve(test_labels_np[:, i],
                                                        test_predictions[:, i])
    average_precision[i] = average_precision_score(test_labels_np[:, i], test_predictions[:, i])

# Compute the micro-averaged precision-recall curve and average precision
precision["micro"], recall["micro"], _ = precision_recall_curve(test_labels_np.ravel(),
                                                                test_predictions.ravel())
average_precision["micro"] = average_precision_score(test_labels_np, test_predictions,
                                                     average="micro")
In [ ]:
# Plot the per-class precision-recall curves
for i in range(0, size_pantry):
    plt.clf()
    plt.plot(recall[i], precision[i], lw=2, color='navy',
             label='Precision-Recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall, ' + id2food[i] + ': AUC={0:0.2f}'.format(average_precision[i]))
    plt.legend(loc="upper right")
    plt.show()