The visualization notebook includes code for generating some of the visualizations used in the project, the dummy model, and the baseline SVM model.
In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
#from IPython.display import display, Image
from scipy.ndimage import imread
import os, shutil
import sys
import random
import time
import pickle
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras import applications
import random
from PIL import Image
from scipy.stats import sem
from keras.models import model_from_json
from sklearn.metrics import f1_score
%matplotlib inline
#if K.image_data_format() == 'channels_first':
# input_shape = (3, img_width, img_height)
#else:
# input_shape = (img_width, img_height, 3)
In [2]:
# Count the images available for every class across all dataset splits.
# Layout read here: data/<split>/<label>/<image files>; the label folder name is the dict key.
class_weight = dict()
for folders in os.listdir('data'):
    split_dir = os.path.join('data', folders)
    # skip stray files sitting next to the split folders; os.listdir on them would raise
    if not os.path.isdir(split_dir):
        continue
    for label in os.listdir(split_dir):
        label_dir = os.path.join(split_dir, label)
        if not os.path.isdir(label_dir):
            continue
        # dict.get handles both the first and the repeated occurrence of a label
        class_weight[label] = class_weight.get(label, 0) + len(os.listdir(label_dir))

# plot items in each class before normalizing values
fig, ax = plt.subplots(figsize=(15,5))
positions = list(range(len(class_weight)))
# materialise the dict views: matplotlib expects sequences, not dict_values/dict_keys
ax.bar(positions, list(class_weight.values()), align='center')
ax.set_xticks(positions)
ax.set_xticklabels(list(class_weight.keys()), rotation=90)
ax.set_ylabel('Count')
ax.set_title('Number of images per class')
plt.show()
In [3]:
# Total number of images counted across every class and split (shown as the cell output).
sum(class_weight.values())
Out[3]:
In [4]:
# Number of training images per class (one sub-folder of data/train per class).
# os.listdir never returns the same name twice, so each label is visited exactly once.
train_set = {label: len(os.listdir('data/train/'+label)) for label in os.listdir('data/train')}

# The dummy model always predicts the most frequent training class.  Look the count up
# through the computed label instead of a hard-coded class name so the cell stays
# correct if the dataset changes.
majority_label = max(train_set, key=train_set.get)
print(majority_label)
print(train_set[majority_label])
num_train_maj = train_set[majority_label]
In [5]:
# Number of validation images per class, keyed by the class folder name.
valid_set = {label: len(os.listdir('data/valid/'+label)) for label in os.listdir('data/valid')}
# Total size of the validation set.
num_valid = sum(valid_set.values())
print(num_valid)
In [6]:
# Accuracy of a dummy model that always predicts the majority training class, measured
# on the validation set: the hits are exactly the validation images of that class.
# (Dividing the *training* count by the validation total is not an accuracy and can exceed 1.)
majority_label = max(train_set, key=train_set.get)
dummy_acc = valid_set.get(majority_label, 0) / num_valid
print(dummy_acc)
In [2]:
from sklearn.datasets import load_files
from keras.utils import np_utils
import numpy as np
from glob import glob
from sklearn import preprocessing
# define function to load train, test, and validation datasets
def load_dataset(path):
    """
    Collect the image file paths and integer class labels found under `path`.

    `path` is expected to contain one sub-folder per class.

    Returns
    -------
    (fruit_files, fruit_targets): array of file names and array of encoded integer labels,
    in the order produced by sklearn's load_files.
    """
    # Only file names and targets are used, so do not read every image into memory.
    data = load_files(path, load_content=False)
    fruit_files = np.array(data['filenames'])
    le = preprocessing.LabelEncoder()
    fruit_targets = le.fit_transform(np.array(data['target']))
    return fruit_files, fruit_targets
# load the training, validation, and test splits
train_files, train_targets = load_dataset('data/train')
valid_files, valid_targets = load_dataset('data/valid')
test_files, test_targets = load_dataset('data/test')

# class (fruit) names: strip the 11-character folder prefix and the trailing separator
class_dirs = glob("data/train/*/")
class_dirs.sort()
fruit_names = [folder[11:-1] for folder in class_dirs]

# print statistics about the dataset
n_train, n_valid, n_test = len(train_files), len(valid_files), len(test_files)
print('There are %d total fruit categories.' % len(fruit_names))
print('There are %s total fruit images.\n' % (n_train + n_valid + n_test))
print('There are %d training fruit images.' % n_train)
print('There are %d validation fruit images.' % n_valid)
print('There are %d test fruit images.'% n_test)
In [3]:
# Show the container type of train_files (printed for manual inspection; nothing is asserted).
print(type(train_files))
In [4]:
# Number of samples kept for training after every reshuffle.
N_TRAIN = 2009


def shuffle_and_resplit(files_a, targets_a, files_b, targets_b, n_train=N_TRAIN):
    """
    Pool two (files, targets) sets, shuffle the pairs, and split them again.

    Used to create a fresh training/validation split for each round of SVM training.
    The first `n_train` shuffled pairs become the new training set; the rest become
    the new validation set.

    Returns four numpy arrays: train files, train targets, valid files, valid targets.
    """
    pairs = list(zip(files_a, targets_a)) + list(zip(files_b, targets_b))
    random.shuffle(pairs)

    def _unzip(chunk):
        # zip(*[]) cannot be unpacked into two names, so handle an empty split explicitly
        if not chunk:
            return np.asarray([]), np.asarray([])
        files, targets = zip(*chunk)
        return np.asarray(files), np.asarray(targets)

    new_train_files, new_train_targets = _unzip(pairs[:n_train])
    new_valid_files, new_valid_targets = _unzip(pairs[n_train:])
    return new_train_files, new_train_targets, new_valid_files, new_valid_targets


# First split.  NOTE: this rebinds train_*/valid_*, so re-running the cell reshuffles
# the previously shuffled split rather than the split loaded from disk.
train_files, train_targets, valid_files, valid_targets = shuffle_and_resplit(
    train_files, train_targets, valid_files, valid_targets)
print('There are %d training fruit images.' % len(train_files))
print('There are %d validation fruit images.' % len(valid_files))

# Second, independent split drawn from the same pooled samples.
train_files2, train_targets2, valid_files2, valid_targets2 = shuffle_and_resplit(
    train_files, train_targets, valid_files, valid_targets)
print('There are %d training fruit images.' % len(train_files2))
print('There are %d validation fruit images.' % len(valid_files2))
In [5]:
# Show the container types of the re-split targets and files
# (printed for manual inspection; nothing is asserted).
print(type(train_targets))
print(type(train_files))
To create the training and validation sets for the SVM benchmark, I need several building blocks:
In [6]:
# convert an image to an array
def jpg_image_to_array(image_path):
    """
    Loads a JPEG image into a 3D Numpy array of shape
    (height, width, channels) with 3 channels.

    Returns None when the raw pixel buffer cannot be reshaped to
    (height, width, 3), i.e. the image does not hold exactly 3 bytes per pixel.
    """
    with Image.open(image_path) as image:
        # np.frombuffer replaces the deprecated binary mode of np.fromstring;
        # .copy() keeps the result writable, as the fromstring result was.
        im_arr = np.frombuffer(image.tobytes(), dtype=np.uint8).copy()
        try:
            return im_arr.reshape((image.size[1], image.size[0], 3))
        except ValueError:
            # size mismatch: the caller treats None as "unusable image"
            return None
def img_to_vector(im_arr, size=(250, 250)):
    """
    Resizes an array representing an RGB image and returns it as a flattened array.

    Parameters
    ----------
    im_arr : array holding the RGB pixels, or None.
    size : target size passed to PIL's resize; defaults to (250, 250).

    Returns
    -------
    1-D array of the resized pixels divided by 255, or None when `im_arr` is None
    or the conversion fails.
    """
    # jpg_image_to_array signals an unusable image with None; pass that through
    # explicitly instead of relying on an exception being swallowed.
    if im_arr is None:
        return None
    try:
        img = Image.fromarray(im_arr,'RGB')
        img = img.resize(size)
        img = np.asarray(img)/255 # normalize the images
        return img.flatten()
    except Exception:
        # best effort: a failed conversion marks the image as unusable,
        # but KeyboardInterrupt/SystemExit are no longer swallowed
        return None
# Sanity check: vectorise the first training image, print it, and display it again
# after restoring the (250, 250, 3) image shape.
first_pixels = jpg_image_to_array(train_files[0])
vec = img_to_vector(first_pixels)
print(vec)
vec = vec.reshape((250,250,3))
plt.imshow(vec)
plt.show()
In [7]:
# Display the first five training images after resizing them to 250x250.
for i in range(5):
    pixels = jpg_image_to_array(train_files[i])
    resized = Image.fromarray(pixels,'RGB').resize((250,250))
    img = np.asarray(resized)
    plt.imshow(img)
    plt.show()
While first working through this workflow, it became apparent that some of the images had formatting issues (they cannot be converted into the fixed-size RGB vectors used here) and need to be removed from the dataset. The cell below converts every training image and records the files that fail.
In [8]:
# Convert every training image of both splits into a flattened vector.
# Images that cannot be converted are recorded by file name so their labels can be
# dropped afterwards.
train_list, train_bads = [], []
train_list2, train_bads2 = [], []
for files, converted, rejected in ((train_files, train_list, train_bads),
                                   (train_files2, train_list2, train_bads2)):
    for i in files:
        obs = img_to_vector(jpg_image_to_array(i))
        # `is not None`, not `!= None`: comparing an ndarray with != is elementwise,
        # and using that array as an `if` condition is ambiguous / raises ValueError.
        if obs is not None:
            converted.append(obs)
        else:
            rejected.append(i)
In [9]:
# Some images could not be converted, so their labels must be removed to keep
# train_targets aligned with train_list.
#
# The labels are removed with ONE boolean mask.  Deleting them one at a time with
# positions taken from the (unshrunk) file array shifts every later position after the
# first deletion and removes the wrong labels.  The length guard makes the cell safe to
# re-run, and an empty list of bad files is handled without indexing into it.
print(train_bads)
bad_names = set(train_bads)
bad_mask = np.array([name in bad_names for name in train_files], dtype=bool)
print(np.where(bad_mask))
if len(train_targets) == len(train_files):
    train_targets = np.asarray(train_targets)[~bad_mask]
print(len(train_list)==len(train_targets))

# same clean-up for the second split
print(train_bads2)
bad_names2 = set(train_bads2)
bad_mask2 = np.array([name in bad_names2 for name in train_files2], dtype=bool)
print(np.where(bad_mask2))
if len(train_targets2) == len(train_files2):
    train_targets2 = np.asarray(train_targets2)[~bad_mask2]
print(len(train_list2)==len(train_targets2))
In [10]:
# For each split: number of vectors, number of labels, and the distinct labels present.
for vectors, labels in ((train_list, train_targets), (train_list2, train_targets2)):
    print(len(vectors))
    print(len(labels))
    print(np.unique(labels))
In [11]:
from sklearn.svm import SVC
# One SVM per shuffled split, both created with class_weight='balanced'.
clf, clf2 = (SVC(class_weight='balanced') for _ in range(2))
In [12]:
# train_list was built as a plain Python list of flattened image vectors; show its type.
print(type(train_list))
In [13]:
# Fit one SVM per shuffled training split.
# The value of the last expression is what the notebook displays as this cell's output.
clf.fit(train_list, train_targets)
clf2.fit(train_list2, train_targets2)
Out[13]:
In [ ]:
# sklearn.externals.joblib was removed from recent scikit-learn releases; prefer the
# standalone joblib package and fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist both fitted SVMs so they do not have to be retrained.
# To restore them instead of refitting:
#clf = joblib.load('svm1.pkl')
#clf2 = joblib.load('svm2.pkl')
joblib.dump(clf, 'svm1.pkl')
joblib.dump(clf2, 'svm2.pkl')
In [7]:
# Convert every validation image of both splits into a flattened vector.
# Images that cannot be converted are recorded by file name so their labels can be
# dropped afterwards.
valid_list, valid_bads = [], []
valid_list2, valid_bads2 = [], []
for files, converted, rejected in ((valid_files, valid_list, valid_bads),
                                   (valid_files2, valid_list2, valid_bads2)):
    for i in files:
        obs = img_to_vector(jpg_image_to_array(i))
        # `is not None`, not `!= None`: comparing an ndarray with != is elementwise,
        # and using that array as an `if` condition is ambiguous / raises ValueError.
        if obs is not None:
            converted.append(obs)
        else:
            rejected.append(i)
# one row per image, one column per flattened pixel value
valid_list = pd.DataFrame(valid_list)
valid_list2 = pd.DataFrame(valid_list2)
In [8]:
# Some images could not be converted, so their labels must be removed to keep
# valid_targets aligned with valid_list.
#
# The labels are removed with ONE boolean mask.  Deleting them one at a time with
# positions taken from the (unshrunk) file array shifts every later position after the
# first deletion and removes the wrong labels.  The length guard makes the cell safe to
# re-run, and an empty list of bad files is handled without indexing into it.
print(valid_bads)
bad_names = set(valid_bads)
bad_mask = np.array([name in bad_names for name in valid_files], dtype=bool)
print(np.where(bad_mask))
if len(valid_targets) == len(valid_files):
    valid_targets = np.asarray(valid_targets)[~bad_mask]
print(len(valid_list)==len(valid_targets))

# same clean-up for the second split
print(valid_bads2)
bad_names2 = set(valid_bads2)
bad_mask2 = np.array([name in bad_names2 for name in valid_files2], dtype=bool)
print(np.where(bad_mask2))
if len(valid_targets2) == len(valid_files2):
    valid_targets2 = np.asarray(valid_targets2)[~bad_mask2]
print(len(valid_list2)==len(valid_targets2))
In [9]:
# For each split: feature-frame shape, label shape, and the distinct labels present.
for frame, labels in ((valid_list, valid_targets), (valid_list2, valid_targets2)):
    print(frame.shape)
    print(labels.shape)
    print(np.unique(labels))
In [18]:
from sklearn.metrics import f1_score

# Validation accuracy of each SVM on its own split.
acc1 = clf.score(valid_list, valid_targets)
print('acc1: ', acc1)
acc2 = clf2.score(valid_list2, valid_targets2)
print('acc2: ', acc2)

# Weighted F1 of each SVM on its own split.
preds1 = clf.predict(valid_list)
print('f1 1: ', f1_score(valid_targets, preds1, average='weighted'))
preds2 = clf2.predict(valid_list2)
print('f1 2: ', f1_score(valid_targets2, preds2, average='weighted'))
# recorded results of an earlier run:
#f1 1: 0.409576629999
#f1 2: 0.40152619965
In [ ]:
# valid_list was converted to a pandas DataFrame in an earlier cell; show its type.
print(type(valid_list))
In [ ]:
# Spot check: compare the first SVM's prediction with the true label for one training
# sample, then display that sample as an image.
i = 798
sample = train_list[i]
print(clf.predict(sample.reshape(1,-1)))
print(train_targets[i])
vec = sample.reshape((250,250,3))
plt.imshow(vec)
plt.show()
In [21]:
# Recorded SVM validation accuracies from earlier runs:
#0.3857566765578635 trial 1
#0.414925373134 trial 2
#0.41369047619 trial 3
#0.428783382789 trial 4
#0.46884272997 trial 5
#acc1: 0.439821693908
#acc2: 0.433878157504
# acc1: 0.448888888889
from scipy.stats import sem

# mean and standard error of the mean over the five trials listed above
svm_trials = np.array([
    0.3857566765578635,
    0.414925373134,
    0.41369047619,
    0.428783382789,
    0.46884272997,
])
print(np.mean(svm_trials))
print(sem(svm_trials))
In [22]:
# Convert every test image into a flattened vector; record the files that cannot be
# converted so their labels can be dropped.
test_list = []
test_bads = []
for i in test_files:
    obs = img_to_vector(jpg_image_to_array(i))
    # `is not None`, not `!= None`: comparing an ndarray with != is elementwise,
    # and using that array as an `if` condition is ambiguous / raises ValueError.
    if obs is not None:
        test_list.append(obs)
    else:
        # Unusable files go to test_bads.  Appending the file NAME to test_list put
        # strings into the feature matrix and left test_bads empty.
        test_bads.append(i)
test_list = pd.DataFrame(test_list)

# Drop the labels of the unusable images with ONE boolean mask so positions cannot
# shift between deletions; the length guard makes the cell safe to re-run.
print(test_bads)
bad_names = set(test_bads)
bad_mask = np.array([name in bad_names for name in test_files], dtype=bool)
print(np.where(bad_mask))
if len(test_targets) == len(test_files):
    test_targets = np.asarray(test_targets)[~bad_mask]
print(len(test_list)==len(test_targets))
print(len(test_list))
print(len(test_targets))
print(np.unique(test_targets))
In [23]:
# This cell gets rid of non-numeric (string) entries in the test features.
# Every column is converted to numeric; values that cannot be parsed become NaN
# and are then replaced with 0.5.
# To list the columns that hold string data, inspect which columns have
# dtype 'object' before running the conversion.
numeric_test = test_list.apply(pd.to_numeric, errors='coerce')
test_list = numeric_test.fillna(0.5)
In [24]:
# accuracy of both classifiers on the test data
for model in (clf, clf2):
    print(model.score(test_list, test_targets))
In [25]:
# Recorded SVM test accuracies:
#0.44702467344 test 1
#0.438316400581 test 2
#0.428156748911 test 3
#
# NOTE(review): the array holds a fourth value equal to test 2 that is not listed
# above -- confirm whether it is a separate run.
svm_tests = np.array([
    0.44702467344,
    0.438316400581,
    0.428156748911,
    0.438316400581,
])
print(np.mean(svm_tests))
print(sem(svm_tests))
The cells below hold the aggregated results from the CNNs and are used to calculate basic statistics (mean and standard error of the mean).
In [2]:
# recorded results of the three CNN 1 runs: mean and standard error of the mean
cnn1_trials = np.array([0.6419, 0.6464, 0.5918])
print(np.mean(cnn1_trials))
print(sem(cnn1_trials))
In [3]:
# recorded results of the three CNN 2 runs: mean and standard error of the mean
cnn2_trials = np.array([0.5387, 0.5144, 0.5660])
print(np.mean(cnn2_trials))
print(sem(cnn2_trials))
In [4]:
# recorded results of the three CNN 3 runs: mean and standard error of the mean
cnn3_trials = np.array([0.8095, 0.8244, 0.8051])
print(np.mean(cnn3_trials))
print(sem(cnn3_trials))
In [5]:
# recorded results of the three CNN 4 runs: mean and standard error of the mean
cnn4_trials = np.array([0.7917, 0.8348, 0.7917])
print(np.mean(cnn4_trials))
print(sem(cnn4_trials))
In [32]:
# model 1
#first_try.h5
#weights.best.from_scratch.02.hdf5
#weights.best.from_scratch.03.hdf5
# model 2
#class-weights-weights-improvement-26-0.54.hdf5
#class-weights-weights-improvement02-14-0.51.hdf5
#class-weights-weights-improvement03-25-0.57.hdf5
# model 3
#tflearning-weights-improvement-10-0.81.hdf5
#tflearning-weights-improvement02-12-0.82.hdf5
#tflearning-weights-improvement03-12-0.81.hdf5
# model 4
#tflearningwclassweights-weights-improvement-09-0.79.hdf5
#tflearningwclassweights02-weights-improvement-18-0.83.hdf5
#tflearningwclassweights03-weights-improvement-16-0.84.hdf5
def load_a_model(model, weights):
    """
    Rebuild a Keras model from a JSON architecture file and load saved weights into it.

    Parameters
    ----------
    model : path of the JSON file describing the architecture.
    weights : path of the weights file to load.

    Returns
    -------
    The reconstructed model with the weights loaded (not compiled here).
    """
    # the context manager closes the file even if reading fails
    with open(model, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(weights)
    print("Loaded model from disk")
    return loaded_model
# Weight files of the from-scratch model (3 runs) and the class-weighted model (3 runs).
weights_list = [
    'first_try.h5',
    'saved_models/weights.best.from_scratch.02.hdf5',
    'saved_models/weights.best.from_scratch.03.hdf5',
    'saved_models/class-weights-weights-improvement-26-0.54.hdf5',
    'saved_models/class-weights-weights-improvement02-14-0.51.hdf5',
    'saved_models/class-weights-weights-improvement03-25-0.57.hdf5',
]

# Weighted F1 of every saved model on both validation splits; each result is a
# (weights file, F1 on split 1, F1 on split 2) tuple.
results = []
for w in weights_list:
    # load the model
    curr_model = load_a_model('scratch_model.json', w)
    # compile the model
    curr_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    # evaluate the model on validation data, one image at a time
    split_scores = []
    for frame, labels in ((valid_list, valid_targets), (valid_list2, valid_targets2)):
        predicted = []
        for i in frame.index:
            image = np.reshape(frame.iloc[i], (1,250,250,3))
            predicted.append(np.argmax(curr_model.predict(image)))
        split_scores.append(f1_score(labels, predicted, average='weighted'))
    score1, score2 = split_scores
    results.append((w,score1,score2))
In [33]:
# Each entry is a (weights file, F1 on validation split 1, F1 on validation split 2) tuple.
print(results)
In [47]:
# Load the saved bottleneck features and label rows.  np.load is given the path so it
# opens and closes the file itself (no file handle is left open).
valid_data = np.load('bottleneck_features_validation.npy')
validlabels = np.load('validation_labels.npy')
# The class index is the argmax of each label row.  The validation labels must come
# from `validlabels`: the loop previously read `testlabels`, which is only loaded below
# (NameError on a fresh kernel, test labels scored against validation data otherwise).
valid_labels = [np.argmax(row) for row in validlabels]

test_data = np.load('bottleneck_features_test.npy')
testlabels = np.load('test_labels.npy')
test_labels = [np.argmax(row) for row in testlabels]

# Weight files of the transfer-learning model (3 runs) and its class-weighted variant (3 runs).
weights_list = ['saved_models/tflearning-weights-improvement-10-0.81.hdf5','saved_models/tflearning-weights-improvement02-12-0.82.hdf5',
                'saved_models/tflearning-weights-improvement03-12-0.81.hdf5','saved_models/tflearningwclassweights-weights-improvement-09-0.79.hdf5',
                'saved_models/tflearningwclassweights02-weights-improvement-18-0.83.hdf5','saved_models/tflearningwclassweights03-weights-improvement-16-0.84.hdf5']


def bottleneck_f1(model, features, labels):
    """Weighted F1 of `model` on bottleneck `features`, predicting one sample at a time."""
    preds = [np.argmax(model.predict(np.reshape(sample, (1,7,7,512)))) for sample in features]
    return f1_score(labels, preds, average='weighted')


# Each result is a (weights file, validation F1, test F1) tuple.
results = []
for w in weights_list:
    # load the model
    curr_model = load_a_model('bestmodel.json', w)
    # compile the model
    curr_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    # evaluate the model on validation and test data
    score1 = bottleneck_f1(curr_model, valid_data, valid_labels)
    score2 = bottleneck_f1(curr_model, test_data, test_labels)
    results.append((w,score1,score2))
In [48]:
# Print one (weights file, validation F1, test F1) tuple per line.
for i in results:
    print(i)
In [51]:
# Recorded weighted F1 scores, six values per model.
mod1f1 = np.array([
    0.45550380982340388, 0.43193409762925405, 0.60125851884784454,
    0.58524908466045955, 0.50327894029919606, 0.51602038040925202,
])
mod2f1 = np.array([
    0.53766010109643325, 0.53826810752583143, 0.51525538147146177,
    0.48612158295479657, 0.55480671346617594, 0.54717601001541061,
])
mod3f1 = np.array([
    0.80033412858312825, 0.78127462016004723, 0.81800105338356222,
    0.78243978588131813, 0.79346342843245155, 0.79560844383175566,
])
mod4f1 = np.array([
    0.79653512885039224, 0.79104560682662206, 0.83553250703650894,
    0.79685692584553824, 0.83845005576817555, 0.79642359974662913,
])
f1_trials = [mod1f1, mod2f1, mod3f1, mod4f1]

# mean and standard error of the mean for every model
for scores in f1_trials:
    print(np.mean(scores))
    print(sem(scores))
    print('\n')
In [11]:
# One row per training image (flattened pixel values) plus its integer class label.
pixel_matrix = np.asarray(train_list)
traindf = pd.DataFrame(pixel_matrix)
traindf['Label'] = np.asarray(train_targets)
traindf.head()
Out[11]:
In [13]:
# Class names listed in label order: the position in the list is the integer label.
ordered_names = [
    'acerolas', 'apples', 'apricots', 'avocados', 'bananas', 'blackberries',
    'blueberries', 'cantaloupes', 'cherries', 'coconuts', 'figs', 'grapefruits',
    'grapes', 'guava', 'honneydew_melon', 'kiwifruit', 'lemons', 'limes',
    'mangos', 'nectarine', 'olives', 'onion', 'orange', 'passionfruit',
    'peaches', 'pears', 'pineapples', 'plums', 'pomegranates',
    'potato', 'raspberries', 'strawberries', 'tomatoes', 'watermelon',
]
# nested mapping: column name -> {integer label: class name}
class_names = {'Label': dict(enumerate(ordered_names))}
# replace the integer labels in the 'Label' column with the class names (in place)
traindf.replace(to_replace=class_names, inplace=True)
In [14]:
# Per-class mean of every column of traindf (the flattened pixel values), grouped by 'Label'.
grouped = traindf.groupby('Label').mean()
In [21]:
# Average the per-class column means into one value per class, sort ascending,
# and save the resulting bar chart.
intensity_by_class = grouped.mean(axis=1).sort_values()
intensity_by_class.plot.bar(figsize=(10,10))
plt.ylabel('Normalized pixel intensity')
plt.savefig('PixInten.png')
In [ ]: