The visualization notebook includes code for generating some of the visualizations used in the project, the dummy model, and the baseline SVM model.


In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
#from IPython.display import display, Image
from scipy.ndimage import imread
import os, shutil
import sys
import random
import time
import pickle
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
from keras.callbacks import ModelCheckpoint  
from keras import applications
import random
from PIL import Image
from scipy.stats import sem
from keras.models import model_from_json
from sklearn.metrics import f1_score
%matplotlib inline
#if K.image_data_format() == 'channels_first':
#    input_shape = (3, img_width, img_height)
#else:
#    input_shape = (img_width, img_height, 3)


Using TensorFlow backend.

In [2]:
# create a dict to hold the class weights
class_weight= dict()
# loop through the folders containing the images, use the folder name as the key
for folders in os.listdir('data'):
    for label in os.listdir('data/'+folders):
        if label in class_weight.keys():
            class_weight[label] += len(os.listdir('data/'+folders+'/'+label))
        else:
            class_weight[label] = len(os.listdir('data/'+folders+'/'+label))
# plot items in each class before normalizing values
fig, ax = plt.subplots(figsize=(15,5))
plt.bar(range(len(class_weight)), class_weight.values(), align='center')
plt.xticks(range(len(class_weight)), list(class_weight.keys()))
ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=90)
plt.ylabel('Count')
plt.title('Number of images per class')
plt.show()



In [3]:
sum(class_weight.values())


Out[3]:
3373

Dummy classifier

Find the majority of class in the training set, then predict only that class on the validation set. Dummy accuracy is # of majority class / # in validation set


In [4]:
train_set = dict()
for label in os.listdir('data/train'):
    if label in train_set.keys():
        train_set[label] += len(os.listdir('data/train/'+label))
    else:
        train_set[label] = len(os.listdir('data/train/'+label))
print(max(train_set, key=lambda key: train_set[key]))
print(train_set['apples'])
num_train_maj = train_set['apples']


apples
241

In [5]:
valid_set = dict()
for label in os.listdir('data/valid'):
    if label in valid_set.keys():
        valid_set[label] += len(os.listdir('data/valid/'+label))
    else:
        valid_set[label] = len(os.listdir('data/valid/'+label))
print(sum(valid_set.values()))
num_valid = sum(valid_set.values())


675

In [6]:
dummy_acc = num_train_maj / num_valid
print(dummy_acc)


0.35703703703703704

Benchmark: SVM

load each image as a vector of pixels train SVM score on validation set


In [2]:
from sklearn.datasets import load_files       
from keras.utils import np_utils
import numpy as np
from glob import glob
from sklearn import preprocessing

# define function to load train, test, and validation datasets
def load_dataset(path):
    data = load_files(path)
    fruit_files = np.array(data['filenames'])
    le = preprocessing.LabelEncoder()
    fruit_targets = le.fit_transform(np.array(data['target']))
    return fruit_files, fruit_targets

# load train, test, and validation datasets
train_files, train_targets = load_dataset('data/train')
valid_files, valid_targets = load_dataset('data/valid')
test_files, test_targets = load_dataset('data/test')

# load list of dog names
fruit_names = [item[11:-1] for item in sorted(glob("data/train/*/"))]

# print statistics about the dataset
print('There are %d total fruit categories.' % len(fruit_names))
print('There are %s total fruit images.\n' % len(np.hstack([train_files, valid_files, test_files])))
print('There are %d training fruit images.' % len(train_files))
print('There are %d validation fruit images.' % len(valid_files))
print('There are %d test fruit images.'% len(test_files))


There are 34 total fruit categories.
There are 3373 total fruit images.

There are 2009 training fruit images.
There are 675 validation fruit images.
There are 689 test fruit images.

In [3]:
# assert that the correct data type
print(type(train_files))


<class 'numpy.ndarray'>

In [4]:
# function that shuffles and re-splits training and validation lists
# this function is used for each new round of training the SVM model.

# zip a and b
tr = zip(train_files,train_targets)
vld = zip(valid_files,valid_targets)
abcd = list(tr)+list(vld)
random.shuffle(abcd)
tr = abcd[:2009]
vld = abcd[2009:]
train_files, train_targets = zip(*tr)
valid_files, valid_targets = zip(*vld)
train_files = np.asarray(train_files)
train_targets = np.asarray(train_targets)
valid_files = np.asarray(valid_files)
valid_targets = np.asarray(valid_targets)

print('There are %d training fruit images.' % len(train_files))
print('There are %d validation fruit images.' % len(valid_files))

# zip a and b
tr = zip(train_files,train_targets)
vld = zip(valid_files,valid_targets)
abcd = list(tr)+list(vld)
random.shuffle(abcd)
tr = abcd[:2009]
vld = abcd[2009:]
train_files2, train_targets2 = zip(*tr)
valid_files2, valid_targets2 = zip(*vld)
train_files2 = np.asarray(train_files2)
train_targets2 = np.asarray(train_targets2)
valid_files2 = np.asarray(valid_files2)
valid_targets2 = np.asarray(valid_targets2)

print('There are %d training fruit images.' % len(train_files2))
print('There are %d validation fruit images.' % len(valid_files2))


There are 2009 training fruit images.
There are 675 validation fruit images.
There are 2009 training fruit images.
There are 675 validation fruit images.

In [5]:
# assert variables are of the right type
print(type(train_targets))
print(type(train_files))


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>

To create the training and validation sets for the SVM benchmark, I need several building blocks:

  1. a function that resizes and image
  2. a function that converts the image to a vector

In [6]:
# convert an image to an array
def jpg_image_to_array(image_path):
    """
    Loads JPEG image into 3D Numpy array of shape 
    (width, height, channels)
    """
    with Image.open(image_path) as image:
        im_arr = np.fromstring(image.tobytes(), dtype=np.uint8)
        try:
            im_arr = im_arr.reshape((image.size[1], image.size[0], 3))
            return im_arr
        except:
            return

def img_to_vector(im_arr):
    """
    Resizes an array representing a RGB image and returns a flattened array of the image
    """
    try:
        img = Image.fromarray(im_arr,'RGB')
        img = img.resize((250,250))
        img = np.asarray(img)/255 # normalize the images
        #print(img.shape)
        return img.flatten()#.reshape(1,-1)
    except:
        return

vec = img_to_vector(jpg_image_to_array(train_files[0]))
print(vec)
vec = vec.reshape((250,250,3))
plt.imshow(vec)
plt.show()


[ 0.44313725  0.42352941  0.41176471 ...,  0.60784314  0.55686275
  0.59607843]

In [7]:
for i in range(5):
    img = Image.fromarray(jpg_image_to_array(train_files[i]),'RGB')
    img = img.resize((250,250))
    img = np.asarray(img)
    #print(img.flatten()[0:9]/255)
    plt.imshow(img)
    plt.show()


When initially going through this workflow, it became apparent that some of the images had some formatting issues and would need to be removed from the dataset. The cell below performs that function.


In [8]:
train_list = []
train_bads = []
for i in train_files:
    # test to make sure image can be resized
    # if not store the file name in a list
    # else append it to the list of resized images
    obs = img_to_vector(jpg_image_to_array(i))
    if obs != None:
        train_list.append(obs)
    else:
        train_bads.append(i)
        
train_list2 = []
train_bads2 = []
for i in train_files2:
    # test to make sure image can be resized
    # if not store the file name in a list
    # else append it to the list of resized images
    obs = img_to_vector(jpg_image_to_array(i))
    if obs != None:
        train_list2.append(obs)
    else:
        train_bads2.append(i)


c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  
c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\ipykernel_launcher.py:20: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

In [9]:
# some items were removed in the conversion of the images into the dataframe so need to remove the labels for those
# images from the train_targets list
print(train_bads)
print(np.where(train_files==train_bads[0]))
for i in train_bads:
    print(np.where(train_files==i))
    train_targets = np.delete(train_targets, np.where(train_files==i))
print(len(train_list)==len(train_targets))

# some items were removed in the conversion of the images into the dataframe so need to remove the labels for those
# images from the train_targets list
print(train_bads2)
print(np.where(train_files2==train_bads2[0]))
for i in train_bads2:
    print(np.where(train_files2==i))
    train_targets2 = np.delete(train_targets2, np.where(train_files2==i))
print(len(train_list2)==len(train_targets2))


['data/valid\\guava\\28.jpg', 'data/train\\bananas\\51.jpg', 'data/train\\cantaloupes\\7.jpg', 'data/train\\raspberries\\20.jpg', 'data/train\\guava\\11.jpg', 'data/train\\pineapples\\11.jpg', 'data/train\\cherries\\10.jpg', 'data/train\\pomegranates\\18.jpg']
(array([606], dtype=int64),)
(array([606], dtype=int64),)
(array([789], dtype=int64),)
(array([812], dtype=int64),)
(array([921], dtype=int64),)
(array([1407], dtype=int64),)
(array([1434], dtype=int64),)
(array([1532], dtype=int64),)
(array([1838], dtype=int64),)
True
['data/train\\raspberries\\20.jpg', 'data/train\\raspberries\\11.jpg', 'data/train\\cherries\\10.jpg', 'data/train\\cantaloupes\\7.jpg', 'data/valid\\guava\\28.jpg', 'data/train\\bananas\\51.jpg', 'data/train\\blackberries\\24.jpg', 'data/train\\tomatoes\\38.jpg', 'data/train\\guava\\11.jpg']
(array([150], dtype=int64),)
(array([150], dtype=int64),)
(array([185], dtype=int64),)
(array([299], dtype=int64),)
(array([512], dtype=int64),)
(array([691], dtype=int64),)
(array([1203], dtype=int64),)
(array([1323], dtype=int64),)
(array([1676], dtype=int64),)
(array([1753], dtype=int64),)
True

In [10]:
print(len(train_list))
print(len(train_targets))
print(np.unique(train_targets))

print(len(train_list2))
print(len(train_targets2))
print(np.unique(train_targets2))


2001
2001
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33]
2000
2000
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33]

In [11]:
from sklearn.svm import SVC
#class_weight='balanced'
clf = SVC(class_weight='balanced')
clf2 = SVC(class_weight='balanced')

In [12]:
print(type(train_list))


<class 'list'>

In [13]:
clf.fit(train_list, train_targets)
clf2.fit(train_list2, train_targets2)


Out[13]:
SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [ ]:
from sklearn.externals import joblib
joblib.dump(clf, 'svm1.pkl') 
joblib.dump(clf2, 'svm2.pkl')
#clf = joblib.load('svm1.pkl') 
#clf2 = joblib.load('svm2.pkl')

In [7]:
valid_list = []
valid_bads = []
for i in valid_files:
    # test to make sure image can be resized
    # if not store the file name in a list
    # else append it to the list of resized images
    obs = img_to_vector(jpg_image_to_array(i))
    if obs != None:
        valid_list.append(obs)
    else:
        valid_bads.append(i)
valid_list = pd.DataFrame(valid_list)

valid_list2 = []
valid_bads2 = []
for i in valid_files2:
    # test to make sure image can be resized
    # if not store the file name in a list
    # else append it to the list of resized images
    obs = img_to_vector(jpg_image_to_array(i))
    if obs != None:
        valid_list2.append(obs)
    else:
        valid_bads2.append(i)
valid_list2 = pd.DataFrame(valid_list2)


c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  
c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\ipykernel_launcher.py:21: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.

In [8]:
# some items were removed in the conversion of the images into the dataframe so need to remove the labels for those
# images from the train_targets list
print(valid_bads)
print(np.where(valid_files==valid_bads[0]))
for i in valid_bads:
    print(np.where(valid_files==i))
    valid_targets = np.delete(valid_targets, np.where(valid_files==i))
print(len(valid_list)==len(valid_targets))

# some items were removed in the conversion of the images into the dataframe so need to remove the labels for those
# images from the train_targets list
print(valid_bads2)
print(np.where(valid_files2==valid_bads2[0]))
for i in valid_bads2:
    print(np.where(valid_files2==i))
    valid_targets2 = np.delete(valid_targets2, np.where(valid_files2==i))
print(len(valid_list2)==len(valid_targets2))


['data/train\\guava\\11.jpg', 'data/valid\\guava\\28.jpg', 'data/train\\tomatoes\\38.jpg']
(array([190], dtype=int64),)
(array([190], dtype=int64),)
(array([261], dtype=int64),)
(array([464], dtype=int64),)
True
['data/train\\raspberries\\20.jpg', 'data/train\\pomegranates\\18.jpg', 'data/train\\guava\\11.jpg']
(array([391], dtype=int64),)
(array([391], dtype=int64),)
(array([471], dtype=int64),)
(array([526], dtype=int64),)
True

In [9]:
print(valid_list.shape)
print(valid_targets.shape)
print(np.unique(valid_targets))

print(valid_list2.shape)
print(valid_targets2.shape)
print(np.unique(valid_targets2))


(672, 187500)
(672,)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33]
(672, 187500)
(672,)
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
 26 27 28 29 30 31 32 33]

In [18]:
print('acc1: ', clf.score(valid_list, valid_targets))
print('acc2: ', clf2.score(valid_list2, valid_targets2))
from sklearn.metrics import f1_score
print('f1 1: ', f1_score(valid_targets, clf.predict(valid_list),average='weighted'))
print('f1 2: ', f1_score(valid_targets2, clf2.predict(valid_list2),average='weighted'))
#f1 1:  0.409576629999
#f1 2:  0.40152619965


acc1:  0.448888888889
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-18-949803039f50> in <module>()
      1 print('acc1: ', clf.score(valid_list, valid_targets))
----> 2 print('acc2: ', clf2.score(valid_list2, valid_targets2))
      3 from sklearn.metrics import f1_score
      4 print('f1 1: ', f1_score(valid_targets, clf.predict(valid_list),average='weighted'))
      5 print('f1 2: ', f1_score(valid_targets2, clf2.predict(valid_list2),average='weighted'))

c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
    347         """
    348         from .metrics import accuracy_score
--> 349         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    350 
    351 

c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\sklearn\metrics\classification.py in accuracy_score(y_true, y_pred, normalize, sample_weight)
    170 
    171     # Compute accuracy for each possible representation
--> 172     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    173     if y_type.startswith('multilabel'):
    174         differing_labels = count_nonzero(y_true - y_pred, axis=1)

c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\sklearn\metrics\classification.py in _check_targets(y_true, y_pred)
     70     y_pred : array or indicator matrix
     71     """
---> 72     check_consistent_length(y_true, y_pred)
     73     type_true = type_of_target(y_true)
     74     type_pred = type_of_target(y_pred)

c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    179     if len(uniques) > 1:
    180         raise ValueError("Found input variables with inconsistent numbers of"
--> 181                          " samples: %r" % [int(l) for l in lengths])
    182 
    183 

ValueError: Found input variables with inconsistent numbers of samples: [675, 672]

In [ ]:
print(type(valid_list))

In [ ]:
#print(train_list)
i=798
print(clf.predict(train_list[i].reshape(1,-1)))
print(train_targets[i])
vec = train_list[i].reshape((250,250,3))
plt.imshow(vec)
plt.show()

In [21]:
#0.3857566765578635 trial 1
#0.414925373134 trial 2
#0.41369047619 trial 3
#0.428783382789 trial 4
#0.46884272997 trial 5
#acc1:  0.439821693908
#acc2:  0.433878157504
# acc1:  0.448888888889
svm_trials = np.asarray([0.3857566765578635,0.414925373134,0.41369047619,0.428783382789,0.46884272997])
print(svm_trials.mean())
from scipy.stats import sem
print(sem(svm_trials))


0.422399727728
0.0135501545142

In [22]:
test_list = []
test_bads = []
for i in test_files:
    # test to make sure image can be resized
    # if not store the file name in a list
    # else append it to the list of resized images
    obs = img_to_vector(jpg_image_to_array(i))
    if obs != None:
        test_list.append(obs)
    else:
        test_list.append(i)
test_list = pd.DataFrame(test_list)
        

# some items were removed in the conversion of the images into the dataframe so need to remove the labels for those
# images from the train_targets list
print(test_bads)
print(np.where(test_files==test_bads[0]))
for i in test_bads:
    print(np.where(test_files==i))
    test_targets = np.delete(test_targets, np.where(test_files==i))
print(len(test_list)==len(test_targets))

print(len(test_list))
print(len(test_targets))
print(np.unique(test_targets))


c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  
['data/train\\cantaloupes\\7.jpg', 'data/train\\raspberries\\20.jpg', 'data/train\\blackberries\\24.jpg', 'data/valid\\guava\\28.jpg', 'data/train\\tomatoes\\38.jpg', 'data/train\\pineapples\\11.jpg', 'data/train\\bananas\\51.jpg', 'data/train\\pomegranates\\18.jpg', 'data/train\\raspberries\\11.jpg', 'data/train\\guava\\11.jpg']
(array([139], dtype=int64),)
(array([139], dtype=int64),)
(array([211], dtype=int64),)
(array([433], dtype=int64),)
(array([726], dtype=int64),)
(array([840], dtype=int64),)
(array([975], dtype=int64),)
(array([1084], dtype=int64),)
(array([1566], dtype=int64),)
(array([1640], dtype=int64),)
(array([1975], dtype=int64),)
False
689
689
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33]

In [23]:
# this cell gets ride of string characters and converts them to 0.5. 
# The string values somehow got into the data? Uncomment the lines to find any cells
# that have string data.

#for i in test_list.columns:
#    obj =  test_list[i].dtype=='object'
#    if obj == True:
#        print(i)
#        test_list[i] = pd.to_numeric(test_list[i])

# these lines convert columns with string values to numeric and then replace NaN
# values with 0.5
test_list = test_list.apply(lambda x: pd.to_numeric(x, errors='coerce'))
test_list = test_list.fillna(0.5)

In [24]:
# classifier scores on the test data
print(clf.score(test_list, test_targets))
print(clf2.score(test_list, test_targets))


0.428156748911
0.438316400581

In [25]:
#0.44702467344 test 1
#0.438316400581 test 2
#0.428156748911 test 3
#
svm_tests = np.asarray([0.44702467344,0.438316400581,0.428156748911,0.438316400581])
print(svm_tests.mean())
print(sem(svm_tests))


0.437953555878
0.00385709209476

The cells below have the aggregated data from the CNN's and are used for calculating basic stats.


In [2]:
cnn1_trials = np.asarray([.6419,0.6464,0.5918])
print(cnn1_trials.mean())

print(sem(cnn1_trials))


0.6267
0.0174982856303

In [3]:
cnn2_trials = np.asarray([0.5387,0.5144,0.5660])
print(cnn2_trials.mean())
print(sem(cnn2_trials))


0.5397
0.0149040263016

In [4]:
cnn3_trials = np.asarray([0.8095,0.8244,0.8051])
print(cnn3_trials.mean())
print(sem(cnn3_trials))


0.813
0.00583980593285

In [5]:
cnn4_trials = np.asarray([0.7917,0.8348,0.7917])
print(cnn4_trials.mean())
print(sem(cnn4_trials))


0.806066666667
0.0143666666667

In [32]:
# model 1
#first_try.h5
#weights.best.from_scratch.02.hdf5
#weights.best.from_scratch.03.hdf5
# model 2
#class-weights-weights-improvement-26-0.54.hdf5
#class-weights-weights-improvement02-14-0.51.hdf5
#class-weights-weights-improvement03-25-0.57.hdf5
# model 3
#tflearning-weights-improvement-10-0.81.hdf5
#tflearning-weights-improvement02-12-0.82.hdf5
#tflearning-weights-improvement03-12-0.81.hdf5
# model 4
#tflearningwclassweights-weights-improvement-09-0.79.hdf5
#tflearningwclassweights02-weights-improvement-18-0.83.hdf5
#tflearningwclassweights03-weights-improvement-16-0.84.hdf5
def load_a_model(model, weights):
    json_file = open(model, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(weights)
    print("Loaded model from disk")
    return loaded_model

weights_list = ['first_try.h5','saved_models/weights.best.from_scratch.02.hdf5','saved_models/weights.best.from_scratch.03.hdf5',
               'saved_models/class-weights-weights-improvement-26-0.54.hdf5', 'saved_models/class-weights-weights-improvement02-14-0.51.hdf5',
                'saved_models/class-weights-weights-improvement03-25-0.57.hdf5']
results = []
for w in weights_list:
    # load the model
    curr_model = load_a_model('scratch_model.json', w)
    # compile the model
    curr_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    # evaluate the model on validation data
    preds = []
    preds2 = []
    for i in valid_list.index:
        preds.append(np.argmax(curr_model.predict(np.reshape(valid_list.iloc[i],(1,250,250,3)))))
    score1 = f1_score(valid_targets,preds,average='weighted')
    for i in valid_list2.index:
        preds2.append(np.argmax(curr_model.predict(np.reshape(valid_list2.iloc[i],(1,250,250,3)))))
    score2 = f1_score(valid_targets2,preds2,average='weighted')
    results.append((w,score1,score2))


Loaded model from disk
c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\numpy\core\fromnumeric.py:57: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  return getattr(obj, method)(*args, **kwds)
c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Loaded model from disk
Loaded model from disk
Loaded model from disk
Loaded model from disk
Loaded model from disk
c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\sklearn\metrics\classification.py:1115: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)

In [33]:
print(results)


[('first_try.h5', 0.45550380982340388, 0.43193409762925405), ('saved_models/weights.best.from_scratch.02.hdf5', 0.60125851884784454, 0.58524908466045955), ('saved_models/weights.best.from_scratch.03.hdf5', 0.50327894029919606, 0.51602038040925202), ('saved_models/class-weights-weights-improvement-26-0.54.hdf5', 0.53766010109643325, 0.53826810752583143), ('saved_models/class-weights-weights-improvement02-14-0.51.hdf5', 0.51525538147146177, 0.48612158295479657), ('saved_models/class-weights-weights-improvement03-25-0.57.hdf5', 0.55480671346617594, 0.54717601001541061)]

In [47]:
valid_data = np.load(open('bottleneck_features_validation.npy','rb'))
validlabels = np.load(open('validation_labels.npy','rb'))
valid_labels = []
for i in testlabels:
    valid_labels.append(np.argmax(i))
    
test_data = np.load(open('bottleneck_features_test.npy','rb'))
testlabels = np.load(open('test_labels.npy','rb'))
test_labels = []
for i in testlabels:
    test_labels.append(np.argmax(i))

weights_list = ['saved_models/tflearning-weights-improvement-10-0.81.hdf5','saved_models/tflearning-weights-improvement02-12-0.82.hdf5',
                'saved_models/tflearning-weights-improvement03-12-0.81.hdf5','saved_models/tflearningwclassweights-weights-improvement-09-0.79.hdf5',
                'saved_models/tflearningwclassweights02-weights-improvement-18-0.83.hdf5','saved_models/tflearningwclassweights03-weights-improvement-16-0.84.hdf5']
results = []
for w in weights_list:
    # load the model
    curr_model = load_a_model('bestmodel.json', w)
    # compile the model
    curr_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    # evaluate the model on validation data
    preds = []
    preds2 = []
    for i in valid_data:
        preds.append(np.argmax(curr_model.predict(np.reshape(i,(1,7,7,512)))))
    score1 = f1_score(valid_labels,preds,average='weighted')
    for i in test_data:
        preds2.append(np.argmax(curr_model.predict(np.reshape(i,(1,7,7,512)))))
    score2 = f1_score(test_labels,preds2,average='weighted')
    results.append((w,score1,score2))


Loaded model from disk
c:\users\john maxi\anaconda3\envs\tensorflowenv\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Loaded model from disk
Loaded model from disk
Loaded model from disk
Loaded model from disk
Loaded model from disk

In [48]:
for i in results:
    print(i)


('saved_models/tflearning-weights-improvement-10-0.81.hdf5', 0.80033412858312825, 0.78127462016004723)
('saved_models/tflearning-weights-improvement02-12-0.82.hdf5', 0.81800105338356222, 0.78243978588131813)
('saved_models/tflearning-weights-improvement03-12-0.81.hdf5', 0.79346342843245155, 0.79560844383175566)
('saved_models/tflearningwclassweights-weights-improvement-09-0.79.hdf5', 0.79653512885039224, 0.79104560682662206)
('saved_models/tflearningwclassweights02-weights-improvement-18-0.83.hdf5', 0.83553250703650894, 0.79685692584553824)
('saved_models/tflearningwclassweights03-weights-improvement-16-0.84.hdf5', 0.83845005576817555, 0.79642359974662913)

In [51]:
mod1f1 = np.asarray([0.45550380982340388,0.43193409762925405,0.60125851884784454,0.58524908466045955,
         0.50327894029919606,0.51602038040925202])
mod2f1 = np.asarray([0.53766010109643325,0.53826810752583143,0.51525538147146177,0.48612158295479657,
         0.55480671346617594,0.54717601001541061])
mod3f1 = np.asarray([0.80033412858312825,0.78127462016004723, 0.81800105338356222,0.78243978588131813,
          0.79346342843245155,0.79560844383175566])
mod4f1 = np.asarray([0.79653512885039224,0.79104560682662206,0.83553250703650894,0.79685692584553824,
          0.83845005576817555,0.79642359974662913])
f1_trials = [mod1f1, mod2f1, mod3f1, mod4f1]
for i in f1_trials:
    print(i.mean())
    print(sem(i))
    print('\n')


0.515540805278
0.0276597640061


0.529881316088
0.0102957302584


0.795186910045
0.0054959623326


0.809140637346
0.00885910902493



In [11]:
traindf = pd.DataFrame((np.asarray(train_list)))
traindf['Label'] = np.asarray(train_targets)
traindf.head()


Out[11]:
0 1 2 3 4 5 6 7 8 9 ... 187491 187492 187493 187494 187495 187496 187497 187498 187499 Label
0 0.443137 0.423529 0.411765 0.486275 0.427451 0.439216 0.490196 0.439216 0.466667 0.470588 ... 0.600000 0.552941 0.592157 0.600000 0.549020 0.588235 0.607843 0.556863 0.596078 19
1 0.650980 0.564706 0.576471 0.674510 0.588235 0.600000 0.686275 0.600000 0.611765 0.698039 ... 0.839216 0.788235 0.827451 0.831373 0.780392 0.819608 0.835294 0.784314 0.823529 19
2 0.333333 0.333333 0.341176 0.360784 0.360784 0.368627 0.372549 0.372549 0.380392 0.392157 ... 0.752941 0.756863 0.776471 0.741176 0.745098 0.764706 0.745098 0.749020 0.768627 24
3 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 0.956863 0.929412 0.905882 0.956863 0.929412 0.905882 0.956863 0.929412 0.905882 12
4 0.125490 0.117647 0.066667 0.129412 0.113725 0.066667 0.113725 0.117647 0.062745 0.145098 ... 0.380392 0.270588 0.223529 0.356863 0.247059 0.192157 0.352941 0.270588 0.196078 29

5 rows × 187501 columns


In [13]:
class_names = {'Label':{0:'acerolas',1:'apples',2:'apricots',3:'avocados',4:'bananas',5:'blackberries',
               6:'blueberries',7:'cantaloupes',8:'cherries',9:'coconuts',10:'figs',11:'grapefruits',
               12:'grapes',13:'guava',14:'honneydew_melon',15:'kiwifruit',16:'lemons',17:'limes',
               18:'mangos',19:'nectarine',20:'olives',21:'onion',22:'orange',23:'passionfruit',
               24:'peaches',25:'pears',26:'pineapples',27:'plums',28:'pomegranates',
               29:'potato',30:'raspberries',31:'strawberries',32:'tomatoes',33:'watermelon'}}
traindf.replace(to_replace=class_names, inplace=True)

In [14]:
grouped = traindf.groupby('Label').mean()

In [21]:
grouped.mean(axis=1).sort_values().plot(kind='bar', figsize=(10,10))
plt.ylabel('Normalized pixel intensity')
plt.savefig('PixInten.png')



In [ ]: