In [3]:
# imports
import numpy as np
import pickle
from skimage.feature import peak_local_max as plm
from skimage.filter import vsobel, hsobel
from os import listdir
from pylab import imread
from time import time

def extract_features(image_path_list):
    """ Given a list of directories to image files, extract features from images and return a single
        numpy array containing those features, appropriately formatted for classifier prediction. """
    feature_list = []
    k = 100 # number of evenly spaced percentage announcements (should be <= 100)
    print("     Feature extraction completion:")
    announcements = [(i+1)*len(image_path_list)/k for i in range(k)] # Announce the % complete to the user at these points
    for i, image_path in enumerate(image_path_list):
        image_array = imread(image_path)
        feature_list.append([feature_1(image_array), # image size
                             feature_2(image_array), # mean red-channel
                             feature_3(image_array), # mean green-channel
                             feature_4(image_array), # mean blue-channel
                             feature_5(image_array), # mean luminosity
                             feature_6(image_array), # median luminosity
                             feature_7(image_array), # standard deviation luminosity
                             feature_8(image_array), # median red-channel
                             feature_9(image_array), # median green-channel
                             feature_10(image_array), # median blue-channel
                             feature_11(image_array), # standard deviation red-channel
                             feature_12(image_array), # standard deviation green-channel
                             feature_13(image_array), # standard deviation blue-channel
                             feature_14(image_array), # mean luminosity of vertical edge map
                             feature_15(image_array), # median luminosity of vertical edge map
                             feature_16(image_array), # standard deviation luminosity of vertical edge map
                             feature_17(image_array), # mean luminosity of horizontal edge map
                             feature_18(image_array), # median luminosity of horizontal edge map
                             feature_19(image_array), # standard deviation luminosity of horizontal edge map
                             feature_20(image_array), # pixels above threshold lum for horizontal edge map
                             feature_21(image_array), # pixels above threshold lum for vertical edge map
                             feature_22(image_array), # aspect ratio of image
                             feature_23(image_array) # number of image peaks
                             ])
        
        # Give the user progress updates regarding how far along feature extraction is (only works if not parallel)
        if (i+1) in announcements:
            print("{0:.0f}%...".format(100.0*i/len(image_path_list))),
    print('')
    return np.array(feature_list) # easier indexing

#--------------------------------- Start Features list ---------------------------------
def feature_1(image_array):
    """ Return the size of the image, in pixels """
    return image_array.size

def feature_2(image_array):
    """ Return the average red-channel value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return image_array[:,:,0].mean()
    else:
        return image_array.mean()

def feature_3(image_array):
    """ Return the average blue-channel value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return image_array[:,:,1].mean()
    else:
        return image_array.mean()

def feature_4(image_array):
    """ Return the average green-channel value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return image_array[:,:,2].mean()
    else:
        return image_array.mean()

def feature_5(image_array):
    """ Return the average luminosity value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return image_array.mean(axis=2).mean()
    else:
        return image_array.mean()

def feature_6(image_array):
    """ Returns the median pixels luminosity """
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    return np.median(image_array)

def feature_7(image_array):
    """ Returns the standard deviation of the pixels' luminosity """
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    return np.std(image_array)

def feature_8(image_array):
    """ Return the median red-channel value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return np.median(image_array[:,:,0])
    else:
        return np.median(image_array)

def feature_9(image_array):
    """ Return the median blue-channel value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return np.median(image_array[:,:,1])
    else:
        return np.median(image_array)

def feature_10(image_array):
    """ Return the median green-channel value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return np.median(image_array[:,:,2])
    else:
        return np.median(image_array)

def feature_11(image_array):
    """ Return the standard deviation of the red-channel value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return np.std(image_array[:,:,0])
    else:
        return np.std(image_array)

def feature_12(image_array):
    """ Return the the standard deviation of the blue-channel value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return np.std(image_array[:,:,1])
    else:
        return np.std(image_array)

def feature_13(image_array):
    """ Return the the standard deviation of the green-channel value for the picture (in 0-255 scale) """
    if len(image_array.shape) == 3:
        return np.std(image_array[:,:,2])
    else:
        return np.std(image_array)

def feature_14(image_array):
    """ Return the average luminosity for vertical edges map """
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    return np.mean(vsobel(image_array))

def feature_15(image_array):
    """ Returns the median luminosity for vertical edges map """
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    return np.median(vsobel(image_array))

def feature_16(image_array):
    """ Returns the standard deviation of the luminosity for vertical edges map """
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    return np.std(vsobel(image_array))

def feature_17(image_array):
    """ Return the average luminosity for horizontal edges map """
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    return np.mean(hsobel(image_array))

def feature_18(image_array):
    """ Returns the median luminosity for horizontal edges map """
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    return np.median(hsobel(image_array))

def feature_19(image_array):
    """ Returns the standard deviation of the luminosity for horizontal edges map """
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    return np.std(hsobel(image_array))

def feature_20(image_array):
    """ Returns the fraction of pixels above a threshold of the luminosity 
    for the horizontal edges map """
    thresh = 20 # Based on looking at histograms of edge maps of pictures
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    h_edge = hsobel(image_array)
    return 1.0*sum((h_edge >= thresh).flatten())/h_edge.size

def feature_21(image_array):
    """ Returns the fraction of pixels above a threshold of the luminosity 
    for the vertical edges map """
    thresh = 20 # Based on looking at histograms of edge maps of pictures
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    v_edge = vsobel(image_array)
    return 1.0*sum((v_edge >= thresh).flatten())/v_edge.size

def feature_22(image_array):
    """ Returns the aspect ratio of the image """
    if len(image_array.shape) == 3:
        image_array = image_array.mean(axis=2)
    (height, width) = image_array.shape
    return 1.0*height/width

def feature_23(image_array):
    """ Returns the number of image peaks """
    return len(plm(image_array, min_distance=50))
#--------------------------------- End Features list ---------------------------------

def collect_paths(imDirectory):
    """ Given a validation directory, returns a list of image path strings for that directory.  Assumes
        no sub-folders within the directory; just image files. """
    image_paths = []
    image_names_filtered = [] # remove any files beginning with .
    image_names = listdir(imDirectory)
    for name in image_names:
        if name[0] != '.':
            image_paths.append(imDirectory + "/" + name)
            image_names_filtered.append(name)
        else:
            print "bad image '" + name +"' was skipped!"
    return (image_paths, image_names_filtered)

def generate_feature_set(images_directory):
    """ takes in the directory for the validation set, and creates / returns a feature set correctly
        formatted for prediction by the classifier """
    image_paths = collect_paths(images_directory)
    print "\t Now beginning rectangularization of validation images..."
    before_extraction = time()
    features = extract_features(image_paths[0])
    after_extraction = time()
    print("\nFeature extraction complete after {0:.2f} seconds, or {1:.4f} seconds per image, for {2:.0f} total images."\
          .format(after_extraction-before_extraction,(after_extraction-before_extraction)/float(len(image_paths[0])),
                  len(image_paths[0])))
    print("Feature set contains {0:.0f} instances, each with {1:.0f} extracted features."\
          .format(features.shape[0], features.shape[1]))
    return (features, image_paths[1])

#--------------------------------- Main Function ---------------------------------
def run_final_classifier(path, forest="./trained_classifier.p"):
    """ Main function.  path = path to directory of validation images, forest = trained_classifier pickle,
        generated from hw4_classifier_dev ipython notebook.  Creates a file with predictions of classes
        of validation images """
    clf = pickle.load( open( forest, "rb" ) ) # load up the classifier
    (X,val_images) = generate_feature_set(path) # generate feature set from the images
    Y_pred = clf.predict(X)
    
    # Create output file
    my_str = "filename" + " "*22 + "predicted_class\n" + \
         "---------------------------------------------\n"
    for i, prediction in enumerate(Y_pred):
        my_str += val_images[i] + " "*(30-len(val_images[i])) + prediction + "\n"
    with open("Output.txt", "w") as text_file:
        text_file.write(my_str)
    print "\nSee the file 'Output.txt' for the classifier's predictions."
    
    return (Y_pred, val_images)

#--------------------------------- Main Program & Instructions ---------------------------------
if __name__ == '__main__':
    print("Use the function: (predictions, file_names) = run_final_classifier(path, forest) to evaluate classifier on validation set")


Use the function: (predictions, file_names) = run_final_classifier(path, forest) to evaluate classifier on validation set

In [7]:
(predictions, file_names) = run_final_classifier('/path_to/validation_image_set/goes_here')