In [3]:
#Import libraries for doing image analysis
from skimage.io import imread
from skimage.transform import resize
from sklearn.ensemble import RandomForestClassifier as RF
import glob
import os
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from matplotlib import colors
from pylab import cm
from skimage import segmentation
from skimage.morphology import watershed
from skimage import measure
from skimage import morphology
import numpy as np
import pandas as pd
from scipy import ndimage
from skimage.feature import peak_local_max
# make graphics inline
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In [13]:
# get the classnames from the directory structure
directory_names = list(set(glob.glob(os.path.join("train", "*"))\
).difference(set(glob.glob(os.path.join("train","*.*")))))
In [18]:
# Example image
# This example was chosen for because it has two noncontinguous pieces
# that will make the segmentation example more illustrative
example_file = glob.glob(os.path.join(directory_names[6],"*.jpg"))[9]
print example_file
im = imread(example_file, as_grey=True)
plt.imshow(im, cmap=cm.gray)
plt.show()
In [27]:
# First we threshold the image by only taking values greater than the mean to reduce noise in the image
# to use later as a mask
f = plt.figure(figsize=(12,3))
imthr = im.copy()
imthr = np.where(im > np.mean(im),0.,1.0)
sub1 = plt.subplot(1,4,1)
plt.imshow(im, cmap=cm.gray)
sub1.set_title("Original Image")
sub2 = plt.subplot(1,4,2)
plt.imshow(imthr, cmap=cm.gray_r)
sub2.set_title("Thresholded Image")
imdilated = morphology.dilation(imthr, np.ones((4,4)))
sub3 = plt.subplot(1, 4, 3)
plt.imshow(imdilated, cmap=cm.gray_r)
sub3.set_title("Dilated Image")
labels = measure.label(imdilated)
labels = imthr*labels
labels = labels.astype(int)
sub4 = plt.subplot(1, 4, 4)
sub4.set_title("Labeled Image")
plt.imshow(labels)
Out[27]:
In [20]:
# calculate common region properties for each region within the segmentation
regions = measure.regionprops(labels)
# find the largest nonzero region
def getLargestRegion(props=regions, labelmap=labels, imagethres=imthr):
regionmaxprop = None
for regionprop in props:
# check to see if the region is at least 50% nonzero
if sum(imagethres[labelmap == regionprop.label])*1.0/regionprop.area < 0.50:
continue
if regionmaxprop is None:
regionmaxprop = regionprop
if regionmaxprop.filled_area < regionprop.filled_area:
regionmaxprop = regionprop
return regionmaxprop
In [21]:
regionmax = getLargestRegion()
plt.imshow(np.where(labels == regionmax.label,1.0,0.0))
plt.show()
In [28]:
print regionmax.minor_axis_length/regionmax.major_axis_length
In [29]:
def getMinorMajorRatio(image):
image = image.copy()
# Create the thresholded image to eliminate some of the background
imagethr = np.where(image > np.mean(image),0.,1.0)
#Dilate the image
imdilated = morphology.dilation(imagethr, np.ones((4,4)))
# Create the label list
label_list = measure.label(imdilated)
label_list = imagethr*label_list
label_list = label_list.astype(int)
region_list = measure.regionprops(label_list)
maxregion = getLargestRegion(region_list, label_list, imagethr)
# guard against cases where the segmentation fails by providing zeros
ratio = 0.0
if ((not maxregion is None) and (maxregion.major_axis_length != 0.0)):
ratio = 0.0 if maxregion is None else maxregion.minor_axis_length*1.0 / maxregion.major_axis_length
return ratio
In [31]:
# Rescale the images and create the combined metrics and training labels
#get the total training images
numberofImages = 0
for folder in directory_names:
for fileNameDir in os.walk(folder):
for fileName in fileNameDir[2]:
# Only read in the images
if fileName[-4:] != ".jpg":
continue
numberofImages += 1
# We'll rescale the images to be 25x25
maxPixel = 25
imageSize = maxPixel * maxPixel
num_rows = numberofImages # one row for each image in the training dataset
num_features = imageSize + 1 # for our ratio
# X is the feature vector with one row of features per image
# consisting of the pixel values and our metric
X = np.zeros((num_rows, num_features), dtype=float)
# y is the numeric class label
y = np.zeros((num_rows))
files = []
# Generate training data
i = 0
label = 0
# List of string of class names
namesClasses = list()
print "Reading images"
# Navigate through the list of directories
for folder in directory_names:
# Append the string class name for each class
currentClass = folder.split(os.pathsep)[-1]
namesClasses.append(currentClass)
for fileNameDir in os.walk(folder):
for fileName in fileNameDir[2]:
# Only read in the images
if fileName[-4:] != ".jpg":
continue
# Read in the images and create the features
nameFileImage = "{0}{1}{2}".format(fileNameDir[0], os.sep, fileName)
image = imread(nameFileImage, as_grey=True)
files.append(nameFileImage)
axisratio = getMinorMajorRatio(image)
image = resize(image, (maxPixel, maxPixel))
# Store the rescaled image pixels and the axis ratio
X[i, 0:imageSize] = np.reshape(image, (1, imageSize))
X[i, imageSize] = axisratio
# Store the classlabel
y[i] = label
i += 1
# report progress for each 5% done
report = [int((j+1)*num_rows/20.) for j in range(20)]
if i in report: print np.ceil(i *100.0 / num_rows), "% done"
label += 1
In [56]:
# Loop through the classes two at a time and compare their distributions of the Width/Length Ratio
#Create a DataFrame object to make subsetting the data on the class
df = pd.DataFrame({"class": y[:], "ratio": X[:, num_features-1]})
f = plt.figure(figsize=(30, 20))
#we suppress zeros and choose a few large classes to better highlight the distributions.
df = df.loc[df["ratio"] > 0]
minimumSize = 20
counts = df["class"].value_counts()
largeclasses = [int(x) for x in list(counts.loc[counts > minimumSize].index)]
# Loop through 40 of the classes
for j in range(0,40,2):
subfig = plt.subplot(4, 5, j/2 +1)
# Plot the normalized histograms for two classes
classind1 = largeclasses[j]
classind2 = largeclasses[j+1]
n, bins,p = plt.hist(df.loc[df["class"] == classind1]["ratio"].values,\
alpha=0.5, bins=[x*0.01 for x in range(100)], \
label=namesClasses[classind1].split(os.sep)[-1], normed=1)
n2, bins,p = plt.hist(df.loc[df["class"] == (classind2)]["ratio"].values,\
alpha=0.5, bins=bins, label=namesClasses[classind2].split(os.sep)[-1],normed=1)
subfig.set_ylim([0.,10.])
plt.legend(loc='upper right')
plt.xlabel("Width/Length Ratio")
In [67]:
# Loop through the classes two at a time and compare their distributions of the Width/Length Ratio
#Create a DataFrame object to make subsetting the data on the class
df = pd.DataFrame({"class": y[:], "ratio": X[:, num_features-1]})
f = plt.figure(figsize=(30, 20))
#we suppress zeros and choose a few large classes to better highlight the distributions.
df = df.loc[df["ratio"] > 0]
minimumSize = 20
counts = df["class"].value_counts()
largeclasses = [int(x) for x in list(counts.loc[counts > minimumSize].index)]
# Loop through 40 of the classes
for j in range(0,40,2):
subfig = plt.subplot(4, 5, j/2 +1)
# Plot the normalized histograms for two classes
classind1 = largeclasses[j]
classind2 = largeclasses[j+1]
n, bins,p = plt.hist(df.loc[df["class"] == classind1]["ratio"].values,\
alpha=0.5, bins=[x*0.01 for x in range(100)], \
label=namesClasses[classind1].split(os.sep)[-1], normed=1)
n2, bins,p = plt.hist(df.loc[df["class"] == (classind2)]["ratio"].values,\
alpha=0.5, bins=bins, label=namesClasses[classind2].split(os.sep)[-1],normed=1)
subfig.set_ylim([0.,10.])
plt.legend(loc='upper right')
plt.xlabel("Width/Length Ratio")
In [68]:
print "Training"
# n_estimators is the number of decision trees
# max_features also known as m_try is set to the default value of the square root of the number of features
clf = RF(n_estimators=100, n_jobs=3);
scores = cross_validation.cross_val_score(clf, X, y, cv=5, n_jobs=1);
print "Accuracy of all classes"
print np.mean(scores)
In [85]:
kf = KFold(y, n_folds=5)
y_pred = y * 0
for train, test in kf:
X_train, X_test, y_train, y_test = X[train,:], X[test,:], y[train], y[test]
clf = RF(n_estimators=100, n_jobs=3)
clf.fit(X_train, y_train)
y_pred[test] = clf.predict(X_test)
# commented for now - uncomment to see list of classes and matching
# print classification_report(y, y_pred, target_names=namesClasses)
In [70]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
"""Multi class version of Logarithmic Loss metric.
https://www.kaggle.com/wiki/MultiClassLogLoss
Parameters
----------
y_true : array, shape = [n_samples]
true class, intergers in [0, n_classes - 1)
y_pred : array, shape = [n_samples, n_classes]
Returns
-------
loss : float
"""
predictions = np.clip(y_pred, eps, 1 - eps)
# normalize row sums to 1
predictions /= predictions.sum(axis=1)[:, np.newaxis]
actual = np.zeros(y_pred.shape)
n_samples = actual.shape[0]
actual[np.arange(n_samples), y_true.astype(int)] = 1
vectsum = np.sum(actual * np.log(predictions))
loss = -1.0 / n_samples * vectsum
return loss
In [71]:
# Get the probability predictions for computing the log-loss function
kf = KFold(y, n_folds=5)
# prediction probabilities number of samples, by number of classes
y_pred = np.zeros((len(y),len(set(y))))
for train, test in kf:
X_train, X_test, y_train, y_test = X[train,:], X[test,:], y[train], y[test]
clf = RF(n_estimators=100, n_jobs=3)
clf.fit(X_train, y_train)
y_pred[test] = clf.predict_proba(X_test)
In [72]:
multiclass_log_loss(y, y_pred)
Out[72]:
In [100]:
### below is a test sequence prepared by CLS
# I repeat these just so I can vary them here with out affecting script above
maxPixel = 25
imageSize = maxPixel * maxPixel
num_rows = numberofImages # one row for each image in the training dataset
num_features = imageSize + 1 # for our ratio
image = imread("train/heteropod/153969.jpg", as_grey=True)
f = plt.figure(figsize=(12,3))
sub1 = plt.subplot(1,2,1)
plt.imshow(image, cmap=cm.gray)
axisratio = getMinorMajorRatio(image)
image_resized = resize(image, (maxPixel, maxPixel))
sub2 = plt.subplot(1,2,2)
plt.imshow(image_resized, cmap=cm.gray)
# Store the rescaled image pixels and the axis ratio
#X[i, 0:imageSize] = np.reshape(image, (1, imageSize))
#X[i, imageSize] = axisratio
Out[100]:
In [98]: