Consider adding these features: Fourier shape descriptors, and convexity defects (`convexityDefects`) — the number of defects, their average, and their standard deviation.


In [1]:
#Import libraries for doing image analysis
from skimage.io import imread
from skimage.transform import resize
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
import glob
import os
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
from matplotlib import colors
from pylab import cm
from skimage import segmentation
from skimage.morphology import watershed
from skimage import measure
from skimage import morphology
import numpy as np
import pandas as pd
from scipy import ndimage
from skimage.feature import peak_local_max
import cv2
# make graphics inline
%matplotlib inline
from feature import extract_feats as ef

In [2]:
import warnings
# Ignore every warning for the rest of the session so that warning messages
# do not clutter the cell outputs below.
warnings.filterwarnings("ignore")

In [3]:
data_path = '../../data/ndsb/'
# Derive the class names from the directory structure: keep every entry
# directly under train/ except those whose name contains a dot (they match
# "*.*" and are therefore excluded from the list of class folders).
train_entries = set(glob.glob(os.path.join(data_path, "train", "*")))
dotted_entries = set(glob.glob(os.path.join(data_path, "train", "*.*")))
directory_names = list(train_entries - dotted_entries)

In [4]:
# Example image.
# This example was chosen because it has two non-contiguous pieces,
# which makes the segmentation example more illustrative.
example_class_dir = directory_names[0]
example_file = glob.glob(os.path.join(example_class_dir, "*.jpg"))[5]
print(example_file)
# Read the image as greyscale and show it without interpolation
im = imread(example_file, as_grey=True)
plt.imshow(im, cmap=cm.gray, interpolation='none')
plt.show()
im.shape


../../data/ndsb/train/copepod_calanoid_octomoms/116800.jpg
Out[4]:
(54, 40)

In [5]:
# Threshold the image to build a foreground mask that is reused by later cells:
# pixels brighter than the mean intensity become 0.0, while pixels at or below
# the mean become 1.0.
imthr = np.where(im > np.mean(im), 0., 1.0)

# One row of four panels: original, thresholded, dilated and labeled image
f, (sub1, sub2, sub3, sub4) = plt.subplots(1, 4, figsize=(12, 3))

sub1.imshow(im, cmap=cm.gray)
sub1.set_title("Original Image")

sub2.imshow(imthr, cmap=cm.gray_r)
sub2.set_title("Thresholded Image")

# Dilate the mask with a 4x4 structuring element so that nearby fragments
# can end up in the same connected region when labeling
imdilated = morphology.dilation(imthr, np.ones((4,4)))
sub3.imshow(imdilated, cmap=cm.gray_r)
sub3.set_title("Dilated Image")

# Label the connected regions of the dilated mask, then multiply by the
# undilated mask so only pixels that are set in imthr keep a label
labels = measure.label(imdilated)
labels = imthr*labels
labels = labels.astype(int)
sub4.set_title("Labeled Image")
sub4.imshow(labels)


Out[5]:
<matplotlib.image.AxesImage at 0x7f71234dff90>

In [6]:
# Calculate common region properties (e.g. area, axis lengths) for each
# labeled region within the example segmentation.
regions = measure.regionprops(labels)
# find the largest nonzero region
def getLargestRegion(props=None, labelmap=None, imagethres=None):
    """Return the region with the largest filled area, or None.

    Parameters
    ----------
    props : iterable, optional
        Region property objects exposing ``label``, ``area`` and
        ``filled_area``. Defaults to the notebook-level ``regions``.
    labelmap : array, optional
        Label image in which each region's pixels carry its ``label``.
        Defaults to the notebook-level ``labels``.
    imagethres : array, optional
        Thresholded image indexed with the same mask as ``labelmap``.
        Defaults to the notebook-level ``imthr``.

    Returns
    -------
    The element of ``props`` with the greatest ``filled_area`` among the
    regions whose thresholded pixels sum to at least 50% of their area
    (the first one wins on ties), or None when no region qualifies.
    """
    # Resolve the defaults at call time rather than at definition time, so
    # re-running the segmentation cell is picked up without redefining this
    # function.
    if props is None:
        props = regions
    if labelmap is None:
        labelmap = labels
    if imagethres is None:
        imagethres = imthr

    regionmaxprop = None
    for regionprop in props:
        # check to see if the region is at least 50% nonzero
        coverage = imagethres[labelmap == regionprop.label].sum() * 1.0 / regionprop.area
        if coverage < 0.50:
            continue
        if regionmaxprop is None or regionmaxprop.filled_area < regionprop.filled_area:
            regionmaxprop = regionprop
    return regionmaxprop

In [7]:
# Pick the largest region of the example image and display it as a 0/1 mask
regionmax = getLargestRegion()
largest_region_mask = np.where(labels == regionmax.label, 1.0, 0.0)
plt.imshow(largest_region_mask)
plt.show()



In [8]:
# Minor/major axis length ratio of the largest region in the example image
print(regionmax.minor_axis_length / regionmax.major_axis_length)


0.811746705492

In [9]:
def getMinorMajorRatio(image):
    """Return the minor/major axis length ratio of the largest region.

    The image is thresholded at its mean, dilated with a 4x4 structuring
    element, labeled, and the largest region is selected with
    ``getLargestRegion``.

    Parameters
    ----------
    image : array
        Greyscale image; it is only read, never modified.

    Returns
    -------
    float
        ``minor_axis_length / major_axis_length`` of the selected region, or
        0.0 when no region is found or its major axis length is zero.
    """
    # Create the thresholded image to eliminate some of the background:
    # pixels above the mean become 0.0, all others 1.0
    imagethr = np.where(image > np.mean(image), 0., 1.0)

    # Dilate the image
    imdilated = morphology.dilation(imagethr, np.ones((4,4)))

    # Label the dilated mask, then keep labels only where the undilated
    # mask is set
    label_list = measure.label(imdilated)
    label_list = (imagethr*label_list).astype(int)

    region_list = measure.regionprops(label_list)
    maxregion = getLargestRegion(region_list, label_list, imagethr)

    # guard against cases where the segmentation fails by providing zeros
    if maxregion is None or maxregion.major_axis_length == 0.0:
        return 0.0
    return maxregion.minor_axis_length*1.0 / maxregion.major_axis_length

In [10]:
# Build the feature matrix and the training labels

def _iter_image_paths(folder):
    """Yield the path of every ".jpg" file found under folder (recursively)."""
    for dirPath, _dirNames, fileNames in os.walk(folder):
        for fileName in fileNames:
            # Only read in the images
            if fileName.endswith(".jpg"):
                yield os.path.join(dirPath, fileName)

#get the total training images
numberofImages = sum(1 for folder in directory_names
                     for _imagePath in _iter_image_paths(folder))

# Size constants for 25x25 rescaled images; they are not referenced by the
# feature extraction in this cell
maxPixel = 25
imageSize = maxPixel * maxPixel
num_rows = numberofImages # one row for each image in the training dataset
# must match the length of the vector returned by ef.im_features, because
# each result is assigned to a full row of X
num_features = 15

# X is the feature matrix with one row of features per image,
# filled from ef.im_features
X = np.zeros((num_rows, num_features), dtype=float)
# y is the numeric class label 
y = np.zeros((num_rows))

files = []
# Generate training data
i = 0    
label = 0
# List of string of class names
namesClasses = list()

# Row counts at which progress is reported (every 5% done); computed once
# instead of on every image
report = set(int((j+1)*num_rows/20.) for j in range(20))

print("Reading images")
# Navigate through the list of directories
for folder in directory_names:
    # Append the string class name for each class: the last path component
    # of the class folder (os.pathsep is the PATH-list separator and does
    # not split a file path)
    currentClass = os.path.basename(folder)
    namesClasses.append(currentClass)
    for nameFileImage in _iter_image_paths(folder):
        # Read in the image and create the features
        X[i, :] = ef.im_features(nameFileImage)

        # Store the classlabel
        y[i] = label
        i += 1
        # report progress for each 5% done
        if i in report:
            print("%s %% done" % np.ceil(i *100.0 / num_rows))
    label += 1


Reading images
5.0 % done
10.0 % done
15.0 % done
20.0 % done
25.0 % done
30.0 % done
35.0 % done
40.0 % done
45.0 % done
50.0 % done
55.0 % done
60.0 % done
65.0 % done
70.0 % done
75.0 % done
80.0 % done
85.0 % done
90.0 % done
95.0 % done
100.0 % done

In [11]:
# Loop through the classes two at a time and compare their distributions of the Width/Length Ratio

# Create a DataFrame object to make subsetting the data by class easier.
# NOTE(review): column -2 of X is assumed to hold the width/length ratio
# produced by ef.im_features -- confirm against the feature module.
df = pd.DataFrame({"class": y[:], "ratio": X[:, -2]})

f = plt.figure(figsize=(30, 20))
#we suppress zeros and choose a few large classes to better highlight the distributions.
df = df.loc[df["ratio"] > 0]
minimumSize = 20 
counts = df["class"].value_counts()
largeclasses = [int(x) for x in list(counts.loc[counts > minimumSize].index)]
# Compare up to 20 pairs of adjacent large classes (40 classes); fewer pairs
# are drawn when not enough classes exceed minimumSize, instead of failing
# with an IndexError
numPairs = min(20, len(largeclasses) // 2)
ratioBins = [x*0.01 for x in range(100)]
for pairIndex in range(numPairs):
    # explicit integer subplot position on the 4x5 grid
    subfig = plt.subplot(4, 5, pairIndex + 1)
    # Plot the normalized histograms for two classes
    classind1 = largeclasses[2*pairIndex]
    classind2 = largeclasses[2*pairIndex + 1]
    ratios1 = df.loc[df["class"] == classind1]["ratio"].values
    ratios2 = df.loc[df["class"] == classind2]["ratio"].values
    n, bins, p = plt.hist(ratios1, alpha=0.5, bins=ratioBins,
                          label=namesClasses[classind1].split(os.sep)[-1], normed=1)

    # reuse the bin edges returned by the first histogram
    n2, bins, p = plt.hist(ratios2, alpha=0.5, bins=bins,
                           label=namesClasses[classind2].split(os.sep)[-1], normed=1)
    subfig.set_ylim([0.,10.])
    plt.legend(loc='upper right')
    plt.xlabel("Width/Length Ratio")



In [12]:
print("Training")
# n_estimators is the number of decision trees
# max_features (also known as m_try) is left at its default value, the square
# root of the number of features
clf = RF(n_estimators=100, n_jobs=3)
scores = cross_validation.cross_val_score(clf, X, y, cv=5, n_jobs=1)
print("Accuracy of all classes")
mean_accuracy = np.mean(scores)
print(mean_accuracy)


Training
Accuracy of all classes
0.433335861696

In [13]:
# Collect a class prediction for every sample from the fold in which it is
# held out, then print the per-class precision/recall report
kf = KFold(y, n_folds=5)
y_pred = y * 0
for train, test in kf:
    X_train, X_test = X[train,:], X[test,:]
    y_train, y_test = y[train], y[test]
    clf = RF(n_estimators=100, n_jobs=3)
    clf.fit(X_train, y_train)
    y_pred[test] = clf.predict(X_test)
# report the folder name of each class rather than its full path
class_names = list(map(os.path.basename, namesClasses))
print(classification_report(y, y_pred, target_names=class_names))


                                               precision    recall  f1-score   support

                    copepod_calanoid_octomoms       0.08      0.02      0.03        49
     echinoderm_seacucumber_auricularia_larva       0.14      0.08      0.10        96
                                protist_other       0.40      0.52      0.45      1172
                            shrimp-like_other       0.00      0.00      0.00        52
    copepod_calanoid_large_side_antennatucked       0.28      0.24      0.26       106
                                 fecal_pellet       0.32      0.24      0.28       511
                    copepod_cyclopoid_copilia       0.64      0.23      0.34        30
                          protist_fuzzy_olive       0.51      0.58      0.54       372
                           shrimp_sergestidae       0.28      0.35      0.31       153
                         hydromedusae_liriope       0.00      0.00      0.00        19
                 appendicularian_slight_curve       0.23      0.26      0.24       532
                 siphonophore_physonect_young       0.00      0.00      0.00        21
                      appendicularian_s_shape       0.29      0.35      0.32       696
              hydromedusae_bell_and_tentacles       0.20      0.08      0.11        75
               echinoderm_larva_pluteus_early       0.34      0.14      0.20        92
                   copepod_calanoid_flatheads       0.30      0.12      0.17       178
                     hydromedusae_narco_young       0.27      0.34      0.30       336
                      fish_larvae_medium_body       0.27      0.34      0.30        85
                       trichodesmium_multiple       0.00      0.00      0.00        54
                            diatom_chain_tube       0.39      0.37      0.38       500
                             crustacean_other       0.19      0.15      0.17       201
           siphonophore_calycophoran_abylidae       0.26      0.25      0.25       212
                                  shrimp_zoea       0.15      0.11      0.13       174
                    hydromedusae_partial_dark       0.32      0.17      0.22       190
                           trichodesmium_puff       0.74      0.86      0.79      1979
                        fish_larvae_deep_body       0.00      0.00      0.00        10
                     fish_larvae_leptocephali       0.00      0.00      0.00        31
                     hydromedusae_solmundella       0.35      0.29      0.32       123
                    copepod_cyclopoid_oithona       0.43      0.54      0.48       899
                            pteropod_triangle       0.29      0.14      0.19        65
        echinoderm_larva_seastar_brachiolaria       0.55      0.82      0.66       536
                                    heteropod       0.00      0.00      0.00        10
             ctenophore_cydippid_no_tentacles       0.33      0.02      0.04        42
                                tunicate_salp       0.61      0.75      0.68       236
                       copepod_calanoid_large       0.32      0.36      0.34       286
                    unknown_blobs_and_smudges       0.26      0.19      0.22       317
      siphonophore_calycophoran_sphaeronectes       0.32      0.26      0.29       179
 siphonophore_calycophoran_sphaeronectes_stem       0.07      0.02      0.03        57
                          protist_dark_center       0.33      0.05      0.08       108
                             hydromedusae_h15       0.08      0.03      0.04        35
                                  euphausiids       0.19      0.10      0.13       136
                  invertebrate_larvae_other_A       0.00      0.00      0.00        14
                  invertebrate_larvae_other_B       0.00      0.00      0.00        24
                                       ephyra       0.00      0.00      0.00        14
                                 protist_star       0.57      0.46      0.51       113
                appendicularian_fritillaridae       0.00      0.00      0.00        16
                           trichodesmium_tuft       0.30      0.41      0.35       678
                   copepod_calanoid_eucalanus       0.41      0.27      0.33        96
           hydromedusae_shapeA_sideview_small       0.18      0.08      0.11       274
                acantharia_protist_big_center       0.00      0.00      0.00        13
                               detritus_other       0.19      0.22      0.20       914
                        copepod_calanoid_eggs       0.43      0.25      0.32       173
                            chaetognath_other       0.51      0.64      0.57      1934
               echinoderm_larva_pluteus_typeC       0.40      0.31      0.35        80
                          hydromedusae_shapeB       0.07      0.01      0.02       150
                          hydromedusae_shapeA       0.54      0.67      0.60       412
                       hydromedusae_haliscera       0.49      0.56      0.52       229
        hydromedusae_typeD_bell_and_tentacles       0.40      0.32      0.36        56
   siphonophore_calycophoran_rocketship_adult       0.35      0.36      0.36       135
                               unknown_sticks       0.26      0.13      0.17       175
                         hydromedusae_aglaura       0.42      0.28      0.34       127
                   tornaria_acorn_worm_larvae       0.53      0.45      0.49        38
                            euphausiids_young       0.00      0.00      0.00        38
                            ctenophore_cestid       0.53      0.61      0.57       113
                                copepod_other       0.00      0.00      0.00        24
                      chaetognath_non_sagitta       0.64      0.72      0.68       815
                           trochophore_larvae       0.00      0.00      0.00        29
                                   polychaete       0.17      0.06      0.09       131
                        fish_larvae_thin_body       0.30      0.16      0.21        64
        hydromedusae_haliscera_small_sideview       0.00      0.00      0.00         9
                     appendicularian_straight       0.14      0.06      0.09       242
                              shrimp_caridean       0.00      0.00      0.00        49
                            radiolarian_chain       0.12      0.04      0.06       287
               copepod_cyclopoid_oithona_eggs       0.48      0.68      0.56      1189
              echinoderm_larva_pluteus_urchin       0.06      0.01      0.02        88
                          diatom_chain_string       0.45      0.59      0.51       519
                         detritus_filamentous       0.14      0.08      0.10       394
                            jellies_tentacles       0.41      0.34      0.37       141
          echinoderm_larva_seastar_bipinnaria       0.47      0.55      0.51       385
                             copepod_calanoid       0.32      0.28      0.30       681
                            protist_noctiluca       0.41      0.37      0.39       625
                      tunicate_doliolid_nurse       0.23      0.16      0.19       417
                                detritus_blob       0.21      0.14      0.17       363
                         trichodesmium_bowtie       0.36      0.54      0.43       708
                    hydromedusae_sideview_big       0.27      0.12      0.17        76
                           pteropod_butterfly       0.21      0.06      0.10       108
                            ctenophore_lobate       0.72      0.61      0.66        38
                               chordate_type1       0.54      0.68      0.60        77
                           acantharia_protist       0.67      0.76      0.71       889
         echinoderm_larva_pluteus_brittlestar       0.50      0.22      0.31        36
                           hydromedusae_other       0.00      0.00      0.00        12
                     siphonophore_other_parts       0.00      0.00      0.00        29
siphonophore_calycophoran_sphaeronectes_young       0.22      0.18      0.20       247
                                     decapods       0.00      0.00      0.00        55
                    hydromedusae_narcomedusae       0.35      0.24      0.29       132
                                    amphipods       0.00      0.00      0.00        49
                       pteropod_theco_dev_seq       0.00      0.00      0.00        13
                                echinopluteus       0.38      0.33      0.35        27
   siphonophore_calycophoran_rocketship_young       0.33      0.32      0.32       483
          copepod_calanoid_small_longantennae       0.40      0.41      0.40        87
                               artifacts_edge       0.72      0.55      0.62       170
                           radiolarian_colony       0.19      0.18      0.18       158
                            tunicate_doliolid       0.31      0.19      0.23       439
                         siphonophore_partial       0.29      0.07      0.11        30
                         unknown_unclassified       0.08      0.02      0.03       425
                       fish_larvae_myctophids       0.45      0.49      0.47       114
                       siphonophore_physonect       0.44      0.34      0.38       128
                ctenophore_cydippid_tentacles       0.00      0.00      0.00        53
                        hydromedusae_solmaris       0.58      0.78      0.67       703
                             tunicate_partial       0.56      0.81      0.66       352
                           hydromedusae_typeE       0.00      0.00      0.00        14
                           hydromedusae_typeD       0.00      0.00      0.00        43
                           hydromedusae_typeF       0.28      0.16      0.21        61
              copepod_calanoid_frillyAntennae       0.22      0.03      0.06        63
                          chaetognath_sagitta       0.42      0.33      0.37       694
                                    artifacts       0.34      0.36      0.35       393
                      acantharia_protist_halo       0.40      0.25      0.31        71
                      hydromedusae_narco_dark       0.00      0.00      0.00        23
                         tunicate_salp_chains       0.62      0.42      0.50        73
                                   stomatopod       0.00      0.00      0.00        24
                   fish_larvae_very_thin_body       1.00      0.12      0.22        16

                                  avg / total       0.39      0.43      0.40     30336


In [14]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
            true class, integers in [0, n_classes - 1]
            (float-valued labels are truncated to int)
    y_pred : array-like, shape = [n_samples, n_classes]
            predicted probability of each class for each sample
    eps : float
            probabilities are clipped to [eps, 1 - eps] before taking the log

    Returns
    -------
    loss : float
    """
    y_true = np.asarray(y_true)
    # np.clip returns a new array, so the caller's y_pred is never modified
    predictions = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    n_samples = predictions.shape[0]
    # Pick the probability assigned to the true class of every sample
    # directly, instead of multiplying by a full one-hot matrix
    true_class_probs = predictions[np.arange(n_samples), y_true.astype(int)]
    loss = -1.0 / n_samples * np.sum(np.log(true_class_probs))
    return loss

In [15]:
# Get the probability predictions for computing the log-loss function
kf = KFold(y, n_folds=5)
# prediction probabilities number of samples, by number of classes
y_pred = np.zeros((len(y),len(set(y))))
for fold_ii, (train, test) in enumerate(kf):
    print('Fold: %s' % fold_ii)
    X_train, X_test, y_train, y_test = X[train,:], X[test,:], y[train], y[test]
    clf = RF(n_estimators=100, n_jobs=3)
    # Disabled semi-supervised experiment: the recorded run of this cell with
    # LabelSpreading raised a MemoryError while building the graph.
#     clf = LabelSpreading()
#     X_train = np.r_[X_train, X_test]
#     y_train = np.r_[y_train, -np.ones(len(y_test))]
    clf.fit(X_train, y_train)
    # predict_proba returns one column per class seen during fit, ordered as
    # clf.classes_; write them into the matching columns so a class missing
    # from a training fold keeps probability 0 instead of breaking the
    # assignment
    seen_classes = clf.classes_.astype(int)
    y_pred[np.ix_(test, seen_classes)] = clf.predict_proba(X_test)


Fold: 0
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-15-bfdf41a7ee1c> in <module>()
     10     X_train = np.r_[X_train, X_test]
     11     y_train = np.r_[y_train, -np.ones(len(y_test))]
---> 12     clf.fit(X_train, y_train)
     13     y_pred[test] = clf.predict_proba(X_test)

/usr/lib/python2.7/site-packages/sklearn/semi_supervised/label_propagation.pyc in fit(self, X, y)
    211 
    212         # actual graph construction (implementations should override this)
--> 213         graph_matrix = self._build_graph()
    214 
    215         # label construction

/usr/lib/python2.7/site-packages/sklearn/semi_supervised/label_propagation.pyc in _build_graph(self)
    415         affinity_matrix = self._get_kernel(self.X_)
    416         laplacian = graph_laplacian(affinity_matrix, normed=True)
--> 417         laplacian = -laplacian
    418         if sparse.isspmatrix(laplacian):
    419             diag_mask = (laplacian.row == laplacian.col)

MemoryError: 

In [ ]:
# Multi-class log loss of the probability predictions gathered over the folds
multiclass_log_loss(y, y_pred)