In [ ]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
import numpy as np
from skimage.io import imread, imshow, imsave
from skimage.feature import blob_doh
from skimage.color import rgb2gray
from skimage.transform import resize
from glob import glob
import os
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
%matplotlib inline

In [2]:
trainlabel = pd.read_csv('../Data/train.csv')

In [4]:
whaleIDset = set(trainlabel['whaleID'])

In [6]:
print(len(whaleIDset), len(trainlabel))


(447, 4544)

In [7]:
trainlabel


Out[7]:
Image whaleID
0 w_7812.jpg whale_48813
1 w_4598.jpg whale_09913
2 w_3828.jpg whale_45062
3 w_8734.jpg whale_74162
4 w_3251.jpg whale_99558
5 w_8063.jpg whale_59255
6 w_5624.jpg whale_87291
7 w_825.jpg whale_33152
8 w_3974.jpg whale_88147
9 w_3049.jpg whale_77693
10 w_6660.jpg whale_74625
11 w_3909.jpg whale_06069
12 w_2632.jpg whale_62939
13 w_7754.jpg whale_67036
14 w_1681.jpg whale_78785
15 w_4130.jpg whale_75682
16 w_1194.jpg whale_08017
17 w_5390.jpg whale_23574
18 w_8923.jpg whale_06069
19 w_8790.jpg whale_46169
20 w_4522.jpg whale_11242
21 w_6695.jpg whale_03227
22 w_1052.jpg whale_24815
23 w_7212.jpg whale_36231
24 w_735.jpg whale_89615
25 w_3805.jpg whale_64989
26 w_707.jpg whale_52342
27 w_9271.jpg whale_63948
28 w_7097.jpg whale_80947
29 w_3779.jpg whale_80247
... ... ...
4514 w_8898.jpg whale_63816
4515 w_8913.jpg whale_00195
4516 w_8938.jpg whale_15615
4517 w_8971.jpg whale_64937
4518 w_8983.jpg whale_95370
4519 w_9010.jpg whale_95091
4520 w_9016.jpg whale_72327
4521 w_9057.jpg whale_98645
4522 w_9070.jpg whale_22118
4523 w_9080.jpg whale_66711
4524 w_9118.jpg whale_16406
4525 w_9138.jpg whale_23525
4526 w_9186.jpg whale_00195
4527 w_9191.jpg whale_98746
4528 w_9211.jpg whale_12820
4529 w_9246.jpg whale_33152
4530 w_9248.jpg whale_64634
4531 w_9281.jpg whale_86527
4532 w_9299.jpg whale_21213
4533 w_9304.jpg whale_85464
4534 w_9320.jpg whale_51195
4535 w_9342.jpg whale_85464
4536 w_9379.jpg whale_28892
4537 w_9397.jpg whale_17528
4538 w_9398.jpg whale_16406
4539 w_9399.jpg whale_09651
4540 w_9403.jpg whale_06967
4541 w_9428.jpg whale_55333
4542 w_9450.jpg whale_24458
4543 w_9468.jpg whale_66205

4544 rows × 2 columns


In [29]:
im1 = imread('../../BigData/kaggle-right-whale/imgs_subset/w_300.jpg')

In [30]:
imshow(im1)


Out[30]:
<matplotlib.image.AxesImage at 0x10cadbc50>

In [17]:
image_gray = rgb2gray(im1)

In [25]:
blobs_doh = blob_doh(image_gray, max_sigma=300, threshold=0.1, min_sigma=2)

In [26]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1)
ax.imshow(im1, interpolation='nearest')
for blob in blobs_doh:
    y, x, r = blob
    c = plt.Circle((x, y), r, color='red', linewidth=2, fill=False)
    ax.add_patch(c)



In [40]:
im2 = resize(im1,(200,300,3))

In [41]:
imshow(im2)


Out[41]:
<matplotlib.image.AxesImage at 0x118c4b090>

In [43]:
files = glob('../../BigData/kaggle-right-whale/imgs/*jpg')

In [ ]:
for file in files:
    im1 = imread(file)
    imsmall = resize(im1, (200,300,3))
    splitfile = os.path.splitext(file)
    fsmall = splitfile[0] + '_small' + splitfile[1]
    imsave(fsmall, imsmall)

Make the training set


In [ ]:
dloc = '../../BigData/kaggle-right-whale/imgs/'

In [ ]:
features = []
for image in trainlabel['Image']:
    dfile = dloc + image
    splitfile = os.path.splitext(file)
    fsmall = splitfile[0] + '_small' + splitfile[1]
    imdata = imread(fsmall)
    features.append(imdata.flatten())

In [ ]:
labels = trainlabel['whaleID']

In [ ]:
vectorizer = CountVectorizer(min_df=1)

In [5]:
X = features
y = vectorizer.fit_transform(labels).toarray()

In [7]:
clf = RandomForestClassifier()

In [10]:
scores = cross_val_score(clf, X, y, cv=5)
print(scores, scores.mean(), scores.std())


(array([ 0.93884593,  0.93466619,  0.94082629,  0.93664404,  0.93806575]), 0.93780963793315875, 0.0020726574870475502)

In [11]:
clf.fit(X, y)


Out[11]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
test = pd.read_csv(dloc + 'test.csv')

In [14]:
Xtest = test

In [15]:
ypredict = clf.predict(Xtest)

In [16]:
ypredict


Out[16]:
array([2, 0, 9, ..., 3, 9, 2])

In [17]:
dfpredict = pd.DataFrame(ypredict)

In [22]:
dfpredict.columns = ['Label']

In [26]:
dfpredict['ImageId'] = np.arange(28000) + 1

In [27]:
dfpredict.to_csv(dloc + 'predict_RFbenchmark.csv', index=False)

In [ ]: