In [ ]:
import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skimage.color import rgb2gray
from skimage.feature import blob_doh
from skimage.io import imread, imshow, imsave
from skimage.transform import resize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

try:
    # Current scikit-learn exposes cross_val_score from model_selection.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Fall back to the legacy module on old scikit-learn installs.
    from sklearn.cross_validation import cross_val_score
In [12]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
# Read the training label table from disk.
train_csv = '../Data/train.csv'
trainlabel = pd.read_csv(train_csv)
In [4]:
# Distinct values found in the 'whaleID' column of the label table.
whaleIDset = {whale_id for whale_id in trainlabel['whaleID']}
In [6]:
# Compare the number of distinct whale IDs with the number of label rows.
n_whale_ids = len(whaleIDset)
n_label_rows = len(trainlabel)
print(n_whale_ids, n_label_rows)
In [7]:
# Preview the first rows of the label table instead of rendering the whole frame.
trainlabel.head()
Out[7]:
In [29]:
# Load one sample image to experiment with.
sample_image_path = '../../BigData/kaggle-right-whale/imgs_subset/w_300.jpg'
im1 = imread(sample_image_path)
In [30]:
# Display the sample image that was just loaded.
imshow(im1)
Out[30]:
In [17]:
# Grayscale copy of the sample image; this is the input to the blob detector below.
image_gray = rgb2gray(im1)
In [25]:
# Run the determinant-of-Hessian blob detector on the grayscale sample image.
blobs_doh = blob_doh(
    image_gray,
    min_sigma=2,
    max_sigma=300,
    threshold=0.1,
)
In [26]:
# Overlay every detected blob on the original colour image.
# `plt` is imported in the notebook's top import cell.
fig, ax = plt.subplots(1, 1)
ax.imshow(im1, interpolation='nearest')
# Each blob row is unpacked as (row, column, radius). The loop variables are
# deliberately not called `y`/`x`: notebook loop variables are globals, and `y`
# is also the name of the model target, which re-running this cell would clobber.
for blob_row, blob_col, blob_radius in blobs_doh:
    # Circle takes its centre as (x, y), i.e. (column, row).
    outline = plt.Circle((blob_col, blob_row), blob_radius,
                         color='red', linewidth=2, fill=False)
    ax.add_patch(outline)
In [40]:
# Shrink the sample image to a fixed 200 x 300 x 3 array.
small_shape = (200, 300, 3)
im2 = resize(im1, small_shape)
In [41]:
# Display the resized sample image.
imshow(im2)
Out[41]:
In [43]:
# Paths of the images that the following loop resizes. The pattern is '*jpg'
# (no dot), so it matches every name in the directory that ends in "jpg".
files = glob('../../BigData/kaggle-right-whale/imgs/*jpg')
In [ ]:
# Write a 200 x 300 x 3 thumbnail next to every image in `files`, named
# "<stem>_small<ext>".
for file in files:
    stem, ext = os.path.splitext(file)
    if stem.endswith('_small'):
        # The '*jpg' glob also picks up thumbnails written by an earlier run;
        # resizing those again would create "<stem>_small_small<ext>" files.
        continue
    # Use a dedicated name so the sample image held in `im1` is not overwritten.
    imfull = imread(file)
    imsmall = resize(imfull, (200, 300, 3))
    fsmall = stem + '_small' + ext
    imsave(fsmall, imsmall)
In [ ]:
# Directory prefix (note the trailing slash) that later cells prepend to file names
# by plain string concatenation.
dloc = '../../BigData/kaggle-right-whale/imgs/'
In [ ]:
# Build one flattened pixel vector per training row, in the row order of
# `trainlabel`, by loading the "_small" thumbnail of that row's image.
features = []
for image in trainlabel['Image']:
    dfile = dloc + image
    # Derive the thumbnail path from this row's image. The earlier version split
    # `file`, the stale loop variable left over from the resizing cell, so every
    # row loaded the same thumbnail.
    stem, ext = os.path.splitext(dfile)
    fsmall = stem + '_small' + ext
    imdata = imread(fsmall)
    features.append(imdata.flatten())
In [ ]:
# Target column: the whale ID recorded for each training row.
labels = trainlabel['whaleID']
In [ ]:
# Text vectorizer that a later cell fits on the whale-ID strings to obtain a
# numeric target array.
vectorizer = CountVectorizer(min_df=1)
In [5]:
# Targets: fit the vectorizer on the whale-ID strings and densify the result.
label_counts = vectorizer.fit_transform(labels)
y = label_counts.toarray()
# Features: the list of flattened thumbnail vectors.
X = features
In [7]:
# Random forest with otherwise default settings. The seed is fixed so that the
# cross-validation scores and predictions are reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=42)
In [10]:
# 5-fold cross-validation of the classifier; report per-fold scores with their
# mean and standard deviation.
scores = cross_val_score(clf, X, y, cv=5)
score_mean = scores.mean()
score_std = scores.std()
print(scores, score_mean, score_std)
In [11]:
# Fit the classifier on all training rows.
clf.fit(X, y)
Out[11]:
In [12]:
# Load the test table stored under the image directory prefix.
test_csv = dloc + 'test.csv'
test = pd.read_csv(test_csv)
In [14]:
# Use the loaded test table directly as the prediction input.
# NOTE(review): the classifier was fitted on flattened thumbnail pixel vectors,
# whereas this is the raw CSV frame — confirm its columns match those features.
Xtest = test
In [15]:
# Predict with the fitted classifier for every row of `Xtest`.
ypredict = clf.predict(Xtest)
In [16]:
# Show the raw predictions as the cell output.
ypredict
Out[16]:
In [17]:
# Wrap the predictions in a DataFrame so they can be labelled and written out.
dfpredict = pd.DataFrame(data=ypredict)
In [22]:
# Name the prediction column.
# NOTE(review): assigning a single name assumes the frame has exactly one column;
# `y` was built with `.toarray()`, so the predictions may be 2-D — confirm.
dfpredict.columns = ['Label']
In [26]:
# Number the predictions 1..N. N is taken from the frame itself rather than a
# hard-coded 28000, so the assignment works for any number of predicted rows.
dfpredict['ImageId'] = np.arange(1, len(dfpredict) + 1)
In [27]:
# Write the predictions to a CSV file under the image directory prefix, without
# the DataFrame index.
predict_csv = dloc + 'predict_RFbenchmark.csv'
dfpredict.to_csv(predict_csv, index=False)
In [ ]: