In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
import numpy as np

In [2]:
trainlabel = pd.read_csv('../Data/train.csv')

In [4]:
whaleIDset = set(trainlabel['whaleID'])

In [6]:
print(len(whaleIDset), len(trainlabel))


(447, 4544)

In [5]:
Xcol = train.columns[1:]
ycol = 'label'
X = train[Xcol]
y = train[ycol]

In [7]:
clf = RandomForestClassifier()

In [10]:
scores = cross_val_score(clf, X, y, cv=5)
print(scores, scores.mean(), scores.std())


(array([ 0.93884593,  0.93466619,  0.94082629,  0.93664404,  0.93806575]), 0.93780963793315875, 0.0020726574870475502)

In [11]:
clf.fit(X, y)


Out[11]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [12]:
test = pd.read_csv(dloc + 'test.csv')

In [14]:
Xtest = test

In [15]:
ypredict = clf.predict(Xtest)

In [16]:
ypredict


Out[16]:
array([2, 0, 9, ..., 3, 9, 2])

In [17]:
dfpredict = pd.DataFrame(ypredict)

In [22]:
dfpredict.columns = ['Label']

In [26]:
dfpredict['ImageId'] = np.arange(28000) + 1

In [27]:
dfpredict.to_csv(dloc + 'predict_RFbenchmark.csv', index=False)

In [ ]: