In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

try:
    # scikit-learn >= 0.18 exposes the cross-validation helpers here.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older releases only ship the legacy module (removed in 0.20).
    from sklearn.cross_validation import cross_val_score
In [2]:
# Load the training CSV; the following cells read its 'whaleID' column and its
# row count.
# NOTE(review): the modelling cells use a frame named `train` that is not
# created in the visible cells — confirm whether it was meant to come from here.
trainlabel = pd.read_csv('../Data/train.csv')
In [4]:
# Collect the distinct identifiers found in the 'whaleID' column.
whaleIDset = {whale_id for whale_id in trainlabel['whaleID']}
In [6]:
# Compare the number of distinct whale IDs with the number of training rows.
print(len(whaleIDset), len(trainlabel))
In [5]:
# Split `train` into feature columns and the target column.
# Features are every column after the first; the target is the 'label' column.
# NOTE(review): `train` is not defined in the visible cells — confirm its source.
Xcol, ycol = train.columns[1:], 'label'
X, y = train[Xcol], train[ycol]
In [7]:
# Random forest with library-default hyperparameters. The seed is pinned so the
# cross-validation scores and the fitted model do not change from run to run.
clf = RandomForestClassifier(random_state=0)
In [10]:
# 5-fold cross-validation of the classifier on the training features/target;
# report the individual fold scores with their mean and standard deviation.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5)
print(scores, np.mean(scores), np.std(scores))
In [11]:
# Fit the classifier on all of X / y so it can be used for prediction below.
# Left as a bare expression so the notebook echoes the returned estimator.
clf.fit(X, y)
Out[11]:
In [12]:
# Load the test set from the location prefix held in `dloc`.
# NOTE(review): `dloc` is not defined in the visible cells — confirm it is set
# (and ends with a path separator, since it is joined by plain concatenation).
test = pd.read_csv(dloc + 'test.csv')
In [14]:
# Every column of `test` is used as a feature; this binds a second name to the
# same frame rather than copying it.
Xtest = test
In [15]:
# Predict a label for each test row with the classifier fitted above.
ypredict = clf.predict(Xtest)
In [16]:
# Display the predictions for a quick visual check.
ypredict
Out[16]:
In [17]:
# Wrap the predictions in a DataFrame so they can be labelled and written out.
dfpredict = pd.DataFrame(ypredict)
In [22]:
# Name the prediction column 'Label'. `pd.DataFrame(ypredict)` yields a single
# column keyed 0; renaming by key (instead of assigning to `.columns`) keeps
# this cell safe to re-run after further columns have been added to the frame.
dfpredict = dfpredict.rename(columns={0: 'Label'})
In [26]:
# 1-based row identifier for each prediction, sized from the frame itself
# rather than from a hard-coded row count.
dfpredict['ImageId'] = np.arange(1, len(dfpredict) + 1)
In [27]:
# Write the predictions to CSV without the index column.
# NOTE(review): columns are emitted in frame order ('Label', then 'ImageId') —
# confirm the consumer of this file accepts that order. `dloc` is not defined
# in the visible cells.
dfpredict.to_csv(dloc + 'predict_RFbenchmark.csv', index=False)
In [ ]: