In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
try:
    # scikit-learn >= 0.18: grid search lives in model_selection.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.grid_search import GridSearchCV
In [2]:
# Columns fed to the model. Shared by the train and test matrices so both are
# always built from the same features in the same order.
FEATURE_COLUMNS = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
                   'humidity', 'windspeed', 'month', 'week', 'hour']


def add_datetime_features(frame):
    """Return a copy of ``frame`` with ``month``, ``week`` and ``hour`` columns.

    The three columns are derived from the ``datetime`` column after parsing
    it with ``pd.DatetimeIndex``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``datetime`` column that ``pd.DatetimeIndex`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three extra columns; ``frame`` is not modified.
    """
    stamps = pd.DatetimeIndex(frame['datetime'])
    enriched = frame.copy()
    enriched['month'] = stamps.month
    if hasattr(stamps, 'isocalendar'):
        # ``DatetimeIndex.week`` was deprecated in pandas 1.1 and removed in
        # 2.0. ``isocalendar()`` returns a frame indexed by the timestamps, so
        # take the raw values to keep the assignment positional instead of
        # index-aligned.
        enriched['week'] = stamps.isocalendar()['week'].astype('int64').values
    else:
        # Older pandas without ``isocalendar()``.
        enriched['week'] = stamps.week
    enriched['hour'] = stamps.hour
    return enriched


# Rows containing any missing value are dropped from both files before the
# date features are derived.
train = add_datetime_features(pd.read_csv('train.csv').dropna())
test = add_datetime_features(pd.read_csv('test.csv').dropna())

# Model inputs as float32 matrices; the target is the integer ``count`` column.
trainX = train[FEATURE_COLUMNS].values.astype("float32")
trainY = train['count'].values.astype("int")
testX = test[FEATURE_COLUMNS].values.astype("float32")
In [5]:
# Hyper-parameter grid: one value for min_samples_split, two forest sizes.
parameters = {'min_samples_split' : [20], 'n_estimators' : [100, 200]}

# No ``scoring`` argument is passed, so the search ranks candidates with the
# estimator's own default ``score`` method. The fixed ``random_state`` makes the
# search and the resulting forest reproducible across runs.
# NOTE(review): a classifier treats every distinct integer in trainY as its own
# class label; confirm this is intended for a count target.
clf = GridSearchCV(RandomForestClassifier(random_state=42), parameters, n_jobs=1, verbose=1)
clf.fit(trainX, trainY)

# Report the winning cross-validation score and the tuned parameter values.
print("Best score: %0.3f" % clf.best_score_)
print("Best parameters set:")
best_parameters = clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# GridSearchCV is left at its default ``refit=True``, so ``best_estimator_`` has
# already been refit on all of trainX/trainY with the best parameters; fitting
# it a second time would only repeat that work.
best_model = clf.best_estimator_
preds = best_model.predict(testX)

# Create the submission file. ``columns`` pins the column order so it does not
# depend on how the running pandas/Python version orders dict keys.
submission = pd.DataFrame({"datetime": test['datetime'], "count": preds},
                          columns=["datetime", "count"])
submission.to_csv("submission.csv", index=False)
In [6]:
# Leaderboard (LB) score noted by the author for this submission: 0.72923
# Show only the first rows; the complete frame is already written to
# submission.csv.
submission.head()
Out[6]: