In [1]:
import pandas as pd

In [3]:
ls


Data_Exploratory.py.ipynb  sampleSubmission.csv       test.csv                   train.csv

In [9]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [10]:
print 'train columns'
print train.columns
print "=" * 100
print 'test columns'
print test.columns


train columns
Index([u'datetime', u'season', u'holiday', u'workingday', u'weather', u'temp',
       u'atemp', u'humidity', u'windspeed', u'casual', u'registered',
       u'count'],
      dtype='object')
====================================================================================================
test columns
Index([u'datetime', u'season', u'holiday', u'workingday', u'weather', u'temp',
       u'atemp', u'humidity', u'windspeed'],
      dtype='object')

In [11]:
# datetime - hourly date + timestamp  
# season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
# holiday - whether the day is considered a holiday
# workingday - whether the day is neither a weekend nor holiday
# weather - 1: Clear, Few clouds, Partly cloudy 
#           2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
#           3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
#           4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 
# temp - temperature in Celsius
# atemp - "feels like" temperature in Celsius
# humidity - relative humidity
# windspeed - wind speed
# casual - number of non-registered user rentals initiated
# registered - number of registered user rentals initiated
# count - number of total rentals

In [25]:
train.head(5)


Out[25]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0 0 1 1

In [17]:
test.head(5)


Out[17]:
datetime season holiday workingday weather temp atemp humidity windspeed
0 2011-01-20 00:00:00 1 0 1 1 10.66 11.365 56 26.0027
1 2011-01-20 01:00:00 1 0 1 1 10.66 13.635 56 0.0000
2 2011-01-20 02:00:00 1 0 1 1 10.66 13.635 56 0.0000
3 2011-01-20 03:00:00 1 0 1 1 10.66 12.880 56 11.0014
4 2011-01-20 04:00:00 1 0 1 1 10.66 12.880 56 11.0014

In [31]:
# nan vlaues가 있는지 조사해보자
print train.isnull().any(0)
print 
print test.isnull().any(0)
# train : humidity windspeed casual registered count 에 null값이 있군
# test : null값이 없다


datetime      False
season        False
holiday       False
workingday    False
weather       False
temp          False
atemp         False
humidity       True
windspeed      True
casual         True
registered     True
count          True
dtype: bool

datetime      False
season        False
holiday       False
workingday    False
weather       False
temp          False
atemp         False
humidity      False
windspeed     False
dtype: bool

In [38]:
# null값은 한개가 있네. 일부러 이렇게 한걸까?
train.isnull().sum()


Out[38]:
datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      1
windspeed     1
casual        1
registered    1
count         1
dtype: int64

In [40]:
# null값을 제거하자
train = train.dropna()
train.isnull().sum()


Out[40]:
datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [54]:
# month, week, hour을 새로운 feature로 추가해보자
train['month'] = pd.DatetimeIndex(train['datetime']).month
train['week'] = pd.DatetimeIndex(train['datetime']).week
train['hour'] = pd.DatetimeIndex(train['datetime']).hour

train.head(3)


Out[54]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count month week hour
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0 3 13 16 1 52 0
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0 8 32 40 1 52 1
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0 5 27 32 1 52 2

In [56]:
train.describe()


Out[56]:
season holiday workingday weather temp atemp humidity windspeed casual registered count month week hour
count 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000 10168.000000
mean 2.401161 0.028226 0.684304 1.413356 20.612419 24.045256 61.453482 12.904401 36.600315 152.659717 189.260031 6.160405 24.343725 11.542388
std 1.079448 0.165626 0.464815 0.636089 7.876058 8.574981 19.318252 8.226758 50.457351 149.697541 180.673870 3.272325 14.268918 6.916436
min 1.000000 0.000000 0.000000 1.000000 0.820000 0.760000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 0.000000
25% 1.000000 0.000000 0.000000 1.000000 13.940000 16.665000 46.000000 7.001500 4.000000 35.000000 41.000000 3.000000 11.000000 6.000000
50% 2.000000 0.000000 1.000000 1.000000 21.320000 25.000000 61.000000 12.998000 17.000000 115.000000 141.000000 6.000000 24.000000 12.000000
75% 3.000000 0.000000 1.000000 2.000000 27.060000 31.060000 77.000000 16.997900 50.000000 218.000000 281.000000 9.000000 36.000000 18.000000
max 4.000000 1.000000 1.000000 4.000000 41.000000 45.455000 100.000000 56.996900 367.000000 886.000000 977.000000 12.000000 52.000000 23.000000

In [65]:
features = train[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'month', 'week', 'hour']].values.astype("float32")
labels = train['count'].values.astype("int")

In [66]:
print features
print labels


[[  1.   0.   0. ...,   1.  52.   0.]
 [  1.   0.   0. ...,   1.  52.   1.]
 [  1.   0.   0. ...,   1.  52.   2.]
 ..., 
 [  4.   0.   1. ...,  11.  45.  23.]
 [  4.   0.   1. ...,  11.  45.   0.]
 [  4.   0.   1. ...,  11.  45.   1.]]
[16 40 32 ..., 87 55 20]

In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

In [68]:
parameters = {'min_samples_split' : [20], 'n_estimators' : [100, 200]}
clf = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=1, verbose=1)

In [69]:
clf.fit(features, labels)


/Users/dikien/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:413: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3.
  % (min_labels, self.n_folds)), Warning)
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    4.9s
Fitting 3 folds for each of 2 candidates, totalling 6 fits
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   48.1s finished
Out[69]:
GridSearchCV(cv=None,
       estimator=RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'min_samples_split': [20], 'n_estimators': [100, 200]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=1)

In [78]:
print("Best score: %0.3f" % clf.best_score_)
print("Best parameters set:")
best_parameters = clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Best score: 0.017
Best parameters set:
	min_samples_split: 20
	n_estimators: 100

In [80]:
sample = pd.read_csv('sampleSubmission.csv')
sample.head(2)


Out[80]:
datetime count
0 2011-01-20 00:00:00 0
1 2011-01-20 01:00:00 0