notebook.community

Edit and run



In [1]:

    
import pandas as pd



In [3]:

    
ls









    



Data_Exploratory.py.ipynb  sampleSubmission.csv       test.csv                   train.csv



In [9]:

    
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')



In [10]:

    
print 'train columns'
print train.columns
print "=" * 100
print 'test columns'
print test.columns









    



train columns
Index([u'datetime', u'season', u'holiday', u'workingday', u'weather', u'temp',
       u'atemp', u'humidity', u'windspeed', u'casual', u'registered',
       u'count'],
      dtype='object')
====================================================================================================
test columns
Index([u'datetime', u'season', u'holiday', u'workingday', u'weather', u'temp',
       u'atemp', u'humidity', u'windspeed'],
      dtype='object')



In [11]:

    
# datetime - hourly date + timestamp  
# season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
# holiday - whether the day is considered a holiday
# workingday - whether the day is neither a weekend nor holiday
# weather - 1: Clear, Few clouds, Partly cloudy 
#           2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
#           3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
#           4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 
# temp - temperature in Celsius
# atemp - "feels like" temperature in Celsius
# humidity - relative humidity
# windspeed - wind speed
# casual - number of non-registered user rentals initiated
# registered - number of registered user rentals initiated
# count - number of total rentals



In [25]:

    
train.head(5)









    Out[25]:






  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
    
  
  
    
      0
      2011-01-01 00:00:00
      1
      0
      0
      1
      9.84
      14.395
      81
      0
      3
      13
      16
    
    
      1
      2011-01-01 01:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0
      8
      32
      40
    
    
      2
      2011-01-01 02:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0
      5
      27
      32
    
    
      3
      2011-01-01 03:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0
      3
      10
      13
    
    
      4
      2011-01-01 04:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0
      0
      1
      1



In [17]:

    
test.head(5)









    Out[17]:






  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
    
  
  
    
      0
      2011-01-20 00:00:00
      1
      0
      1
      1
      10.66
      11.365
      56
      26.0027
    
    
      1
      2011-01-20 01:00:00
      1
      0
      1
      1
      10.66
      13.635
      56
      0.0000
    
    
      2
      2011-01-20 02:00:00
      1
      0
      1
      1
      10.66
      13.635
      56
      0.0000
    
    
      3
      2011-01-20 03:00:00
      1
      0
      1
      1
      10.66
      12.880
      56
      11.0014
    
    
      4
      2011-01-20 04:00:00
      1
      0
      1
      1
      10.66
      12.880
      56
      11.0014



In [31]:

    
# nan vlaues가 있는지 조사해보자
print train.isnull().any(0)
print 
print test.isnull().any(0)
# train : humidity windspeed casual registered count 에 null값이 있군
# test : null값이 없다









    



datetime      False
season        False
holiday       False
workingday    False
weather       False
temp          False
atemp         False
humidity       True
windspeed      True
casual         True
registered     True
count          True
dtype: bool

datetime      False
season        False
holiday       False
workingday    False
weather       False
temp          False
atemp         False
humidity      False
windspeed     False
dtype: bool



In [38]:

    
# null값은 한개가 있네. 일부러 이렇게 한걸까?
train.isnull().sum()









    Out[38]:





datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      1
windspeed     1
casual        1
registered    1
count         1
dtype: int64



In [40]:

    
# null값을 제거하자
train = train.dropna()
train.isnull().sum()









    Out[40]:





datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64



In [54]:

    
# month, week, hour을 새로운 feature로 추가해보자
train['month'] = pd.DatetimeIndex(train['datetime']).month
train['week'] = pd.DatetimeIndex(train['datetime']).week
train['hour'] = pd.DatetimeIndex(train['datetime']).hour

train.head(3)









    Out[54]:






  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
      month
      week
      hour
    
  
  
    
      0
      2011-01-01 00:00:00
      1
      0
      0
      1
      9.84
      14.395
      81
      0
      3
      13
      16
      1
      52
      0
    
    
      1
      2011-01-01 01:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0
      8
      32
      40
      1
      52
      1
    
    
      2
      2011-01-01 02:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0
      5
      27
      32
      1
      52
      2



In [56]:

    
train.describe()









    Out[56]:






  
    
      
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
      month
      week
      hour
    
  
  
    
      count
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
      10168.000000
    
    
      mean
      2.401161
      0.028226
      0.684304
      1.413356
      20.612419
      24.045256
      61.453482
      12.904401
      36.600315
      152.659717
      189.260031
      6.160405
      24.343725
      11.542388
    
    
      std
      1.079448
      0.165626
      0.464815
      0.636089
      7.876058
      8.574981
      19.318252
      8.226758
      50.457351
      149.697541
      180.673870
      3.272325
      14.268918
      6.916436
    
    
      min
      1.000000
      0.000000
      0.000000
      1.000000
      0.820000
      0.760000
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
      1.000000
      1.000000
      0.000000
    
    
      25%
      1.000000
      0.000000
      0.000000
      1.000000
      13.940000
      16.665000
      46.000000
      7.001500
      4.000000
      35.000000
      41.000000
      3.000000
      11.000000
      6.000000
    
    
      50%
      2.000000
      0.000000
      1.000000
      1.000000
      21.320000
      25.000000
      61.000000
      12.998000
      17.000000
      115.000000
      141.000000
      6.000000
      24.000000
      12.000000
    
    
      75%
      3.000000
      0.000000
      1.000000
      2.000000
      27.060000
      31.060000
      77.000000
      16.997900
      50.000000
      218.000000
      281.000000
      9.000000
      36.000000
      18.000000
    
    
      max
      4.000000
      1.000000
      1.000000
      4.000000
      41.000000
      45.455000
      100.000000
      56.996900
      367.000000
      886.000000
      977.000000
      12.000000
      52.000000
      23.000000



In [65]:

    
features = train[['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'month', 'week', 'hour']].values.astype("float32")
labels = train['count'].values.astype("int")



In [66]:

    
print features
print labels









    



[[  1.   0.   0. ...,   1.  52.   0.]
 [  1.   0.   0. ...,   1.  52.   1.]
 [  1.   0.   0. ...,   1.  52.   2.]
 ..., 
 [  4.   0.   1. ...,  11.  45.  23.]
 [  4.   0.   1. ...,  11.  45.   0.]
 [  4.   0.   1. ...,  11.  45.   1.]]
[16 40 32 ..., 87 55 20]



In [67]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV



In [68]:

    
parameters = {'min_samples_split' : [20], 'n_estimators' : [100, 200]}
clf = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=1, verbose=1)



In [69]:

    
clf.fit(features, labels)









    



/Users/dikien/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:413: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3.
  % (min_labels, self.n_folds)), Warning)
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    4.9s






    



Fitting 3 folds for each of 2 candidates, totalling 6 fits






    



[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   48.1s finished






    Out[69]:





GridSearchCV(cv=None,
       estimator=RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'min_samples_split': [20], 'n_estimators': [100, 200]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=1)



In [78]:

    
print("Best score: %0.3f" % clf.best_score_)
print("Best parameters set:")
best_parameters = clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))









    



Best score: 0.017
Best parameters set:
	min_samples_split: 20
	n_estimators: 100



In [80]:

    
sample = pd.read_csv('sampleSubmission.csv')
sample.head(2)









    Out[80]:






  
    
      
      datetime
      count
    
  
  
    
      0
      2011-01-20 00:00:00
      0
    
    
      1
      2011-01-20 01:00:00
      0

	datetime	season	weather	temp	atemp	humidity	casual	registered	count
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1

	datetime	season	workingday	weather	temp	atemp	humidity	windspeed
0	2011-01-20 00:00:00	1	1	1	10.66	11.365	56	26.0027
1	2011-01-20 01:00:00	1	1	1	10.66	13.635	56	0.0000
2	2011-01-20 02:00:00	1	1	1	10.66	13.635	56	0.0000
3	2011-01-20 03:00:00	1	1	1	10.66	12.880	56	11.0014
4	2011-01-20 04:00:00	1	1	1	10.66	12.880	56	11.0014

	season	holiday	workingday	weather	temp	atemp	humidity	windspeed	casual	registered	count	month	week	hour
count	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000	10168.000000
mean	2.401161	0.028226	0.684304	1.413356	20.612419	24.045256	61.453482	12.904401	36.600315	152.659717	189.260031	6.160405	24.343725	11.542388
std	1.079448	0.165626	0.464815	0.636089	7.876058	8.574981	19.318252	8.226758	50.457351	149.697541	180.673870	3.272325	14.268918	6.916436
min	1.000000	0.000000	0.000000	1.000000	0.820000	0.760000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	0.000000
25%	1.000000	0.000000	0.000000	1.000000	13.940000	16.665000	46.000000	7.001500	4.000000	35.000000	41.000000	3.000000	11.000000	6.000000
50%	2.000000	0.000000	1.000000	1.000000	21.320000	25.000000	61.000000	12.998000	17.000000	115.000000	141.000000	6.000000	24.000000	12.000000
75%	3.000000	0.000000	1.000000	2.000000	27.060000	31.060000	77.000000	16.997900	50.000000	218.000000	281.000000	9.000000	36.000000	18.000000
max	4.000000	1.000000	1.000000	4.000000	41.000000	45.455000	100.000000	56.996900	367.000000	886.000000	977.000000	12.000000	52.000000	23.000000