In [120]:
import numpy as np
import pandas as pd
dataDf = pd.read_csv('/Users/wy/Desktop/Bike Sharing Demand/train.csv')

kaggle - https://www.kaggle.com/c/bike-sharing-demand

Data Fields

datetime - hourly date + timestamp

season - 1 = spring, 2 = summer, 3 = fall, 4 = winter

holiday - whether the day is considered a holiday

workingday - whether the day is neither a weekend nor holiday

weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy

2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist

3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds

4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog

temp - temperature in Celsius

atemp - "feels like" temperature in Celsius

humidity - relative humidity

windspeed - wind speed

casual - number of non-registered user rentals initiated

registered - number of registered user rentals initiated

count - number of total rentals


In [121]:
dataDf.head()


Out[121]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0 0 1 1

In [97]:
dataDf.tail()


Out[97]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count
10881 2012-12-19 19:00:00 4 0 1 1 15.58 19.695 50 26.0027 7 329 336
10882 2012-12-19 20:00:00 4 0 1 1 14.76 17.425 57 15.0013 10 231 241
10883 2012-12-19 21:00:00 4 0 1 1 13.94 15.910 61 15.0013 4 164 168
10884 2012-12-19 22:00:00 4 0 1 1 13.94 17.425 61 6.0032 12 117 129
10885 2012-12-19 23:00:00 4 0 1 1 13.12 16.665 66 8.9981 4 84 88

In [124]:
dataDf['count'].describe()


Out[124]:
count    10886.000000
mean       191.574132
std        181.144454
min          1.000000
25%         42.000000
50%        145.000000
75%        284.000000
max        977.000000
Name: count, dtype: float64

In [99]:
dataDf


Out[99]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0.0000 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0.0000 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0.0000 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0.0000 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0.0000 0 1 1
5 2011-01-01 05:00:00 1 0 0 2 9.84 12.880 75 6.0032 0 1 1
6 2011-01-01 06:00:00 1 0 0 1 9.02 13.635 80 0.0000 2 0 2
7 2011-01-01 07:00:00 1 0 0 1 8.20 12.880 86 0.0000 1 2 3
8 2011-01-01 08:00:00 1 0 0 1 9.84 14.395 75 0.0000 1 7 8
9 2011-01-01 09:00:00 1 0 0 1 13.12 17.425 76 0.0000 8 6 14
10 2011-01-01 10:00:00 1 0 0 1 15.58 19.695 76 16.9979 12 24 36
11 2011-01-01 11:00:00 1 0 0 1 14.76 16.665 81 19.0012 26 30 56
12 2011-01-01 12:00:00 1 0 0 1 17.22 21.210 77 19.0012 29 55 84
13 2011-01-01 13:00:00 1 0 0 2 18.86 22.725 72 19.9995 47 47 94
14 2011-01-01 14:00:00 1 0 0 2 18.86 22.725 72 19.0012 35 71 106
15 2011-01-01 15:00:00 1 0 0 2 18.04 21.970 77 19.9995 40 70 110
16 2011-01-01 16:00:00 1 0 0 2 17.22 21.210 82 19.9995 41 52 93
17 2011-01-01 17:00:00 1 0 0 2 18.04 21.970 82 19.0012 15 52 67
18 2011-01-01 18:00:00 1 0 0 3 17.22 21.210 88 16.9979 9 26 35
19 2011-01-01 19:00:00 1 0 0 3 17.22 21.210 88 16.9979 6 31 37
20 2011-01-01 20:00:00 1 0 0 2 16.40 20.455 87 16.9979 11 25 36
21 2011-01-01 21:00:00 1 0 0 2 16.40 20.455 87 12.9980 3 31 34
22 2011-01-01 22:00:00 1 0 0 2 16.40 20.455 94 15.0013 11 17 28
23 2011-01-01 23:00:00 1 0 0 2 18.86 22.725 88 19.9995 15 24 39
24 2011-01-02 00:00:00 1 0 0 2 18.86 22.725 88 19.9995 4 13 17
25 2011-01-02 01:00:00 1 0 0 2 18.04 21.970 94 16.9979 1 16 17
26 2011-01-02 02:00:00 1 0 0 2 17.22 21.210 100 19.0012 1 8 9
27 2011-01-02 03:00:00 1 0 0 2 18.86 22.725 94 12.9980 2 4 6
28 2011-01-02 04:00:00 1 0 0 2 18.86 22.725 94 12.9980 2 1 3
29 2011-01-02 06:00:00 1 0 0 3 17.22 21.210 77 19.9995 0 2 2
... ... ... ... ... ... ... ... ... ... ... ... ...
10856 2012-12-18 18:00:00 4 0 1 1 15.58 19.695 46 22.0028 13 512 525
10857 2012-12-18 19:00:00 4 0 1 1 15.58 19.695 46 26.0027 19 334 353
10858 2012-12-18 20:00:00 4 0 1 1 14.76 16.665 50 16.9979 4 264 268
10859 2012-12-18 21:00:00 4 0 1 1 14.76 17.425 50 15.0013 9 159 168
10860 2012-12-18 22:00:00 4 0 1 1 13.94 16.665 49 0.0000 5 127 132
10861 2012-12-18 23:00:00 4 0 1 1 13.94 17.425 49 6.0032 1 80 81
10862 2012-12-19 00:00:00 4 0 1 1 12.30 15.910 61 0.0000 6 35 41
10863 2012-12-19 01:00:00 4 0 1 1 12.30 15.910 65 6.0032 1 14 15
10864 2012-12-19 02:00:00 4 0 1 1 11.48 15.150 65 6.0032 1 2 3
10865 2012-12-19 03:00:00 4 0 1 1 10.66 13.635 75 8.9981 0 5 5
10866 2012-12-19 04:00:00 4 0 1 1 9.84 12.120 75 8.9981 1 6 7
10867 2012-12-19 05:00:00 4 0 1 1 10.66 14.395 75 6.0032 2 29 31
10868 2012-12-19 06:00:00 4 0 1 1 9.84 12.880 75 6.0032 3 109 112
10869 2012-12-19 07:00:00 4 0 1 1 10.66 13.635 75 8.9981 3 360 363
10870 2012-12-19 08:00:00 4 0 1 1 9.84 12.880 87 7.0015 13 665 678
10871 2012-12-19 09:00:00 4 0 1 1 11.48 14.395 75 7.0015 8 309 317
10872 2012-12-19 10:00:00 4 0 1 1 13.12 16.665 70 7.0015 17 147 164
10873 2012-12-19 11:00:00 4 0 1 1 16.40 20.455 54 15.0013 31 169 200
10874 2012-12-19 12:00:00 4 0 1 1 16.40 20.455 54 19.0012 33 203 236
10875 2012-12-19 13:00:00 4 0 1 1 17.22 21.210 50 12.9980 30 183 213
10876 2012-12-19 14:00:00 4 0 1 1 17.22 21.210 50 12.9980 33 185 218
10877 2012-12-19 15:00:00 4 0 1 1 17.22 21.210 50 19.0012 28 209 237
10878 2012-12-19 16:00:00 4 0 1 1 17.22 21.210 50 23.9994 37 297 334
10879 2012-12-19 17:00:00 4 0 1 1 16.40 20.455 50 26.0027 26 536 562
10880 2012-12-19 18:00:00 4 0 1 1 15.58 19.695 50 23.9994 23 546 569
10881 2012-12-19 19:00:00 4 0 1 1 15.58 19.695 50 26.0027 7 329 336
10882 2012-12-19 20:00:00 4 0 1 1 14.76 17.425 57 15.0013 10 231 241
10883 2012-12-19 21:00:00 4 0 1 1 13.94 15.910 61 15.0013 4 164 168
10884 2012-12-19 22:00:00 4 0 1 1 13.94 17.425 61 6.0032 12 117 129
10885 2012-12-19 23:00:00 4 0 1 1 13.12 16.665 66 8.9981 4 84 88

10886 rows × 12 columns


In [100]:
dataDf.groupby('season').sum()


Out[100]:
holiday workingday weather temp atemp humidity windspeed casual registered count
season
1 71 1828 3826 33656.90 40904.975 151216 39314.9233 41605 270893 312498
2 48 1893 3889 62376.58 72826.520 166311 36637.5229 129672 458610 588282
3 96 1845 3735 78680.64 88933.960 175250 31453.7195 142718 497944 640662
4 96 1846 3991 45519.02 54843.790 180919 31928.0527 78140 465894 544034

In [101]:
dataDf.groupby('holiday').sum()


Out[101]:
season workingday weather temp atemp humidity windspeed casual registered count
holiday
0 26448 7412 15008 213937.18 250232.655 654382 135229.1083 376964 1650704 2027668
1 839 0 433 6295.96 7276.590 19314 4105.1101 15171 42637 57808

In [102]:
dataDf.groupby('weather').sum()


Out[102]:
season holiday workingday temp atemp humidity windspeed casual registered count
weather
1 17959 204 4839 147846.82 172565.755 407907 92723.1626 289900 1186163 1476063
2 7171 92 1937 55587.80 65387.220 195831 34517.8506 87246 419914 507160
3 2156 15 635 16790.32 19544.905 69872 12087.2020 14983 87106 102089
4 1 0 1 8.20 11.365 86 6.0032 6 158 164

In [129]:
labels = []
for count in dataDf["count"] :
    if count < 42:
        labels.append(1)
    elif 42 <= count < 145:
        labels.append(2)
    elif 145 <= count < 284:
        labels.append(3)
    else:
        labels.append(4)

In [131]:
dataDf['labels'] = pd.Series(labels)

In [132]:
dataDf


Out[132]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count labels
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0.0000 3 13 16 1
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0.0000 8 32 40 1
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0.0000 5 27 32 1
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0.0000 3 10 13 1
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0.0000 0 1 1 1
5 2011-01-01 05:00:00 1 0 0 2 9.84 12.880 75 6.0032 0 1 1 1
6 2011-01-01 06:00:00 1 0 0 1 9.02 13.635 80 0.0000 2 0 2 1
7 2011-01-01 07:00:00 1 0 0 1 8.20 12.880 86 0.0000 1 2 3 1
8 2011-01-01 08:00:00 1 0 0 1 9.84 14.395 75 0.0000 1 7 8 1
9 2011-01-01 09:00:00 1 0 0 1 13.12 17.425 76 0.0000 8 6 14 1
10 2011-01-01 10:00:00 1 0 0 1 15.58 19.695 76 16.9979 12 24 36 1
11 2011-01-01 11:00:00 1 0 0 1 14.76 16.665 81 19.0012 26 30 56 2
12 2011-01-01 12:00:00 1 0 0 1 17.22 21.210 77 19.0012 29 55 84 2
13 2011-01-01 13:00:00 1 0 0 2 18.86 22.725 72 19.9995 47 47 94 2
14 2011-01-01 14:00:00 1 0 0 2 18.86 22.725 72 19.0012 35 71 106 2
15 2011-01-01 15:00:00 1 0 0 2 18.04 21.970 77 19.9995 40 70 110 2
16 2011-01-01 16:00:00 1 0 0 2 17.22 21.210 82 19.9995 41 52 93 2
17 2011-01-01 17:00:00 1 0 0 2 18.04 21.970 82 19.0012 15 52 67 2
18 2011-01-01 18:00:00 1 0 0 3 17.22 21.210 88 16.9979 9 26 35 1
19 2011-01-01 19:00:00 1 0 0 3 17.22 21.210 88 16.9979 6 31 37 1
20 2011-01-01 20:00:00 1 0 0 2 16.40 20.455 87 16.9979 11 25 36 1
21 2011-01-01 21:00:00 1 0 0 2 16.40 20.455 87 12.9980 3 31 34 1
22 2011-01-01 22:00:00 1 0 0 2 16.40 20.455 94 15.0013 11 17 28 1
23 2011-01-01 23:00:00 1 0 0 2 18.86 22.725 88 19.9995 15 24 39 1
24 2011-01-02 00:00:00 1 0 0 2 18.86 22.725 88 19.9995 4 13 17 1
25 2011-01-02 01:00:00 1 0 0 2 18.04 21.970 94 16.9979 1 16 17 1
26 2011-01-02 02:00:00 1 0 0 2 17.22 21.210 100 19.0012 1 8 9 1
27 2011-01-02 03:00:00 1 0 0 2 18.86 22.725 94 12.9980 2 4 6 1
28 2011-01-02 04:00:00 1 0 0 2 18.86 22.725 94 12.9980 2 1 3 1
29 2011-01-02 06:00:00 1 0 0 3 17.22 21.210 77 19.9995 0 2 2 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
10856 2012-12-18 18:00:00 4 0 1 1 15.58 19.695 46 22.0028 13 512 525 4
10857 2012-12-18 19:00:00 4 0 1 1 15.58 19.695 46 26.0027 19 334 353 4
10858 2012-12-18 20:00:00 4 0 1 1 14.76 16.665 50 16.9979 4 264 268 3
10859 2012-12-18 21:00:00 4 0 1 1 14.76 17.425 50 15.0013 9 159 168 3
10860 2012-12-18 22:00:00 4 0 1 1 13.94 16.665 49 0.0000 5 127 132 2
10861 2012-12-18 23:00:00 4 0 1 1 13.94 17.425 49 6.0032 1 80 81 2
10862 2012-12-19 00:00:00 4 0 1 1 12.30 15.910 61 0.0000 6 35 41 1
10863 2012-12-19 01:00:00 4 0 1 1 12.30 15.910 65 6.0032 1 14 15 1
10864 2012-12-19 02:00:00 4 0 1 1 11.48 15.150 65 6.0032 1 2 3 1
10865 2012-12-19 03:00:00 4 0 1 1 10.66 13.635 75 8.9981 0 5 5 1
10866 2012-12-19 04:00:00 4 0 1 1 9.84 12.120 75 8.9981 1 6 7 1
10867 2012-12-19 05:00:00 4 0 1 1 10.66 14.395 75 6.0032 2 29 31 1
10868 2012-12-19 06:00:00 4 0 1 1 9.84 12.880 75 6.0032 3 109 112 2
10869 2012-12-19 07:00:00 4 0 1 1 10.66 13.635 75 8.9981 3 360 363 4
10870 2012-12-19 08:00:00 4 0 1 1 9.84 12.880 87 7.0015 13 665 678 4
10871 2012-12-19 09:00:00 4 0 1 1 11.48 14.395 75 7.0015 8 309 317 4
10872 2012-12-19 10:00:00 4 0 1 1 13.12 16.665 70 7.0015 17 147 164 3
10873 2012-12-19 11:00:00 4 0 1 1 16.40 20.455 54 15.0013 31 169 200 3
10874 2012-12-19 12:00:00 4 0 1 1 16.40 20.455 54 19.0012 33 203 236 3
10875 2012-12-19 13:00:00 4 0 1 1 17.22 21.210 50 12.9980 30 183 213 3
10876 2012-12-19 14:00:00 4 0 1 1 17.22 21.210 50 12.9980 33 185 218 3
10877 2012-12-19 15:00:00 4 0 1 1 17.22 21.210 50 19.0012 28 209 237 3
10878 2012-12-19 16:00:00 4 0 1 1 17.22 21.210 50 23.9994 37 297 334 4
10879 2012-12-19 17:00:00 4 0 1 1 16.40 20.455 50 26.0027 26 536 562 4
10880 2012-12-19 18:00:00 4 0 1 1 15.58 19.695 50 23.9994 23 546 569 4
10881 2012-12-19 19:00:00 4 0 1 1 15.58 19.695 50 26.0027 7 329 336 4
10882 2012-12-19 20:00:00 4 0 1 1 14.76 17.425 57 15.0013 10 231 241 3
10883 2012-12-19 21:00:00 4 0 1 1 13.94 15.910 61 15.0013 4 164 168 3
10884 2012-12-19 22:00:00 4 0 1 1 13.94 17.425 61 6.0032 12 117 129 2
10885 2012-12-19 23:00:00 4 0 1 1 13.12 16.665 66 8.9981 4 84 88 2

10886 rows × 13 columns


In [105]:
from sklearn.neighbors import KNeighborsClassifier

In [134]:
dataDf_data = dataDf.as_matrix(columns=["season","holiday","season","holiday","workingday","weather","temp","atemp","humidity","windspeed","casual","registered"])
dataDf_target = dataDf["labels"].as_matrix()

In [135]:
dataDf_data


Out[135]:
array([[   1.    ,    0.    ,    1.    , ...,    0.    ,    3.    ,   13.    ],
       [   1.    ,    0.    ,    1.    , ...,    0.    ,    8.    ,   32.    ],
       [   1.    ,    0.    ,    1.    , ...,    0.    ,    5.    ,   27.    ],
       ..., 
       [   4.    ,    0.    ,    4.    , ...,   15.0013,    4.    ,  164.    ],
       [   4.    ,    0.    ,    4.    , ...,    6.0032,   12.    ,  117.    ],
       [   4.    ,    0.    ,    4.    , ...,    8.9981,    4.    ,   84.    ]])

In [136]:
dataDf_target


Out[136]:
array([1, 1, 1, ..., 3, 2, 2])

In [139]:
from IPython.display import Image
Image(filename='/Users/wy/Desktop/Bike Sharing Demand/image.png')


Out[139]:

In [137]:
# 洗牌 打亂dataDf_data
indices = np.random.permutation(len(dataDf_data))
indices


Out[137]:
array([10594,  9207,  9999, ...,   692,  2992,  8242])

In [138]:
dataDf_data_train = dataDf_data[indices[:-10]]
dataDf_target_train = dataDf_target[indices[:-10]]

dataDf_data_test = dataDf_data[indices[-10:]]
dataDf_target_test = dataDf_target[indices[-10:]]

In [146]:
knn = KNeighborsClassifier()

In [147]:
knn.fit(dataDf_data_train, dataDf_target_train)


Out[147]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')

In [148]:
knn.predict(dataDf_data_test)


Out[148]:
array([4, 1, 2, 1, 3, 1, 1, 1, 2, 4])

In [149]:
dataDf_target_test


Out[149]:
array([4, 1, 2, 1, 3, 1, 1, 1, 2, 4])

In [150]:
knn = KNeighborsClassifier(n_neighbors=10,weights='distance')

In [151]:
knn.fit(dataDf_data_train, dataDf_target_train)


Out[151]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=10, p=2, weights='distance')

In [152]:
knn.predict(dataDf_data_test)


Out[152]:
array([4, 1, 2, 1, 3, 1, 1, 1, 2, 4])

In [153]:
dataDf_target_test


Out[153]:
array([4, 1, 2, 1, 3, 1, 1, 1, 2, 4])