In [120]:
import numpy as np
import pandas as pd
dataDf = pd.read_csv('/Users/wy/Desktop/Bike Sharing Demand/train.csv')
In [121]:
dataDf.head()
Out[121]:
datetime
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
0
2011-01-01 00:00:00
1
0
0
1
9.84
14.395
81
0
3
13
16
1
2011-01-01 01:00:00
1
0
0
1
9.02
13.635
80
0
8
32
40
2
2011-01-01 02:00:00
1
0
0
1
9.02
13.635
80
0
5
27
32
3
2011-01-01 03:00:00
1
0
0
1
9.84
14.395
75
0
3
10
13
4
2011-01-01 04:00:00
1
0
0
1
9.84
14.395
75
0
0
1
1
In [97]:
dataDf.tail()
Out[97]:
datetime
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
10881
2012-12-19 19:00:00
4
0
1
1
15.58
19.695
50
26.0027
7
329
336
10882
2012-12-19 20:00:00
4
0
1
1
14.76
17.425
57
15.0013
10
231
241
10883
2012-12-19 21:00:00
4
0
1
1
13.94
15.910
61
15.0013
4
164
168
10884
2012-12-19 22:00:00
4
0
1
1
13.94
17.425
61
6.0032
12
117
129
10885
2012-12-19 23:00:00
4
0
1
1
13.12
16.665
66
8.9981
4
84
88
In [124]:
dataDf['count'].describe()
Out[124]:
count 10886.000000
mean 191.574132
std 181.144454
min 1.000000
25% 42.000000
50% 145.000000
75% 284.000000
max 977.000000
Name: count, dtype: float64
In [99]:
dataDf
Out[99]:
datetime
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
0
2011-01-01 00:00:00
1
0
0
1
9.84
14.395
81
0.0000
3
13
16
1
2011-01-01 01:00:00
1
0
0
1
9.02
13.635
80
0.0000
8
32
40
2
2011-01-01 02:00:00
1
0
0
1
9.02
13.635
80
0.0000
5
27
32
3
2011-01-01 03:00:00
1
0
0
1
9.84
14.395
75
0.0000
3
10
13
4
2011-01-01 04:00:00
1
0
0
1
9.84
14.395
75
0.0000
0
1
1
5
2011-01-01 05:00:00
1
0
0
2
9.84
12.880
75
6.0032
0
1
1
6
2011-01-01 06:00:00
1
0
0
1
9.02
13.635
80
0.0000
2
0
2
7
2011-01-01 07:00:00
1
0
0
1
8.20
12.880
86
0.0000
1
2
3
8
2011-01-01 08:00:00
1
0
0
1
9.84
14.395
75
0.0000
1
7
8
9
2011-01-01 09:00:00
1
0
0
1
13.12
17.425
76
0.0000
8
6
14
10
2011-01-01 10:00:00
1
0
0
1
15.58
19.695
76
16.9979
12
24
36
11
2011-01-01 11:00:00
1
0
0
1
14.76
16.665
81
19.0012
26
30
56
12
2011-01-01 12:00:00
1
0
0
1
17.22
21.210
77
19.0012
29
55
84
13
2011-01-01 13:00:00
1
0
0
2
18.86
22.725
72
19.9995
47
47
94
14
2011-01-01 14:00:00
1
0
0
2
18.86
22.725
72
19.0012
35
71
106
15
2011-01-01 15:00:00
1
0
0
2
18.04
21.970
77
19.9995
40
70
110
16
2011-01-01 16:00:00
1
0
0
2
17.22
21.210
82
19.9995
41
52
93
17
2011-01-01 17:00:00
1
0
0
2
18.04
21.970
82
19.0012
15
52
67
18
2011-01-01 18:00:00
1
0
0
3
17.22
21.210
88
16.9979
9
26
35
19
2011-01-01 19:00:00
1
0
0
3
17.22
21.210
88
16.9979
6
31
37
20
2011-01-01 20:00:00
1
0
0
2
16.40
20.455
87
16.9979
11
25
36
21
2011-01-01 21:00:00
1
0
0
2
16.40
20.455
87
12.9980
3
31
34
22
2011-01-01 22:00:00
1
0
0
2
16.40
20.455
94
15.0013
11
17
28
23
2011-01-01 23:00:00
1
0
0
2
18.86
22.725
88
19.9995
15
24
39
24
2011-01-02 00:00:00
1
0
0
2
18.86
22.725
88
19.9995
4
13
17
25
2011-01-02 01:00:00
1
0
0
2
18.04
21.970
94
16.9979
1
16
17
26
2011-01-02 02:00:00
1
0
0
2
17.22
21.210
100
19.0012
1
8
9
27
2011-01-02 03:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
4
6
28
2011-01-02 04:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
1
3
29
2011-01-02 06:00:00
1
0
0
3
17.22
21.210
77
19.9995
0
2
2
...
...
...
...
...
...
...
...
...
...
...
...
...
10856
2012-12-18 18:00:00
4
0
1
1
15.58
19.695
46
22.0028
13
512
525
10857
2012-12-18 19:00:00
4
0
1
1
15.58
19.695
46
26.0027
19
334
353
10858
2012-12-18 20:00:00
4
0
1
1
14.76
16.665
50
16.9979
4
264
268
10859
2012-12-18 21:00:00
4
0
1
1
14.76
17.425
50
15.0013
9
159
168
10860
2012-12-18 22:00:00
4
0
1
1
13.94
16.665
49
0.0000
5
127
132
10861
2012-12-18 23:00:00
4
0
1
1
13.94
17.425
49
6.0032
1
80
81
10862
2012-12-19 00:00:00
4
0
1
1
12.30
15.910
61
0.0000
6
35
41
10863
2012-12-19 01:00:00
4
0
1
1
12.30
15.910
65
6.0032
1
14
15
10864
2012-12-19 02:00:00
4
0
1
1
11.48
15.150
65
6.0032
1
2
3
10865
2012-12-19 03:00:00
4
0
1
1
10.66
13.635
75
8.9981
0
5
5
10866
2012-12-19 04:00:00
4
0
1
1
9.84
12.120
75
8.9981
1
6
7
10867
2012-12-19 05:00:00
4
0
1
1
10.66
14.395
75
6.0032
2
29
31
10868
2012-12-19 06:00:00
4
0
1
1
9.84
12.880
75
6.0032
3
109
112
10869
2012-12-19 07:00:00
4
0
1
1
10.66
13.635
75
8.9981
3
360
363
10870
2012-12-19 08:00:00
4
0
1
1
9.84
12.880
87
7.0015
13
665
678
10871
2012-12-19 09:00:00
4
0
1
1
11.48
14.395
75
7.0015
8
309
317
10872
2012-12-19 10:00:00
4
0
1
1
13.12
16.665
70
7.0015
17
147
164
10873
2012-12-19 11:00:00
4
0
1
1
16.40
20.455
54
15.0013
31
169
200
10874
2012-12-19 12:00:00
4
0
1
1
16.40
20.455
54
19.0012
33
203
236
10875
2012-12-19 13:00:00
4
0
1
1
17.22
21.210
50
12.9980
30
183
213
10876
2012-12-19 14:00:00
4
0
1
1
17.22
21.210
50
12.9980
33
185
218
10877
2012-12-19 15:00:00
4
0
1
1
17.22
21.210
50
19.0012
28
209
237
10878
2012-12-19 16:00:00
4
0
1
1
17.22
21.210
50
23.9994
37
297
334
10879
2012-12-19 17:00:00
4
0
1
1
16.40
20.455
50
26.0027
26
536
562
10880
2012-12-19 18:00:00
4
0
1
1
15.58
19.695
50
23.9994
23
546
569
10881
2012-12-19 19:00:00
4
0
1
1
15.58
19.695
50
26.0027
7
329
336
10882
2012-12-19 20:00:00
4
0
1
1
14.76
17.425
57
15.0013
10
231
241
10883
2012-12-19 21:00:00
4
0
1
1
13.94
15.910
61
15.0013
4
164
168
10884
2012-12-19 22:00:00
4
0
1
1
13.94
17.425
61
6.0032
12
117
129
10885
2012-12-19 23:00:00
4
0
1
1
13.12
16.665
66
8.9981
4
84
88
10886 rows × 12 columns
In [100]:
dataDf.groupby('season').sum()
Out[100]:
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
season
1
71
1828
3826
33656.90
40904.975
151216
39314.9233
41605
270893
312498
2
48
1893
3889
62376.58
72826.520
166311
36637.5229
129672
458610
588282
3
96
1845
3735
78680.64
88933.960
175250
31453.7195
142718
497944
640662
4
96
1846
3991
45519.02
54843.790
180919
31928.0527
78140
465894
544034
In [101]:
dataDf.groupby('holiday').sum()
Out[101]:
season
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
holiday
0
26448
7412
15008
213937.18
250232.655
654382
135229.1083
376964
1650704
2027668
1
839
0
433
6295.96
7276.590
19314
4105.1101
15171
42637
57808
In [102]:
dataDf.groupby('weather').sum()
Out[102]:
season
holiday
workingday
temp
atemp
humidity
windspeed
casual
registered
count
weather
1
17959
204
4839
147846.82
172565.755
407907
92723.1626
289900
1186163
1476063
2
7171
92
1937
55587.80
65387.220
195831
34517.8506
87246
419914
507160
3
2156
15
635
16790.32
19544.905
69872
12087.2020
14983
87106
102089
4
1
0
1
8.20
11.365
86
6.0032
6
158
164
In [129]:
labels = []
for count in dataDf["count"] :
if count < 42:
labels.append(1)
elif 42 <= count < 145:
labels.append(2)
elif 145 <= count < 284:
labels.append(3)
else:
labels.append(4)
In [131]:
dataDf['labels'] = pd.Series(labels)
In [132]:
dataDf
Out[132]:
datetime
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
labels
0
2011-01-01 00:00:00
1
0
0
1
9.84
14.395
81
0.0000
3
13
16
1
1
2011-01-01 01:00:00
1
0
0
1
9.02
13.635
80
0.0000
8
32
40
1
2
2011-01-01 02:00:00
1
0
0
1
9.02
13.635
80
0.0000
5
27
32
1
3
2011-01-01 03:00:00
1
0
0
1
9.84
14.395
75
0.0000
3
10
13
1
4
2011-01-01 04:00:00
1
0
0
1
9.84
14.395
75
0.0000
0
1
1
1
5
2011-01-01 05:00:00
1
0
0
2
9.84
12.880
75
6.0032
0
1
1
1
6
2011-01-01 06:00:00
1
0
0
1
9.02
13.635
80
0.0000
2
0
2
1
7
2011-01-01 07:00:00
1
0
0
1
8.20
12.880
86
0.0000
1
2
3
1
8
2011-01-01 08:00:00
1
0
0
1
9.84
14.395
75
0.0000
1
7
8
1
9
2011-01-01 09:00:00
1
0
0
1
13.12
17.425
76
0.0000
8
6
14
1
10
2011-01-01 10:00:00
1
0
0
1
15.58
19.695
76
16.9979
12
24
36
1
11
2011-01-01 11:00:00
1
0
0
1
14.76
16.665
81
19.0012
26
30
56
2
12
2011-01-01 12:00:00
1
0
0
1
17.22
21.210
77
19.0012
29
55
84
2
13
2011-01-01 13:00:00
1
0
0
2
18.86
22.725
72
19.9995
47
47
94
2
14
2011-01-01 14:00:00
1
0
0
2
18.86
22.725
72
19.0012
35
71
106
2
15
2011-01-01 15:00:00
1
0
0
2
18.04
21.970
77
19.9995
40
70
110
2
16
2011-01-01 16:00:00
1
0
0
2
17.22
21.210
82
19.9995
41
52
93
2
17
2011-01-01 17:00:00
1
0
0
2
18.04
21.970
82
19.0012
15
52
67
2
18
2011-01-01 18:00:00
1
0
0
3
17.22
21.210
88
16.9979
9
26
35
1
19
2011-01-01 19:00:00
1
0
0
3
17.22
21.210
88
16.9979
6
31
37
1
20
2011-01-01 20:00:00
1
0
0
2
16.40
20.455
87
16.9979
11
25
36
1
21
2011-01-01 21:00:00
1
0
0
2
16.40
20.455
87
12.9980
3
31
34
1
22
2011-01-01 22:00:00
1
0
0
2
16.40
20.455
94
15.0013
11
17
28
1
23
2011-01-01 23:00:00
1
0
0
2
18.86
22.725
88
19.9995
15
24
39
1
24
2011-01-02 00:00:00
1
0
0
2
18.86
22.725
88
19.9995
4
13
17
1
25
2011-01-02 01:00:00
1
0
0
2
18.04
21.970
94
16.9979
1
16
17
1
26
2011-01-02 02:00:00
1
0
0
2
17.22
21.210
100
19.0012
1
8
9
1
27
2011-01-02 03:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
4
6
1
28
2011-01-02 04:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
1
3
1
29
2011-01-02 06:00:00
1
0
0
3
17.22
21.210
77
19.9995
0
2
2
1
...
...
...
...
...
...
...
...
...
...
...
...
...
...
10856
2012-12-18 18:00:00
4
0
1
1
15.58
19.695
46
22.0028
13
512
525
4
10857
2012-12-18 19:00:00
4
0
1
1
15.58
19.695
46
26.0027
19
334
353
4
10858
2012-12-18 20:00:00
4
0
1
1
14.76
16.665
50
16.9979
4
264
268
3
10859
2012-12-18 21:00:00
4
0
1
1
14.76
17.425
50
15.0013
9
159
168
3
10860
2012-12-18 22:00:00
4
0
1
1
13.94
16.665
49
0.0000
5
127
132
2
10861
2012-12-18 23:00:00
4
0
1
1
13.94
17.425
49
6.0032
1
80
81
2
10862
2012-12-19 00:00:00
4
0
1
1
12.30
15.910
61
0.0000
6
35
41
1
10863
2012-12-19 01:00:00
4
0
1
1
12.30
15.910
65
6.0032
1
14
15
1
10864
2012-12-19 02:00:00
4
0
1
1
11.48
15.150
65
6.0032
1
2
3
1
10865
2012-12-19 03:00:00
4
0
1
1
10.66
13.635
75
8.9981
0
5
5
1
10866
2012-12-19 04:00:00
4
0
1
1
9.84
12.120
75
8.9981
1
6
7
1
10867
2012-12-19 05:00:00
4
0
1
1
10.66
14.395
75
6.0032
2
29
31
1
10868
2012-12-19 06:00:00
4
0
1
1
9.84
12.880
75
6.0032
3
109
112
2
10869
2012-12-19 07:00:00
4
0
1
1
10.66
13.635
75
8.9981
3
360
363
4
10870
2012-12-19 08:00:00
4
0
1
1
9.84
12.880
87
7.0015
13
665
678
4
10871
2012-12-19 09:00:00
4
0
1
1
11.48
14.395
75
7.0015
8
309
317
4
10872
2012-12-19 10:00:00
4
0
1
1
13.12
16.665
70
7.0015
17
147
164
3
10873
2012-12-19 11:00:00
4
0
1
1
16.40
20.455
54
15.0013
31
169
200
3
10874
2012-12-19 12:00:00
4
0
1
1
16.40
20.455
54
19.0012
33
203
236
3
10875
2012-12-19 13:00:00
4
0
1
1
17.22
21.210
50
12.9980
30
183
213
3
10876
2012-12-19 14:00:00
4
0
1
1
17.22
21.210
50
12.9980
33
185
218
3
10877
2012-12-19 15:00:00
4
0
1
1
17.22
21.210
50
19.0012
28
209
237
3
10878
2012-12-19 16:00:00
4
0
1
1
17.22
21.210
50
23.9994
37
297
334
4
10879
2012-12-19 17:00:00
4
0
1
1
16.40
20.455
50
26.0027
26
536
562
4
10880
2012-12-19 18:00:00
4
0
1
1
15.58
19.695
50
23.9994
23
546
569
4
10881
2012-12-19 19:00:00
4
0
1
1
15.58
19.695
50
26.0027
7
329
336
4
10882
2012-12-19 20:00:00
4
0
1
1
14.76
17.425
57
15.0013
10
231
241
3
10883
2012-12-19 21:00:00
4
0
1
1
13.94
15.910
61
15.0013
4
164
168
3
10884
2012-12-19 22:00:00
4
0
1
1
13.94
17.425
61
6.0032
12
117
129
2
10885
2012-12-19 23:00:00
4
0
1
1
13.12
16.665
66
8.9981
4
84
88
2
10886 rows × 13 columns
In [105]:
from sklearn.neighbors import KNeighborsClassifier
In [134]:
dataDf_data = dataDf.as_matrix(columns=["season","holiday","season","holiday","workingday","weather","temp","atemp","humidity","windspeed","casual","registered"])
dataDf_target = dataDf["labels"].as_matrix()
In [135]:
dataDf_data
Out[135]:
array([[ 1. , 0. , 1. , ..., 0. , 3. , 13. ],
[ 1. , 0. , 1. , ..., 0. , 8. , 32. ],
[ 1. , 0. , 1. , ..., 0. , 5. , 27. ],
...,
[ 4. , 0. , 4. , ..., 15.0013, 4. , 164. ],
[ 4. , 0. , 4. , ..., 6.0032, 12. , 117. ],
[ 4. , 0. , 4. , ..., 8.9981, 4. , 84. ]])
In [136]:
dataDf_target
Out[136]:
array([1, 1, 1, ..., 3, 2, 2])
In [139]:
from IPython.display import Image
Image(filename='/Users/wy/Desktop/Bike Sharing Demand/image.png')
Out[139]:
In [137]:
# 洗牌 打亂dataDf_data
indices = np.random.permutation(len(dataDf_data))
indices
Out[137]:
array([10594, 9207, 9999, ..., 692, 2992, 8242])
In [138]:
dataDf_data_train = dataDf_data[indices[:-10]]
dataDf_target_train = dataDf_target[indices[:-10]]
dataDf_data_test = dataDf_data[indices[-10:]]
dataDf_target_test = dataDf_target[indices[-10:]]
In [146]:
knn = KNeighborsClassifier()
In [147]:
knn.fit(dataDf_data_train, dataDf_target_train)
Out[147]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_neighbors=5, p=2, weights='uniform')
In [148]:
knn.predict(dataDf_data_test)
Out[148]:
array([4, 1, 2, 1, 3, 1, 1, 1, 2, 4])
In [149]:
dataDf_target_test
Out[149]:
array([4, 1, 2, 1, 3, 1, 1, 1, 2, 4])
In [150]:
knn = KNeighborsClassifier(n_neighbors=10,weights='distance')
In [151]:
knn.fit(dataDf_data_train, dataDf_target_train)
Out[151]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_neighbors=10, p=2, weights='distance')
In [152]:
knn.predict(dataDf_data_test)
Out[152]:
array([4, 1, 2, 1, 3, 1, 1, 1, 2, 4])
In [153]:
dataDf_target_test
Out[153]:
array([4, 1, 2, 1, 3, 1, 1, 1, 2, 4])
Content source: wy36101299/knn-Bike-Sharing-Demand
Similar notebooks: