In [3]:
#先把数据读进来
import pandas as pd
data = pd.read_csv('kaggle_bike_competition_train.csv', header = 0, error_bad_lines=False)
In [30]:
#看一眼数据长什么样
data.head()
Out[30]:
datetime
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
0
2011-01-01 00:00:00
1
0
0
1
9.84
14.395
81
0.0
3
13
16
1
2011-01-01 01:00:00
1
0
0
1
9.02
13.635
80
0.0
8
32
40
2
2011-01-01 02:00:00
1
0
0
1
9.02
13.635
80
0.0
5
27
32
3
2011-01-01 03:00:00
1
0
0
1
9.84
14.395
75
0.0
3
10
13
4
2011-01-01 04:00:00
1
0
0
1
9.84
14.395
75
0.0
0
1
1
In [32]:
# 处理时间字段
temp = pd.DatetimeIndex(data['datetime'])
data['date'] = temp.date
data['time'] = temp.time
data.head()
Out[32]:
datetime
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
date
time
0
2011-01-01 00:00:00
1
0
0
1
9.84
14.395
81
0.0
3
13
16
2011-01-01
00:00:00
1
2011-01-01 01:00:00
1
0
0
1
9.02
13.635
80
0.0
8
32
40
2011-01-01
01:00:00
2
2011-01-01 02:00:00
1
0
0
1
9.02
13.635
80
0.0
5
27
32
2011-01-01
02:00:00
3
2011-01-01 03:00:00
1
0
0
1
9.84
14.395
75
0.0
3
10
13
2011-01-01
03:00:00
4
2011-01-01 04:00:00
1
0
0
1
9.84
14.395
75
0.0
0
1
1
2011-01-01
04:00:00
In [33]:
# 设定hour这个小时字段
data['hour'] = pd.to_datetime(data.time, format="%H:%M:%S")
data['hour'] = pd.Index(data['hour']).hour
data
Out[33]:
datetime
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
date
time
hour
0
2011-01-01 00:00:00
1
0
0
1
9.84
14.395
81
0.0000
3
13
16
2011-01-01
00:00:00
0
1
2011-01-01 01:00:00
1
0
0
1
9.02
13.635
80
0.0000
8
32
40
2011-01-01
01:00:00
1
2
2011-01-01 02:00:00
1
0
0
1
9.02
13.635
80
0.0000
5
27
32
2011-01-01
02:00:00
2
3
2011-01-01 03:00:00
1
0
0
1
9.84
14.395
75
0.0000
3
10
13
2011-01-01
03:00:00
3
4
2011-01-01 04:00:00
1
0
0
1
9.84
14.395
75
0.0000
0
1
1
2011-01-01
04:00:00
4
5
2011-01-01 05:00:00
1
0
0
2
9.84
12.880
75
6.0032
0
1
1
2011-01-01
05:00:00
5
6
2011-01-01 06:00:00
1
0
0
1
9.02
13.635
80
0.0000
2
0
2
2011-01-01
06:00:00
6
7
2011-01-01 07:00:00
1
0
0
1
8.20
12.880
86
0.0000
1
2
3
2011-01-01
07:00:00
7
8
2011-01-01 08:00:00
1
0
0
1
9.84
14.395
75
0.0000
1
7
8
2011-01-01
08:00:00
8
9
2011-01-01 09:00:00
1
0
0
1
13.12
17.425
76
0.0000
8
6
14
2011-01-01
09:00:00
9
10
2011-01-01 10:00:00
1
0
0
1
15.58
19.695
76
16.9979
12
24
36
2011-01-01
10:00:00
10
11
2011-01-01 11:00:00
1
0
0
1
14.76
16.665
81
19.0012
26
30
56
2011-01-01
11:00:00
11
12
2011-01-01 12:00:00
1
0
0
1
17.22
21.210
77
19.0012
29
55
84
2011-01-01
12:00:00
12
13
2011-01-01 13:00:00
1
0
0
2
18.86
22.725
72
19.9995
47
47
94
2011-01-01
13:00:00
13
14
2011-01-01 14:00:00
1
0
0
2
18.86
22.725
72
19.0012
35
71
106
2011-01-01
14:00:00
14
15
2011-01-01 15:00:00
1
0
0
2
18.04
21.970
77
19.9995
40
70
110
2011-01-01
15:00:00
15
16
2011-01-01 16:00:00
1
0
0
2
17.22
21.210
82
19.9995
41
52
93
2011-01-01
16:00:00
16
17
2011-01-01 17:00:00
1
0
0
2
18.04
21.970
82
19.0012
15
52
67
2011-01-01
17:00:00
17
18
2011-01-01 18:00:00
1
0
0
3
17.22
21.210
88
16.9979
9
26
35
2011-01-01
18:00:00
18
19
2011-01-01 19:00:00
1
0
0
3
17.22
21.210
88
16.9979
6
31
37
2011-01-01
19:00:00
19
20
2011-01-01 20:00:00
1
0
0
2
16.40
20.455
87
16.9979
11
25
36
2011-01-01
20:00:00
20
21
2011-01-01 21:00:00
1
0
0
2
16.40
20.455
87
12.9980
3
31
34
2011-01-01
21:00:00
21
22
2011-01-01 22:00:00
1
0
0
2
16.40
20.455
94
15.0013
11
17
28
2011-01-01
22:00:00
22
23
2011-01-01 23:00:00
1
0
0
2
18.86
22.725
88
19.9995
15
24
39
2011-01-01
23:00:00
23
24
2011-01-02 00:00:00
1
0
0
2
18.86
22.725
88
19.9995
4
13
17
2011-01-02
00:00:00
0
25
2011-01-02 01:00:00
1
0
0
2
18.04
21.970
94
16.9979
1
16
17
2011-01-02
01:00:00
1
26
2011-01-02 02:00:00
1
0
0
2
17.22
21.210
100
19.0012
1
8
9
2011-01-02
02:00:00
2
27
2011-01-02 03:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
4
6
2011-01-02
03:00:00
3
28
2011-01-02 04:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
1
3
2011-01-02
04:00:00
4
29
2011-01-02 06:00:00
1
0
0
3
17.22
21.210
77
19.9995
0
2
2
2011-01-02
06:00:00
6
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
10856
2012-12-18 18:00:00
4
0
1
1
15.58
19.695
46
22.0028
13
512
525
2012-12-18
18:00:00
18
10857
2012-12-18 19:00:00
4
0
1
1
15.58
19.695
46
26.0027
19
334
353
2012-12-18
19:00:00
19
10858
2012-12-18 20:00:00
4
0
1
1
14.76
16.665
50
16.9979
4
264
268
2012-12-18
20:00:00
20
10859
2012-12-18 21:00:00
4
0
1
1
14.76
17.425
50
15.0013
9
159
168
2012-12-18
21:00:00
21
10860
2012-12-18 22:00:00
4
0
1
1
13.94
16.665
49
0.0000
5
127
132
2012-12-18
22:00:00
22
10861
2012-12-18 23:00:00
4
0
1
1
13.94
17.425
49
6.0032
1
80
81
2012-12-18
23:00:00
23
10862
2012-12-19 00:00:00
4
0
1
1
12.30
15.910
61
0.0000
6
35
41
2012-12-19
00:00:00
0
10863
2012-12-19 01:00:00
4
0
1
1
12.30
15.910
65
6.0032
1
14
15
2012-12-19
01:00:00
1
10864
2012-12-19 02:00:00
4
0
1
1
11.48
15.150
65
6.0032
1
2
3
2012-12-19
02:00:00
2
10865
2012-12-19 03:00:00
4
0
1
1
10.66
13.635
75
8.9981
0
5
5
2012-12-19
03:00:00
3
10866
2012-12-19 04:00:00
4
0
1
1
9.84
12.120
75
8.9981
1
6
7
2012-12-19
04:00:00
4
10867
2012-12-19 05:00:00
4
0
1
1
10.66
14.395
75
6.0032
2
29
31
2012-12-19
05:00:00
5
10868
2012-12-19 06:00:00
4
0
1
1
9.84
12.880
75
6.0032
3
109
112
2012-12-19
06:00:00
6
10869
2012-12-19 07:00:00
4
0
1
1
10.66
13.635
75
8.9981
3
360
363
2012-12-19
07:00:00
7
10870
2012-12-19 08:00:00
4
0
1
1
9.84
12.880
87
7.0015
13
665
678
2012-12-19
08:00:00
8
10871
2012-12-19 09:00:00
4
0
1
1
11.48
14.395
75
7.0015
8
309
317
2012-12-19
09:00:00
9
10872
2012-12-19 10:00:00
4
0
1
1
13.12
16.665
70
7.0015
17
147
164
2012-12-19
10:00:00
10
10873
2012-12-19 11:00:00
4
0
1
1
16.40
20.455
54
15.0013
31
169
200
2012-12-19
11:00:00
11
10874
2012-12-19 12:00:00
4
0
1
1
16.40
20.455
54
19.0012
33
203
236
2012-12-19
12:00:00
12
10875
2012-12-19 13:00:00
4
0
1
1
17.22
21.210
50
12.9980
30
183
213
2012-12-19
13:00:00
13
10876
2012-12-19 14:00:00
4
0
1
1
17.22
21.210
50
12.9980
33
185
218
2012-12-19
14:00:00
14
10877
2012-12-19 15:00:00
4
0
1
1
17.22
21.210
50
19.0012
28
209
237
2012-12-19
15:00:00
15
10878
2012-12-19 16:00:00
4
0
1
1
17.22
21.210
50
23.9994
37
297
334
2012-12-19
16:00:00
16
10879
2012-12-19 17:00:00
4
0
1
1
16.40
20.455
50
26.0027
26
536
562
2012-12-19
17:00:00
17
10880
2012-12-19 18:00:00
4
0
1
1
15.58
19.695
50
23.9994
23
546
569
2012-12-19
18:00:00
18
10881
2012-12-19 19:00:00
4
0
1
1
15.58
19.695
50
26.0027
7
329
336
2012-12-19
19:00:00
19
10882
2012-12-19 20:00:00
4
0
1
1
14.76
17.425
57
15.0013
10
231
241
2012-12-19
20:00:00
20
10883
2012-12-19 21:00:00
4
0
1
1
13.94
15.910
61
15.0013
4
164
168
2012-12-19
21:00:00
21
10884
2012-12-19 22:00:00
4
0
1
1
13.94
17.425
61
6.0032
12
117
129
2012-12-19
22:00:00
22
10885
2012-12-19 23:00:00
4
0
1
1
13.12
16.665
66
8.9981
4
84
88
2012-12-19
23:00:00
23
10886 rows × 15 columns
In [35]:
# 我们对时间类的特征做处理,产出一个星期几的类别型变量
data['dayofweek'] = pd.DatetimeIndex(data.date).dayofweek
# 对时间类特征处理,产出一个时间长度变量
data['dateDays'] = (data.date - data.date[0]).astype('timedelta64[D]')
data
Out[35]:
datetime
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
date
time
hour
dayofweek
dateDays
0
2011-01-01 00:00:00
1
0
0
1
9.84
14.395
81
0.0000
3
13
16
2011-01-01
00:00:00
0
5
0.0
1
2011-01-01 01:00:00
1
0
0
1
9.02
13.635
80
0.0000
8
32
40
2011-01-01
01:00:00
1
5
0.0
2
2011-01-01 02:00:00
1
0
0
1
9.02
13.635
80
0.0000
5
27
32
2011-01-01
02:00:00
2
5
0.0
3
2011-01-01 03:00:00
1
0
0
1
9.84
14.395
75
0.0000
3
10
13
2011-01-01
03:00:00
3
5
0.0
4
2011-01-01 04:00:00
1
0
0
1
9.84
14.395
75
0.0000
0
1
1
2011-01-01
04:00:00
4
5
0.0
5
2011-01-01 05:00:00
1
0
0
2
9.84
12.880
75
6.0032
0
1
1
2011-01-01
05:00:00
5
5
0.0
6
2011-01-01 06:00:00
1
0
0
1
9.02
13.635
80
0.0000
2
0
2
2011-01-01
06:00:00
6
5
0.0
7
2011-01-01 07:00:00
1
0
0
1
8.20
12.880
86
0.0000
1
2
3
2011-01-01
07:00:00
7
5
0.0
8
2011-01-01 08:00:00
1
0
0
1
9.84
14.395
75
0.0000
1
7
8
2011-01-01
08:00:00
8
5
0.0
9
2011-01-01 09:00:00
1
0
0
1
13.12
17.425
76
0.0000
8
6
14
2011-01-01
09:00:00
9
5
0.0
10
2011-01-01 10:00:00
1
0
0
1
15.58
19.695
76
16.9979
12
24
36
2011-01-01
10:00:00
10
5
0.0
11
2011-01-01 11:00:00
1
0
0
1
14.76
16.665
81
19.0012
26
30
56
2011-01-01
11:00:00
11
5
0.0
12
2011-01-01 12:00:00
1
0
0
1
17.22
21.210
77
19.0012
29
55
84
2011-01-01
12:00:00
12
5
0.0
13
2011-01-01 13:00:00
1
0
0
2
18.86
22.725
72
19.9995
47
47
94
2011-01-01
13:00:00
13
5
0.0
14
2011-01-01 14:00:00
1
0
0
2
18.86
22.725
72
19.0012
35
71
106
2011-01-01
14:00:00
14
5
0.0
15
2011-01-01 15:00:00
1
0
0
2
18.04
21.970
77
19.9995
40
70
110
2011-01-01
15:00:00
15
5
0.0
16
2011-01-01 16:00:00
1
0
0
2
17.22
21.210
82
19.9995
41
52
93
2011-01-01
16:00:00
16
5
0.0
17
2011-01-01 17:00:00
1
0
0
2
18.04
21.970
82
19.0012
15
52
67
2011-01-01
17:00:00
17
5
0.0
18
2011-01-01 18:00:00
1
0
0
3
17.22
21.210
88
16.9979
9
26
35
2011-01-01
18:00:00
18
5
0.0
19
2011-01-01 19:00:00
1
0
0
3
17.22
21.210
88
16.9979
6
31
37
2011-01-01
19:00:00
19
5
0.0
20
2011-01-01 20:00:00
1
0
0
2
16.40
20.455
87
16.9979
11
25
36
2011-01-01
20:00:00
20
5
0.0
21
2011-01-01 21:00:00
1
0
0
2
16.40
20.455
87
12.9980
3
31
34
2011-01-01
21:00:00
21
5
0.0
22
2011-01-01 22:00:00
1
0
0
2
16.40
20.455
94
15.0013
11
17
28
2011-01-01
22:00:00
22
5
0.0
23
2011-01-01 23:00:00
1
0
0
2
18.86
22.725
88
19.9995
15
24
39
2011-01-01
23:00:00
23
5
0.0
24
2011-01-02 00:00:00
1
0
0
2
18.86
22.725
88
19.9995
4
13
17
2011-01-02
00:00:00
0
6
1.0
25
2011-01-02 01:00:00
1
0
0
2
18.04
21.970
94
16.9979
1
16
17
2011-01-02
01:00:00
1
6
1.0
26
2011-01-02 02:00:00
1
0
0
2
17.22
21.210
100
19.0012
1
8
9
2011-01-02
02:00:00
2
6
1.0
27
2011-01-02 03:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
4
6
2011-01-02
03:00:00
3
6
1.0
28
2011-01-02 04:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
1
3
2011-01-02
04:00:00
4
6
1.0
29
2011-01-02 06:00:00
1
0
0
3
17.22
21.210
77
19.9995
0
2
2
2011-01-02
06:00:00
6
6
1.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
10856
2012-12-18 18:00:00
4
0
1
1
15.58
19.695
46
22.0028
13
512
525
2012-12-18
18:00:00
18
1
717.0
10857
2012-12-18 19:00:00
4
0
1
1
15.58
19.695
46
26.0027
19
334
353
2012-12-18
19:00:00
19
1
717.0
10858
2012-12-18 20:00:00
4
0
1
1
14.76
16.665
50
16.9979
4
264
268
2012-12-18
20:00:00
20
1
717.0
10859
2012-12-18 21:00:00
4
0
1
1
14.76
17.425
50
15.0013
9
159
168
2012-12-18
21:00:00
21
1
717.0
10860
2012-12-18 22:00:00
4
0
1
1
13.94
16.665
49
0.0000
5
127
132
2012-12-18
22:00:00
22
1
717.0
10861
2012-12-18 23:00:00
4
0
1
1
13.94
17.425
49
6.0032
1
80
81
2012-12-18
23:00:00
23
1
717.0
10862
2012-12-19 00:00:00
4
0
1
1
12.30
15.910
61
0.0000
6
35
41
2012-12-19
00:00:00
0
2
718.0
10863
2012-12-19 01:00:00
4
0
1
1
12.30
15.910
65
6.0032
1
14
15
2012-12-19
01:00:00
1
2
718.0
10864
2012-12-19 02:00:00
4
0
1
1
11.48
15.150
65
6.0032
1
2
3
2012-12-19
02:00:00
2
2
718.0
10865
2012-12-19 03:00:00
4
0
1
1
10.66
13.635
75
8.9981
0
5
5
2012-12-19
03:00:00
3
2
718.0
10866
2012-12-19 04:00:00
4
0
1
1
9.84
12.120
75
8.9981
1
6
7
2012-12-19
04:00:00
4
2
718.0
10867
2012-12-19 05:00:00
4
0
1
1
10.66
14.395
75
6.0032
2
29
31
2012-12-19
05:00:00
5
2
718.0
10868
2012-12-19 06:00:00
4
0
1
1
9.84
12.880
75
6.0032
3
109
112
2012-12-19
06:00:00
6
2
718.0
10869
2012-12-19 07:00:00
4
0
1
1
10.66
13.635
75
8.9981
3
360
363
2012-12-19
07:00:00
7
2
718.0
10870
2012-12-19 08:00:00
4
0
1
1
9.84
12.880
87
7.0015
13
665
678
2012-12-19
08:00:00
8
2
718.0
10871
2012-12-19 09:00:00
4
0
1
1
11.48
14.395
75
7.0015
8
309
317
2012-12-19
09:00:00
9
2
718.0
10872
2012-12-19 10:00:00
4
0
1
1
13.12
16.665
70
7.0015
17
147
164
2012-12-19
10:00:00
10
2
718.0
10873
2012-12-19 11:00:00
4
0
1
1
16.40
20.455
54
15.0013
31
169
200
2012-12-19
11:00:00
11
2
718.0
10874
2012-12-19 12:00:00
4
0
1
1
16.40
20.455
54
19.0012
33
203
236
2012-12-19
12:00:00
12
2
718.0
10875
2012-12-19 13:00:00
4
0
1
1
17.22
21.210
50
12.9980
30
183
213
2012-12-19
13:00:00
13
2
718.0
10876
2012-12-19 14:00:00
4
0
1
1
17.22
21.210
50
12.9980
33
185
218
2012-12-19
14:00:00
14
2
718.0
10877
2012-12-19 15:00:00
4
0
1
1
17.22
21.210
50
19.0012
28
209
237
2012-12-19
15:00:00
15
2
718.0
10878
2012-12-19 16:00:00
4
0
1
1
17.22
21.210
50
23.9994
37
297
334
2012-12-19
16:00:00
16
2
718.0
10879
2012-12-19 17:00:00
4
0
1
1
16.40
20.455
50
26.0027
26
536
562
2012-12-19
17:00:00
17
2
718.0
10880
2012-12-19 18:00:00
4
0
1
1
15.58
19.695
50
23.9994
23
546
569
2012-12-19
18:00:00
18
2
718.0
10881
2012-12-19 19:00:00
4
0
1
1
15.58
19.695
50
26.0027
7
329
336
2012-12-19
19:00:00
19
2
718.0
10882
2012-12-19 20:00:00
4
0
1
1
14.76
17.425
57
15.0013
10
231
241
2012-12-19
20:00:00
20
2
718.0
10883
2012-12-19 21:00:00
4
0
1
1
13.94
15.910
61
15.0013
4
164
168
2012-12-19
21:00:00
21
2
718.0
10884
2012-12-19 22:00:00
4
0
1
1
13.94
17.425
61
6.0032
12
117
129
2012-12-19
22:00:00
22
2
718.0
10885
2012-12-19 23:00:00
4
0
1
1
13.12
16.665
66
8.9981
4
84
88
2012-12-19
23:00:00
23
2
718.0
10886 rows × 17 columns
In [36]:
byday = data.groupby('dayofweek')
# 统计下没注册的用户租赁情况
byday['casual'].sum().reset_index()
Out[36]:
dayofweek
casual
0
0
46288
1
1
35365
2
2
34931
3
3
37283
4
4
47402
5
5
100782
6
6
90084
In [37]:
# 统计下注册的用户的租赁情况
byday['registered'].sum().reset_index()
Out[37]:
dayofweek
registered
0
0
249008
1
1
256620
2
2
257295
3
3
269118
4
4
255102
5
5
210736
6
6
195462
In [38]:
data['Saturday']=0
data.Saturday[data.dayofweek==5]=1
data['Sunday']=0
data.Sunday[data.dayofweek==6]=1
data
/opt/conda/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
from ipykernel import kernelapp as app
/opt/conda/envs/python2/lib/python2.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[38]:
datetime
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
count
date
time
hour
dayofweek
dateDays
Saturday
Sunday
0
2011-01-01 00:00:00
1
0
0
1
9.84
14.395
81
0.0000
3
13
16
2011-01-01
00:00:00
0
5
0.0
1
0
1
2011-01-01 01:00:00
1
0
0
1
9.02
13.635
80
0.0000
8
32
40
2011-01-01
01:00:00
1
5
0.0
1
0
2
2011-01-01 02:00:00
1
0
0
1
9.02
13.635
80
0.0000
5
27
32
2011-01-01
02:00:00
2
5
0.0
1
0
3
2011-01-01 03:00:00
1
0
0
1
9.84
14.395
75
0.0000
3
10
13
2011-01-01
03:00:00
3
5
0.0
1
0
4
2011-01-01 04:00:00
1
0
0
1
9.84
14.395
75
0.0000
0
1
1
2011-01-01
04:00:00
4
5
0.0
1
0
5
2011-01-01 05:00:00
1
0
0
2
9.84
12.880
75
6.0032
0
1
1
2011-01-01
05:00:00
5
5
0.0
1
0
6
2011-01-01 06:00:00
1
0
0
1
9.02
13.635
80
0.0000
2
0
2
2011-01-01
06:00:00
6
5
0.0
1
0
7
2011-01-01 07:00:00
1
0
0
1
8.20
12.880
86
0.0000
1
2
3
2011-01-01
07:00:00
7
5
0.0
1
0
8
2011-01-01 08:00:00
1
0
0
1
9.84
14.395
75
0.0000
1
7
8
2011-01-01
08:00:00
8
5
0.0
1
0
9
2011-01-01 09:00:00
1
0
0
1
13.12
17.425
76
0.0000
8
6
14
2011-01-01
09:00:00
9
5
0.0
1
0
10
2011-01-01 10:00:00
1
0
0
1
15.58
19.695
76
16.9979
12
24
36
2011-01-01
10:00:00
10
5
0.0
1
0
11
2011-01-01 11:00:00
1
0
0
1
14.76
16.665
81
19.0012
26
30
56
2011-01-01
11:00:00
11
5
0.0
1
0
12
2011-01-01 12:00:00
1
0
0
1
17.22
21.210
77
19.0012
29
55
84
2011-01-01
12:00:00
12
5
0.0
1
0
13
2011-01-01 13:00:00
1
0
0
2
18.86
22.725
72
19.9995
47
47
94
2011-01-01
13:00:00
13
5
0.0
1
0
14
2011-01-01 14:00:00
1
0
0
2
18.86
22.725
72
19.0012
35
71
106
2011-01-01
14:00:00
14
5
0.0
1
0
15
2011-01-01 15:00:00
1
0
0
2
18.04
21.970
77
19.9995
40
70
110
2011-01-01
15:00:00
15
5
0.0
1
0
16
2011-01-01 16:00:00
1
0
0
2
17.22
21.210
82
19.9995
41
52
93
2011-01-01
16:00:00
16
5
0.0
1
0
17
2011-01-01 17:00:00
1
0
0
2
18.04
21.970
82
19.0012
15
52
67
2011-01-01
17:00:00
17
5
0.0
1
0
18
2011-01-01 18:00:00
1
0
0
3
17.22
21.210
88
16.9979
9
26
35
2011-01-01
18:00:00
18
5
0.0
1
0
19
2011-01-01 19:00:00
1
0
0
3
17.22
21.210
88
16.9979
6
31
37
2011-01-01
19:00:00
19
5
0.0
1
0
20
2011-01-01 20:00:00
1
0
0
2
16.40
20.455
87
16.9979
11
25
36
2011-01-01
20:00:00
20
5
0.0
1
0
21
2011-01-01 21:00:00
1
0
0
2
16.40
20.455
87
12.9980
3
31
34
2011-01-01
21:00:00
21
5
0.0
1
0
22
2011-01-01 22:00:00
1
0
0
2
16.40
20.455
94
15.0013
11
17
28
2011-01-01
22:00:00
22
5
0.0
1
0
23
2011-01-01 23:00:00
1
0
0
2
18.86
22.725
88
19.9995
15
24
39
2011-01-01
23:00:00
23
5
0.0
1
0
24
2011-01-02 00:00:00
1
0
0
2
18.86
22.725
88
19.9995
4
13
17
2011-01-02
00:00:00
0
6
1.0
0
1
25
2011-01-02 01:00:00
1
0
0
2
18.04
21.970
94
16.9979
1
16
17
2011-01-02
01:00:00
1
6
1.0
0
1
26
2011-01-02 02:00:00
1
0
0
2
17.22
21.210
100
19.0012
1
8
9
2011-01-02
02:00:00
2
6
1.0
0
1
27
2011-01-02 03:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
4
6
2011-01-02
03:00:00
3
6
1.0
0
1
28
2011-01-02 04:00:00
1
0
0
2
18.86
22.725
94
12.9980
2
1
3
2011-01-02
04:00:00
4
6
1.0
0
1
29
2011-01-02 06:00:00
1
0
0
3
17.22
21.210
77
19.9995
0
2
2
2011-01-02
06:00:00
6
6
1.0
0
1
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
10856
2012-12-18 18:00:00
4
0
1
1
15.58
19.695
46
22.0028
13
512
525
2012-12-18
18:00:00
18
1
717.0
0
0
10857
2012-12-18 19:00:00
4
0
1
1
15.58
19.695
46
26.0027
19
334
353
2012-12-18
19:00:00
19
1
717.0
0
0
10858
2012-12-18 20:00:00
4
0
1
1
14.76
16.665
50
16.9979
4
264
268
2012-12-18
20:00:00
20
1
717.0
0
0
10859
2012-12-18 21:00:00
4
0
1
1
14.76
17.425
50
15.0013
9
159
168
2012-12-18
21:00:00
21
1
717.0
0
0
10860
2012-12-18 22:00:00
4
0
1
1
13.94
16.665
49
0.0000
5
127
132
2012-12-18
22:00:00
22
1
717.0
0
0
10861
2012-12-18 23:00:00
4
0
1
1
13.94
17.425
49
6.0032
1
80
81
2012-12-18
23:00:00
23
1
717.0
0
0
10862
2012-12-19 00:00:00
4
0
1
1
12.30
15.910
61
0.0000
6
35
41
2012-12-19
00:00:00
0
2
718.0
0
0
10863
2012-12-19 01:00:00
4
0
1
1
12.30
15.910
65
6.0032
1
14
15
2012-12-19
01:00:00
1
2
718.0
0
0
10864
2012-12-19 02:00:00
4
0
1
1
11.48
15.150
65
6.0032
1
2
3
2012-12-19
02:00:00
2
2
718.0
0
0
10865
2012-12-19 03:00:00
4
0
1
1
10.66
13.635
75
8.9981
0
5
5
2012-12-19
03:00:00
3
2
718.0
0
0
10866
2012-12-19 04:00:00
4
0
1
1
9.84
12.120
75
8.9981
1
6
7
2012-12-19
04:00:00
4
2
718.0
0
0
10867
2012-12-19 05:00:00
4
0
1
1
10.66
14.395
75
6.0032
2
29
31
2012-12-19
05:00:00
5
2
718.0
0
0
10868
2012-12-19 06:00:00
4
0
1
1
9.84
12.880
75
6.0032
3
109
112
2012-12-19
06:00:00
6
2
718.0
0
0
10869
2012-12-19 07:00:00
4
0
1
1
10.66
13.635
75
8.9981
3
360
363
2012-12-19
07:00:00
7
2
718.0
0
0
10870
2012-12-19 08:00:00
4
0
1
1
9.84
12.880
87
7.0015
13
665
678
2012-12-19
08:00:00
8
2
718.0
0
0
10871
2012-12-19 09:00:00
4
0
1
1
11.48
14.395
75
7.0015
8
309
317
2012-12-19
09:00:00
9
2
718.0
0
0
10872
2012-12-19 10:00:00
4
0
1
1
13.12
16.665
70
7.0015
17
147
164
2012-12-19
10:00:00
10
2
718.0
0
0
10873
2012-12-19 11:00:00
4
0
1
1
16.40
20.455
54
15.0013
31
169
200
2012-12-19
11:00:00
11
2
718.0
0
0
10874
2012-12-19 12:00:00
4
0
1
1
16.40
20.455
54
19.0012
33
203
236
2012-12-19
12:00:00
12
2
718.0
0
0
10875
2012-12-19 13:00:00
4
0
1
1
17.22
21.210
50
12.9980
30
183
213
2012-12-19
13:00:00
13
2
718.0
0
0
10876
2012-12-19 14:00:00
4
0
1
1
17.22
21.210
50
12.9980
33
185
218
2012-12-19
14:00:00
14
2
718.0
0
0
10877
2012-12-19 15:00:00
4
0
1
1
17.22
21.210
50
19.0012
28
209
237
2012-12-19
15:00:00
15
2
718.0
0
0
10878
2012-12-19 16:00:00
4
0
1
1
17.22
21.210
50
23.9994
37
297
334
2012-12-19
16:00:00
16
2
718.0
0
0
10879
2012-12-19 17:00:00
4
0
1
1
16.40
20.455
50
26.0027
26
536
562
2012-12-19
17:00:00
17
2
718.0
0
0
10880
2012-12-19 18:00:00
4
0
1
1
15.58
19.695
50
23.9994
23
546
569
2012-12-19
18:00:00
18
2
718.0
0
0
10881
2012-12-19 19:00:00
4
0
1
1
15.58
19.695
50
26.0027
7
329
336
2012-12-19
19:00:00
19
2
718.0
0
0
10882
2012-12-19 20:00:00
4
0
1
1
14.76
17.425
57
15.0013
10
231
241
2012-12-19
20:00:00
20
2
718.0
0
0
10883
2012-12-19 21:00:00
4
0
1
1
13.94
15.910
61
15.0013
4
164
168
2012-12-19
21:00:00
21
2
718.0
0
0
10884
2012-12-19 22:00:00
4
0
1
1
13.94
17.425
61
6.0032
12
117
129
2012-12-19
22:00:00
22
2
718.0
0
0
10885
2012-12-19 23:00:00
4
0
1
1
13.12
16.665
66
8.9981
4
84
88
2012-12-19
23:00:00
23
2
718.0
0
0
10886 rows × 19 columns
In [39]:
# remove old data features
dataRel = data.drop(['datetime', 'count','date','time','dayofweek'], axis=1)
dataRel.head()
Out[39]:
season
holiday
workingday
weather
temp
atemp
humidity
windspeed
casual
registered
hour
dateDays
Saturday
Sunday
0
1
0
0
1
9.84
14.395
81
0.0
3
13
0
0.0
1
0
1
1
0
0
1
9.02
13.635
80
0.0
8
32
1
0.0
1
0
2
1
0
0
1
9.02
13.635
80
0.0
5
27
2
0.0
1
0
3
1
0
0
1
9.84
14.395
75
0.0
3
10
3
0.0
1
0
4
1
0
0
1
9.84
14.395
75
0.0
0
1
4
0.0
1
0
In [40]:
from sklearn.feature_extraction import DictVectorizer
# 我们把连续值的属性放入一个dict中
featureConCols = ['temp','atemp','humidity','windspeed','dateDays','hour']
dataFeatureCon = dataRel[featureConCols]
dataFeatureCon = dataFeatureCon.fillna( 'NA' ) #in case I missed any
X_dictCon = dataFeatureCon.T.to_dict().values()
# 把离散值的属性放到另外一个dict中
featureCatCols = ['season','holiday','workingday','weather','Saturday', 'Sunday']
dataFeatureCat = dataRel[featureCatCols]
dataFeatureCat = dataFeatureCat.fillna( 'NA' ) #in case I missed any
X_dictCat = dataFeatureCat.T.to_dict().values()
# 向量化特征
vec = DictVectorizer(sparse = False)
X_vec_cat = vec.fit_transform(X_dictCat)
X_vec_con = vec.fit_transform(X_dictCon)
In [41]:
dataFeatureCon.head()
Out[41]:
temp
atemp
humidity
windspeed
dateDays
hour
0
9.84
14.395
81
0.0
0.0
0
1
9.02
13.635
80
0.0
0.0
1
2
9.02
13.635
80
0.0
0.0
2
3
9.84
14.395
75
0.0
0.0
3
4
9.84
14.395
75
0.0
0.0
4
In [42]:
X_vec_con
Out[42]:
array([[ 14.395 , 0. , 0. , 81. , 9.84 , 0. ],
[ 13.635 , 0. , 1. , 80. , 9.02 , 0. ],
[ 13.635 , 0. , 2. , 80. , 9.02 , 0. ],
...,
[ 15.91 , 718. , 21. , 61. , 13.94 , 15.0013],
[ 17.425 , 718. , 22. , 61. , 13.94 , 6.0032],
[ 16.665 , 718. , 23. , 66. , 13.12 , 8.9981]])
In [43]:
dataFeatureCat.head()
Out[43]:
season
holiday
workingday
weather
Saturday
Sunday
0
1
0
0
1
1
0
1
1
0
0
1
1
0
2
1
0
0
1
1
0
3
1
0
0
1
1
0
4
1
0
0
1
1
0
In [44]:
X_vec_cat
Out[44]:
array([[ 1., 0., 0., 1., 1., 0.],
[ 1., 0., 0., 1., 1., 0.],
[ 1., 0., 0., 1., 1., 0.],
...,
[ 0., 0., 0., 4., 1., 1.],
[ 0., 0., 0., 4., 1., 1.],
[ 0., 0., 0., 4., 1., 1.]])
In [18]:
from sklearn import preprocessing
# 标准化连续值数据
scaler = preprocessing.StandardScaler().fit(X_vec_con)
X_vec_con = scaler.transform(X_vec_con)
X_vec_con
Out[18]:
array([[-1.09273697, -1.70912256, -1.66894356, 0.99321305, -1.33366069,
-1.56775367],
[-1.18242083, -1.70912256, -1.52434128, 0.94124921, -1.43890721,
-1.56775367],
[-1.18242083, -1.70912256, -1.379739 , 0.94124921, -1.43890721,
-1.56775367],
...,
[-0.91395927, 1.70183906, 1.36770431, -0.04606385, -0.80742813,
0.26970368],
[-0.73518157, 1.70183906, 1.51230659, -0.04606385, -0.80742813,
-0.83244247],
[-0.82486544, 1.70183906, 1.65690887, 0.21375537, -0.91267464,
-0.46560752]])
In [20]:
from sklearn import preprocessing
# one-hot编码
enc = preprocessing.OneHotEncoder()
enc.fit(X_vec_cat)
X_vec_cat = enc.transform(X_vec_cat).toarray()
X_vec_cat
Out[20]:
array([[ 1., 0., 0., ..., 1., 1., 0.],
[ 1., 0., 0., ..., 1., 1., 0.],
[ 1., 0., 0., ..., 1., 1., 0.],
...,
[ 0., 1., 1., ..., 0., 0., 1.],
[ 0., 1., 1., ..., 0., 0., 1.],
[ 0., 1., 1., ..., 0., 0., 1.]])
In [22]:
import numpy as np
# combine cat & con features
X_vec = np.concatenate((X_vec_con,X_vec_cat), axis=1)
X_vec
Out[22]:
array([[-1.09273697, -1.70912256, -1.66894356, ..., 1. ,
1. , 0. ],
[-1.18242083, -1.70912256, -1.52434128, ..., 1. ,
1. , 0. ],
[-1.18242083, -1.70912256, -1.379739 , ..., 1. ,
1. , 0. ],
...,
[-0.91395927, 1.70183906, 1.36770431, ..., 0. ,
0. , 1. ],
[-0.73518157, 1.70183906, 1.51230659, ..., 0. ,
0. , 1. ],
[-0.82486544, 1.70183906, 1.65690887, ..., 0. ,
0. , 1. ]])
最后的特征,前6列是标准化过后的连续值特征,后面是编码后的离散值特征
In [23]:
# 对Y向量化
Y_vec_reg = dataRel['registered'].values.astype(float)
Y_vec_cas = dataRel['casual'].values.astype(float)
In [24]:
Y_vec_reg
Out[24]:
array([ 13., 32., 27., ..., 164., 117., 84.])
In [25]:
Y_vec_cas
Out[25]:
array([ 3., 8., 5., ..., 4., 12., 4.])
In [ ]:
In [ ]:
Content source: qiu997018209/MachineLearning
Similar notebooks:
notebook.community | gallery | about