In [1]:
import numpy as np
import pandas as pd
import os
filepath = '/Users/mac/Desktop/Kaggle_datasets/Taxi_Duration/'
filename01 = 'train.csv'
filename02 = 'test.csv'
filename03 = 'sample_submission.csv'
df_train = pd.read_csv(os.path.join(filepath, filename01))
df_test = pd.read_csv(os.path.join(filepath, filename02))
df_ans = pd.read_csv(os.path.join(filepath, filename03))
In [2]:
df_train_copy = df_train
df_test_copy = df_test
In [3]:
df_train_copy.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id 1458644 non-null object
vendor_id 1458644 non-null int64
pickup_datetime 1458644 non-null object
dropoff_datetime 1458644 non-null object
passenger_count 1458644 non-null int64
pickup_longitude 1458644 non-null float64
pickup_latitude 1458644 non-null float64
dropoff_longitude 1458644 non-null float64
dropoff_latitude 1458644 non-null float64
store_and_fwd_flag 1458644 non-null object
trip_duration 1458644 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB
In [4]:
df_train_copy
Out[4]:
id
vendor_id
pickup_datetime
dropoff_datetime
passenger_count
pickup_longitude
pickup_latitude
dropoff_longitude
dropoff_latitude
store_and_fwd_flag
trip_duration
0
id2875421
2
2016-03-14 17:24:55
2016-03-14 17:32:30
1
-73.982155
40.767937
-73.964630
40.765602
N
455
1
id2377394
1
2016-06-12 00:43:35
2016-06-12 00:54:38
1
-73.980415
40.738564
-73.999481
40.731152
N
663
2
id3858529
2
2016-01-19 11:35:24
2016-01-19 12:10:48
1
-73.979027
40.763939
-74.005333
40.710087
N
2124
3
id3504673
2
2016-04-06 19:32:31
2016-04-06 19:39:40
1
-74.010040
40.719971
-74.012268
40.706718
N
429
4
id2181028
2
2016-03-26 13:30:55
2016-03-26 13:38:10
1
-73.973053
40.793209
-73.972923
40.782520
N
435
5
id0801584
2
2016-01-30 22:01:40
2016-01-30 22:09:03
6
-73.982857
40.742195
-73.992081
40.749184
N
443
6
id1813257
1
2016-06-17 22:34:59
2016-06-17 22:40:40
4
-73.969017
40.757839
-73.957405
40.765896
N
341
7
id1324603
2
2016-05-21 07:54:58
2016-05-21 08:20:49
1
-73.969276
40.797779
-73.922470
40.760559
N
1551
8
id1301050
1
2016-05-27 23:12:23
2016-05-27 23:16:38
1
-73.999481
40.738400
-73.985786
40.732815
N
255
9
id0012891
2
2016-03-10 21:45:01
2016-03-10 22:05:26
1
-73.981049
40.744339
-73.973000
40.789989
N
1225
10
id1436371
2
2016-05-10 22:08:41
2016-05-10 22:29:55
1
-73.982651
40.763840
-74.002228
40.732990
N
1274
11
id1299289
2
2016-05-15 11:16:11
2016-05-15 11:34:59
4
-73.991531
40.749439
-73.956543
40.770630
N
1128
12
id1187965
2
2016-02-19 09:52:46
2016-02-19 10:11:20
2
-73.962982
40.756680
-73.984406
40.760719
N
1114
13
id0799785
2
2016-06-01 20:58:29
2016-06-01 21:02:49
1
-73.956306
40.767941
-73.966110
40.763000
N
260
14
id2900608
2
2016-05-27 00:43:36
2016-05-27 01:07:10
1
-73.992195
40.727226
-73.974655
40.783070
N
1414
15
id3319787
1
2016-05-16 15:29:02
2016-05-16 15:32:33
1
-73.955513
40.768593
-73.948761
40.771545
N
211
16
id3379579
2
2016-04-11 17:29:50
2016-04-11 18:08:26
1
-73.991165
40.755562
-73.999290
40.725353
N
2316
17
id1154431
1
2016-04-14 08:48:26
2016-04-14 09:00:37
1
-73.994255
40.745804
-73.999657
40.723343
N
731
18
id3552682
1
2016-06-27 09:55:13
2016-06-27 10:17:10
1
-74.003983
40.713013
-73.979195
40.749924
N
1317
19
id3390316
2
2016-06-05 13:47:23
2016-06-05 13:51:34
1
-73.983887
40.738197
-73.991203
40.727871
N
251
20
id2070428
1
2016-02-28 02:23:02
2016-02-28 02:31:08
1
-73.980370
40.742420
-73.962852
40.760635
N
486
21
id0809232
2
2016-04-01 12:12:25
2016-04-01 12:23:17
1
-73.979538
40.753361
-73.963997
40.763458
N
652
22
id2352683
1
2016-04-09 03:34:27
2016-04-09 03:41:30
1
-73.995865
40.758812
-73.993324
40.740322
N
423
23
id1603037
1
2016-06-25 10:36:26
2016-06-25 10:55:49
1
-73.993553
40.747173
-74.006142
40.704384
N
1163
24
id3321406
2
2016-06-03 08:15:05
2016-06-03 08:56:30
1
-73.955231
40.777134
-73.788750
40.641472
N
2485
25
id0129640
2
2016-02-14 13:27:56
2016-02-14 13:49:19
1
-73.956581
40.771358
-73.974968
40.732792
N
1283
26
id3587298
1
2016-02-27 21:56:01
2016-02-27 22:14:51
1
-73.983765
40.749874
-73.958832
40.800961
N
1130
27
id2104175
1
2016-06-20 23:07:16
2016-06-20 23:18:50
1
-73.958435
40.713192
-73.949539
40.680252
N
694
28
id3973319
2
2016-06-13 21:57:27
2016-06-13 22:12:19
1
-73.994217
40.713306
-73.982849
40.692299
N
892
29
id1410897
1
2016-03-23 14:10:39
2016-03-23 14:49:30
1
-73.982117
40.756351
-73.865692
40.770988
N
2331
...
...
...
...
...
...
...
...
...
...
...
...
1458614
id2061444
2
2016-02-08 17:16:07
2016-02-08 17:21:45
1
-73.980927
40.767651
-73.965302
40.765251
N
338
1458615
id3182230
1
2016-02-05 17:57:08
2016-02-05 18:11:25
1
-73.991013
40.728321
-73.966766
40.711548
N
857
1458616
id2822294
1
2016-04-22 17:21:14
2016-04-22 17:29:22
1
-73.988327
40.732147
-73.999641
40.734192
N
488
1458617
id0820021
2
2016-04-15 08:31:20
2016-04-15 08:34:48
1
-73.975433
40.752411
-73.973122
40.746780
N
208
1458618
id1046767
2
2016-04-17 01:46:48
2016-04-17 01:52:55
1
-73.987564
40.733387
-74.001129
40.731056
N
367
1458619
id1083860
2
2016-04-23 12:14:15
2016-04-23 12:26:03
1
-73.954773
40.777882
-73.980904
40.782516
N
708
1458620
id0694577
2
2016-04-28 20:51:03
2016-04-28 21:10:25
1
-73.966324
40.758072
-74.006516
40.736641
N
1162
1458621
id3267199
2
2016-05-09 14:33:30
2016-05-09 15:12:45
1
-73.959534
40.782749
-73.990959
40.751091
N
2355
1458622
id0125435
2
2016-02-19 18:26:52
2016-02-19 18:36:04
1
-74.008408
40.721142
-74.000557
40.723911
N
552
1458623
id3369208
1
2016-01-18 20:35:30
2016-01-18 20:44:44
1
-73.991081
40.737408
-73.987671
40.722622
N
554
1458624
id3482902
1
2016-03-01 07:21:04
2016-03-01 07:23:36
1
-73.974693
40.756088
-73.969971
40.762115
N
152
1458625
id3730733
2
2016-01-25 17:21:15
2016-01-25 17:54:37
1
-73.989655
40.740612
-73.961029
40.765366
N
2002
1458626
id0155863
2
2016-01-17 17:21:11
2016-01-17 17:25:15
2
-73.954071
40.767021
-73.950340
40.778233
N
244
1458627
id0439281
2
2016-06-23 10:10:28
2016-06-23 10:25:08
5
-73.981651
40.767708
-73.959183
40.777412
N
880
1458628
id0986544
2
2016-05-30 03:08:19
2016-05-30 03:14:10
2
-73.988632
40.721378
-73.975548
40.728519
N
351
1458629
id3109086
2
2016-06-24 10:33:51
2016-06-24 10:43:52
1
-73.959618
40.808941
-73.947922
40.830189
N
601
1458630
id0287353
2
2016-06-25 03:44:32
2016-06-25 03:53:41
5
-73.991508
40.727135
-73.988136
40.740932
N
549
1458631
id1724231
1
2016-05-14 23:18:23
2016-05-14 23:24:05
3
-73.958946
40.763725
-73.953156
40.780003
N
342
1458632
id0469946
2
2016-03-06 11:04:48
2016-03-06 11:17:45
2
-74.015572
40.710892
-73.996620
40.743633
N
777
1458633
id2432342
1
2016-03-17 19:10:16
2016-03-17 19:26:35
3
-73.979652
40.735279
-73.995522
40.759754
N
979
1458634
id3445276
1
2016-04-03 13:51:25
2016-04-03 14:07:37
2
-73.989075
40.730465
-73.963882
40.773739
N
972
1458635
id3027038
2
2016-05-19 14:46:55
2016-05-19 14:50:52
1
-73.985390
40.763020
-73.989708
40.767502
N
237
1458636
id0405770
2
2016-02-12 10:13:06
2016-02-12 10:26:26
1
-73.863815
40.769684
-73.864395
40.761326
N
800
1458637
id1920898
1
2016-04-17 18:48:16
2016-04-17 19:00:56
1
-73.975357
40.751705
-73.949478
40.776764
N
760
1458638
id1454193
2
2016-02-02 00:39:39
2016-02-02 00:46:33
5
-73.988823
40.736553
-73.989166
40.757393
N
414
1458639
id2376096
2
2016-04-08 13:31:04
2016-04-08 13:44:02
4
-73.982201
40.745522
-73.994911
40.740170
N
778
1458640
id1049543
1
2016-01-10 07:35:15
2016-01-10 07:46:10
1
-74.000946
40.747379
-73.970184
40.796547
N
655
1458641
id2304944
2
2016-04-22 06:57:41
2016-04-22 07:10:25
1
-73.959129
40.768799
-74.004433
40.707371
N
764
1458642
id2714485
1
2016-01-05 15:56:26
2016-01-05 16:02:39
1
-73.982079
40.749062
-73.974632
40.757107
N
373
1458643
id1209952
1
2016-04-05 14:44:25
2016-04-05 14:47:43
1
-73.979538
40.781750
-73.972809
40.790585
N
198
1458644 rows × 11 columns
In [5]:
df_train_copy['delta_long'] = abs(df_train_copy['pickup_longitude']-df_train_copy['dropoff_longitude'])
df_train_copy['delta_la'] = abs(df_train_copy['pickup_latitude']-df_train_copy['dropoff_latitude'])
df_train_copy['dist'] = np.sqrt(df_train_copy['delta_long']**2 + df_train_copy['delta_la']**2)
df_test_copy['delta_long'] = abs(df_test_copy['pickup_longitude']-df_test_copy['dropoff_longitude'])
df_test_copy['delta_la'] = abs(df_test_copy['pickup_latitude']-df_test_copy['dropoff_latitude'])
df_test_copy['dist'] = np.sqrt(df_test_copy['delta_long']**2 + df_test_copy['delta_la']**2)
In [6]:
import matplotlib.pyplot as plt
plt.scatter(df_train_copy.dist, df_train_copy.trip_duration, alpha=0.1)
plt.show()
In [7]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
sns.jointplot(df_train_copy.dist, df_train_copy.trip_duration , data = df_train_copy)
plt.show()
In [17]:
df_train_copy[df_train_copy.dist>2]
Out[17]:
id
vendor_id
pickup_datetime
dropoff_datetime
passenger_count
pickup_longitude
pickup_latitude
dropoff_longitude
dropoff_latitude
store_and_fwd_flag
trip_duration
delta_long
delta_la
dist
184925
id2306955
1
2016-05-07 18:58:53
2016-05-07 19:12:05
1
-72.809669
51.881084
-73.987228
40.750599
N
792
1.177559
11.130486
11.192603
275644
id0978162
1
2016-02-24 16:20:59
2016-02-24 16:35:34
4
-75.354332
34.712234
-73.834923
32.181141
N
875
1.519409
2.531094
2.952124
377068
id0116374
1
2016-04-02 20:33:19
2016-04-02 20:38:01
1
-74.007095
40.717113
-76.963242
38.946033
N
282
2.956146
1.771080
3.446088
397526
id0982904
1
2016-04-28 13:32:14
2016-04-28 14:14:09
2
-73.870682
40.773598
-79.817978
38.963852
N
2515
5.947296
1.809746
6.216551
644165
id0401529
2
2016-06-02 15:19:35
2016-06-02 15:32:59
1
-73.980751
40.757111
-74.240051
38.478298
N
804
0.259300
2.278812
2.293518
910072
id1146400
1
2016-02-15 18:57:32
2016-02-15 19:02:35
2
-73.989914
40.756634
-70.346077
36.398121
N
303
3.643837
4.358513
5.681037
923793
id1001696
1
2016-02-24 21:02:32
2016-02-24 21:25:57
1
-73.972366
40.758633
-79.553535
43.674000
N
1405
5.581169
2.915367
6.296730
974378
id1510552
2
2016-01-06 20:40:52
2016-01-06 20:51:03
5
-71.799896
35.081532
-79.352837
40.436329
N
611
7.552940
5.354797
9.258551
1013474
id3626673
1
2016-05-05 18:02:50
2016-05-05 18:18:23
1
-73.978912
40.756763
-79.338699
41.427902
N
933
5.359787
0.671139
5.401643
1060807
id0838705
1
2016-02-26 19:50:03
2016-02-26 20:08:54
1
-66.972160
44.371944
-69.048019
43.147583
N
1131
2.075859
1.224361
2.410031
1100676
id2644780
1
2016-05-03 16:24:07
2016-05-03 17:18:34
2
-73.991325
40.750023
-79.518616
43.921028
N
3267
5.527290
3.171005
6.372300
1301396
id1216866
1
2016-03-26 22:01:54
2016-03-27 00:47:16
1
-73.981491
40.773251
-76.135719
40.243626
N
9922
2.154228
0.529625
2.218378
In [19]:
df_train_copy[df_train_copy.trip_duration>3000]
Out[19]:
id
vendor_id
pickup_datetime
dropoff_datetime
passenger_count
pickup_longitude
pickup_latitude
dropoff_longitude
dropoff_latitude
store_and_fwd_flag
trip_duration
delta_long
delta_la
dist
55
id3827863
2
2016-04-19 11:29:08
2016-04-19 12:27:56
3
-73.792892
40.657879
-73.809189
40.690182
N
3528
0.016296
0.032303
0.036181
354
id3402983
2
2016-06-30 15:48:06
2016-06-30 17:31:13
1
-73.781898
40.644772
-73.985893
40.760159
N
6187
0.203995
0.115387
0.234367
403
id2693863
1
2016-03-18 08:22:10
2016-03-18 09:47:19
1
-73.777184
40.646500
-73.985001
40.760918
N
5109
0.207817
0.114418
0.237233
531
id3307903
2
2016-02-20 04:03:06
2016-02-21 03:33:00
3
-74.008102
40.741489
-74.009956
40.714611
N
84594
0.001854
0.026878
0.026942
563
id3607196
2
2016-01-26 11:22:27
2016-01-26 12:20:57
1
-74.017250
40.708477
-73.979927
40.761356
N
3510
0.037323
0.052879
0.064724
861
id2029339
2
2016-01-22 14:13:46
2016-01-22 15:15:21
1
-73.873360
40.774109
-73.958115
40.775558
N
3695
0.084755
0.001450
0.084767
976
id3579210
2
2016-01-25 21:05:42
2016-01-25 22:01:52
1
-73.782089
40.644650
-73.974243
40.789761
N
3370
0.192154
0.145111
0.240791
1031
id1211472
1
2016-05-12 14:11:19
2016-05-12 15:03:16
1
-73.951576
40.791344
-73.789146
40.641678
Y
3117
0.162430
0.149666
0.220869
1057
id0631822
2
2016-05-17 14:17:48
2016-05-17 15:26:06
1
-73.995583
40.716949
-73.789101
40.642448
N
4098
0.206482
0.074501
0.219511
1101
id3913560
2
2016-01-11 22:48:55
2016-01-11 23:40:20
2
-74.012657
40.702179
-74.307762
40.567341
N
3085
0.295105
0.134838
0.324451
1113
id3893063
2
2016-06-02 17:32:41
2016-06-02 18:42:43
1
-73.948578
40.778080
-73.781792
40.646790
N
4202
0.166786
0.131290
0.212261
1134
id1091477
2
2016-05-07 18:36:22
2016-05-08 18:32:11
1
-73.990242
40.750919
-73.976280
40.750889
N
86149
0.013962
0.000031
0.013962
1160
id1040844
1
2016-06-03 15:10:40
2016-06-03 16:36:32
3
-73.872971
40.774124
-73.975449
40.765354
N
5152
0.102478
0.008770
0.102853
1248
id2553024
1
2016-04-23 13:43:33
2016-04-23 14:47:19
4
-73.781555
40.644749
-73.989708
40.758984
N
3826
0.208153
0.114235
0.237439
1280
id0896335
1
2016-01-25 11:07:10
2016-01-25 12:04:58
1
-73.776794
40.645473
-73.958183
40.673565
N
3468
0.181389
0.028091
0.183551
1297
id0306216
2
2016-02-08 16:49:18
2016-02-08 17:44:53
2
-73.975739
40.758381
-73.783272
40.643829
N
3335
0.192467
0.114552
0.223977
1331
id3126063
2
2016-03-22 14:10:31
2016-03-22 15:01:06
1
-73.982307
40.723171
-73.807510
40.655022
N
3035
0.174797
0.068150
0.187612
1378
id2295021
2
2016-04-10 17:39:50
2016-04-10 18:48:17
6
-73.903511
40.639160
-73.903107
40.639210
N
4107
0.000404
0.000050
0.000407
1387
id2054300
1
2016-03-25 15:40:55
2016-03-25 16:33:23
1
-73.790184
40.646782
-73.962723
40.758728
N
3148
0.172539
0.111946
0.205673
1395
id3286220
1
2016-03-21 16:36:56
2016-03-21 17:42:45
1
-73.948822
40.773197
-73.782997
40.643906
N
3949
0.165825
0.129292
0.210272
1487
id0067459
2
2016-04-22 11:55:21
2016-04-22 12:50:38
1
-73.945900
40.786091
-73.885216
40.679943
N
3317
0.060684
0.106148
0.122270
1521
id1629416
1
2016-02-29 10:04:11
2016-02-29 10:59:56
1
-73.786102
40.639755
-74.003876
40.716850
N
3345
0.217773
0.077095
0.231017
1531
id0297286
2
2016-03-24 22:33:54
2016-03-24 23:25:02
5
-73.983299
40.766411
-73.936111
40.697361
N
3068
0.047188
0.069050
0.083634
1610
id0324418
2
2016-04-18 11:01:11
2016-04-18 11:52:40
1
-73.782066
40.644733
-73.939980
40.841206
N
3089
0.157913
0.196472
0.252067
1613
id3057665
1
2016-05-20 00:04:32
2016-05-20 01:40:36
1
-73.982430
40.756622
-73.915901
40.618385
N
5764
0.066528
0.138237
0.153413
1618
id0717835
1
2016-05-26 12:52:19
2016-05-26 13:43:45
1
-73.985916
40.736004
-73.789841
40.647041
N
3086
0.196075
0.088963
0.215314
1717
id1297383
1
2016-05-06 23:59:25
2016-05-07 00:50:42
1
-74.005775
40.733086
-73.813446
40.710449
N
3077
0.192329
0.022636
0.193657
1718
id2509513
1
2016-02-12 15:20:26
2016-02-12 16:11:35
1
-73.870949
40.773811
-74.008217
40.733921
N
3069
0.137268
0.039890
0.142947
1810
id3289160
2
2016-06-11 13:16:55
2016-06-11 14:09:27
1
-73.981949
40.761753
-73.981300
40.760811
N
3152
0.000648
0.000942
0.001144
1852
id1212821
2
2016-05-13 15:33:52
2016-05-13 16:35:02
1
-73.872917
40.774200
-73.982277
40.739410
N
3670
0.109360
0.034790
0.114760
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1456758
id2468950
1
2016-06-20 06:26:56
2016-06-20 07:35:47
3
-73.779076
40.647453
-73.984070
40.695400
N
4131
0.204994
0.047947
0.210527
1456769
id1594834
2
2016-04-01 13:52:37
2016-04-01 14:47:41
1
-74.008499
40.746078
-73.873009
40.774288
N
3304
0.135490
0.028210
0.138396
1456832
id3237194
1
2016-05-26 17:41:28
2016-05-26 18:32:41
1
-73.863472
40.770042
-73.988480
40.769516
N
3073
0.125008
0.000526
0.125009
1456907
id2902176
1
2016-06-03 13:55:02
2016-06-03 15:11:11
2
-73.807365
40.655212
-73.994232
40.745899
N
4569
0.186867
0.090687
0.207710
1456972
id2240253
2
2016-05-08 21:14:49
2016-05-08 22:06:39
1
-74.000961
40.746410
-73.916878
40.842361
N
3110
0.084084
0.095951
0.127580
1457118
id1737779
2
2016-04-11 14:58:26
2016-04-11 16:04:17
3
-73.781799
40.644886
-73.986588
40.769478
N
3951
0.204788
0.124592
0.239711
1457207
id1910950
2
2016-06-08 16:29:12
2016-06-09 16:11:03
1
-73.958061
40.800869
-73.964119
40.805515
N
85311
0.006058
0.004646
0.007634
1457307
id0949868
2
2016-06-30 17:44:48
2016-06-30 19:04:10
5
-73.997070
40.746990
-73.782570
40.644119
N
4762
0.214500
0.102871
0.237893
1457326
id0329809
2
2016-04-13 11:14:24
2016-04-13 12:09:45
6
-73.873009
40.774052
-73.974945
40.759502
N
3321
0.101936
0.014549
0.102969
1457659
id2081814
1
2016-05-13 06:57:13
2016-05-13 08:03:19
1
-73.781868
40.644672
-74.010361
40.718979
N
3966
0.228493
0.074306
0.240271
1457752
id1215198
2
2016-02-02 11:31:10
2016-02-03 11:29:44
6
-73.972069
40.794220
-73.961014
40.806728
N
86314
0.011055
0.012508
0.016693
1457871
id1549362
2
2016-02-18 15:54:26
2016-02-18 16:51:07
5
-73.782013
40.646851
-73.982254
40.755718
N
3401
0.200241
0.108868
0.227922
1457877
id1093816
1
2016-03-13 17:19:21
2016-03-13 18:11:55
1
-73.788727
40.641460
-73.991020
40.731461
N
3154
0.202293
0.090000
0.221411
1457965
id0452167
1
2016-02-10 07:46:50
2016-02-10 08:50:18
1
-73.943726
40.835865
-73.972069
40.762417
N
3808
0.028343
0.073448
0.078727
1458011
id0169298
2
2016-05-19 10:12:23
2016-05-19 11:07:13
5
-73.863136
40.769482
-73.997101
40.750431
N
3290
0.133965
0.019051
0.135312
1458034
id2290196
1
2016-04-01 20:16:44
2016-04-01 21:08:32
1
-73.780212
40.645531
-73.981400
40.763084
N
3108
0.201187
0.117554
0.233013
1458053
id0087401
2
2016-04-19 14:14:15
2016-04-19 15:06:46
4
-73.863487
40.769901
-73.985809
40.759800
N
3151
0.122322
0.010101
0.122738
1458058
id1648084
1
2016-05-06 16:06:33
2016-05-06 16:57:54
1
-73.863701
40.769882
-73.980873
40.764423
N
3081
0.117172
0.005459
0.117299
1458076
id2332290
2
2016-04-24 00:58:34
2016-04-25 00:56:16
1
-73.990479
40.760876
-73.983025
40.764557
N
86262
0.007454
0.003681
0.008313
1458142
id1140321
1
2016-06-21 09:19:51
2016-06-21 10:11:00
1
-73.988449
40.769115
-73.782249
40.644234
N
3069
0.206200
0.124882
0.241068
1458223
id0644773
1
2016-04-21 18:34:36
2016-04-21 19:24:51
4
-73.776634
40.645344
-73.976517
40.672150
N
3015
0.199883
0.026806
0.201672
1458247
id3078288
2
2016-01-07 14:40:39
2016-01-07 16:09:39
1
-73.974297
40.742706
-73.881180
40.766731
N
5340
0.093117
0.024025
0.096166
1458263
id3934738
2
2016-06-09 08:37:03
2016-06-09 09:27:50
5
-73.870918
40.773754
-73.998497
40.737244
N
3047
0.127579
0.036510
0.132700
1458275
id2979452
1
2016-05-25 12:57:52
2016-05-25 13:49:37
2
-73.866280
40.767677
-73.984802
40.768402
N
3105
0.118523
0.000725
0.118525
1458310
id1325943
1
2016-05-25 15:08:13
2016-05-25 15:58:24
1
-73.993568
40.724503
-73.955742
40.779602
N
3011
0.037827
0.055099
0.066834
1458328
id2824253
1
2016-03-03 08:09:29
2016-03-03 09:04:10
1
-73.961922
40.800533
-74.177269
40.691124
N
3281
0.215347
0.109409
0.241547
1458329
id0067309
1
2016-06-02 12:31:02
2016-06-02 13:31:49
4
-73.873878
40.773800
-73.973000
40.755688
N
3647
0.099121
0.018112
0.100762
1458333
id1758713
2
2016-04-17 14:56:46
2016-04-17 15:46:59
1
-73.782722
40.644966
-73.974808
40.750660
N
3013
0.192085
0.105694
0.219244
1458550
id2976426
1
2016-06-06 10:48:13
2016-06-06 11:40:31
3
-73.784454
40.648521
-73.972176
40.757133
N
3138
0.187721
0.108612
0.216877
1458600
id0995846
2
2016-05-09 17:26:56
2016-05-09 18:30:37
2
-73.789543
40.647099
-73.960320
40.798180
N
3821
0.170776
0.151081
0.228013
23823 rows × 14 columns
In [8]:
df_train_copy['vendor_id'].value_counts()
Out[8]:
2 780302
1 678342
Name: vendor_id, dtype: int64
In [9]:
df_train_copy['store_and_fwd_flag'].value_counts()
Out[9]:
N 1450599
Y 8045
Name: store_and_fwd_flag, dtype: int64
In [10]:
df_train_copy.columns
Out[10]:
Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
'passenger_count', 'pickup_longitude', 'pickup_latitude',
'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
'trip_duration', 'delta_long', 'delta_la', 'dist'],
dtype='object')
In [11]:
df_train_copy['trip_duration'].describe()
Out[11]:
count 1.458644e+06
mean 9.594923e+02
std 5.237432e+03
min 1.000000e+00
25% 3.970000e+02
50% 6.620000e+02
75% 1.075000e+03
max 3.526282e+06
Name: trip_duration, dtype: float64
In [14]:
df_train_copy['dist'].describe()
Out[14]:
count 1.458644e+06
mean 3.548232e-02
std 4.596807e-02
min 0.000000e+00
25% 1.258443e-02
50% 2.121671e-02
75% 3.840884e-02
max 1.119260e+01
Name: dist, dtype: float64
In [8]:
from datetime import datetime
t = datetime.strptime(df_train_copy['pickup_datetime'][0], '%Y-%m-%d %H:%M:%S')
df_train_copy['month'][0] = t.month
df_train_copy['hour'][0] = t.hour
print(df_train_copy['month'][0])
print(df_train_copy['hour'][0])
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
//anaconda/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2133 try:
-> 2134 return self._engine.get_loc(key)
2135 except KeyError:
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'month'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-8-aa0342a008b4> in <module>()
2
3 t = datetime.strptime(df_train_copy['pickup_datetime'][0], '%Y-%m-%d %H:%M:%S')
----> 4 df_train_copy['month'][0] = t.month
5 df_train_copy['hour'][0] = t.hour
6
//anaconda/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
//anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
//anaconda/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1384 res = cache.get(item)
1385 if res is None:
-> 1386 values = self._data.get(item)
1387 res = self._box_item_values(item, values)
1388 cache[item] = res
//anaconda/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3541
3542 if not isnull(item):
-> 3543 loc = self.items.get_loc(item)
3544 else:
3545 indexer = np.arange(len(self.items))[isnull(self.items)]
//anaconda/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2134 return self._engine.get_loc(key)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
2138 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'month'
In [10]:
from datetime import datetime
pickup_datetime = df_train_copy['pickup_datetime'].tolist()
month = []
hour = []
for i in range(len(df_train_copy)):
t = datetime.strptime(pickup_datetime[i],'%Y-%m-%d %H:%M:%S')
month.append(t.month)
hour.append(t.hour)
if i % 100000 == 0:
print('already: ', i)
df_train_copy['month'] = month
df_train_copy['hour'] = hour
already: 0
already: 10000
already: 20000
already: 30000
already: 40000
already: 50000
already: 60000
already: 70000
already: 80000
already: 90000
already: 100000
already: 110000
already: 120000
already: 130000
already: 140000
already: 150000
already: 160000
already: 170000
already: 180000
already: 190000
already: 200000
already: 210000
already: 220000
already: 230000
already: 240000
already: 250000
already: 260000
already: 270000
already: 280000
already: 290000
already: 300000
already: 310000
already: 320000
already: 330000
already: 340000
already: 350000
already: 360000
already: 370000
already: 380000
already: 390000
already: 400000
already: 410000
already: 420000
already: 430000
already: 440000
already: 450000
already: 460000
already: 470000
already: 480000
already: 490000
already: 500000
already: 510000
already: 520000
already: 530000
already: 540000
already: 550000
already: 560000
already: 570000
already: 580000
already: 590000
already: 600000
already: 610000
already: 620000
already: 630000
already: 640000
already: 650000
already: 660000
already: 670000
already: 680000
already: 690000
already: 700000
already: 710000
already: 720000
already: 730000
already: 740000
already: 750000
already: 760000
already: 770000
already: 780000
already: 790000
already: 800000
already: 810000
already: 820000
already: 830000
already: 840000
already: 850000
already: 860000
already: 870000
already: 880000
already: 890000
already: 900000
already: 910000
already: 920000
already: 930000
already: 940000
already: 950000
already: 960000
already: 970000
already: 980000
already: 990000
already: 1000000
already: 1010000
already: 1020000
already: 1030000
already: 1040000
already: 1050000
already: 1060000
already: 1070000
already: 1080000
already: 1090000
already: 1100000
already: 1110000
already: 1120000
already: 1130000
already: 1140000
already: 1150000
already: 1160000
already: 1170000
already: 1180000
already: 1190000
already: 1200000
already: 1210000
already: 1220000
already: 1230000
already: 1240000
already: 1250000
already: 1260000
already: 1270000
already: 1280000
already: 1290000
already: 1300000
already: 1310000
already: 1320000
already: 1330000
already: 1340000
already: 1350000
already: 1360000
already: 1370000
already: 1380000
already: 1390000
already: 1400000
already: 1410000
already: 1420000
already: 1430000
already: 1440000
already: 1450000
In [11]:
df_train_copy
Out[11]:
id
vendor_id
pickup_datetime
dropoff_datetime
passenger_count
pickup_longitude
pickup_latitude
dropoff_longitude
dropoff_latitude
store_and_fwd_flag
trip_duration
delta_long
delta_la
dist
month
hour
0
id2875421
2
2016-03-14 17:24:55
2016-03-14 17:32:30
1
-73.982155
40.767937
-73.964630
40.765602
N
455
0.017525
0.002335
0.017680
3
17
1
id2377394
1
2016-06-12 00:43:35
2016-06-12 00:54:38
1
-73.980415
40.738564
-73.999481
40.731152
N
663
0.019066
0.007412
0.020456
6
0
2
id3858529
2
2016-01-19 11:35:24
2016-01-19 12:10:48
1
-73.979027
40.763939
-74.005333
40.710087
N
2124
0.026306
0.053852
0.059934
1
11
3
id3504673
2
2016-04-06 19:32:31
2016-04-06 19:39:40
1
-74.010040
40.719971
-74.012268
40.706718
N
429
0.002228
0.013252
0.013438
4
19
4
id2181028
2
2016-03-26 13:30:55
2016-03-26 13:38:10
1
-73.973053
40.793209
-73.972923
40.782520
N
435
0.000130
0.010689
0.010690
3
13
5
id0801584
2
2016-01-30 22:01:40
2016-01-30 22:09:03
6
-73.982857
40.742195
-73.992081
40.749184
N
443
0.009224
0.006989
0.011572
1
22
6
id1813257
1
2016-06-17 22:34:59
2016-06-17 22:40:40
4
-73.969017
40.757839
-73.957405
40.765896
N
341
0.011612
0.008057
0.014133
6
22
7
id1324603
2
2016-05-21 07:54:58
2016-05-21 08:20:49
1
-73.969276
40.797779
-73.922470
40.760559
N
1551
0.046806
0.037220
0.059801
5
7
8
id1301050
1
2016-05-27 23:12:23
2016-05-27 23:16:38
1
-73.999481
40.738400
-73.985786
40.732815
N
255
0.013695
0.005585
0.014790
5
23
9
id0012891
2
2016-03-10 21:45:01
2016-03-10 22:05:26
1
-73.981049
40.744339
-73.973000
40.789989
N
1225
0.008049
0.045650
0.046355
3
21
10
id1436371
2
2016-05-10 22:08:41
2016-05-10 22:29:55
1
-73.982651
40.763840
-74.002228
40.732990
N
1274
0.019577
0.030849
0.036537
5
22
11
id1299289
2
2016-05-15 11:16:11
2016-05-15 11:34:59
4
-73.991531
40.749439
-73.956543
40.770630
N
1128
0.034988
0.021191
0.040905
5
11
12
id1187965
2
2016-02-19 09:52:46
2016-02-19 10:11:20
2
-73.962982
40.756680
-73.984406
40.760719
N
1114
0.021423
0.004040
0.021801
2
9
13
id0799785
2
2016-06-01 20:58:29
2016-06-01 21:02:49
1
-73.956306
40.767941
-73.966110
40.763000
N
260
0.009804
0.004940
0.010978
6
20
14
id2900608
2
2016-05-27 00:43:36
2016-05-27 01:07:10
1
-73.992195
40.727226
-73.974655
40.783070
N
1414
0.017540
0.055843
0.058533
5
0
15
id3319787
1
2016-05-16 15:29:02
2016-05-16 15:32:33
1
-73.955513
40.768593
-73.948761
40.771545
N
211
0.006752
0.002953
0.007369
5
15
16
id3379579
2
2016-04-11 17:29:50
2016-04-11 18:08:26
1
-73.991165
40.755562
-73.999290
40.725353
N
2316
0.008125
0.030209
0.031282
4
17
17
id1154431
1
2016-04-14 08:48:26
2016-04-14 09:00:37
1
-73.994255
40.745804
-73.999657
40.723343
N
731
0.005402
0.022461
0.023101
4
8
18
id3552682
1
2016-06-27 09:55:13
2016-06-27 10:17:10
1
-74.003983
40.713013
-73.979195
40.749924
N
1317
0.024788
0.036911
0.044462
6
9
19
id3390316
2
2016-06-05 13:47:23
2016-06-05 13:51:34
1
-73.983887
40.738197
-73.991203
40.727871
N
251
0.007317
0.010326
0.012656
6
13
20
id2070428
1
2016-02-28 02:23:02
2016-02-28 02:31:08
1
-73.980370
40.742420
-73.962852
40.760635
N
486
0.017517
0.018215
0.025271
2
2
21
id0809232
2
2016-04-01 12:12:25
2016-04-01 12:23:17
1
-73.979538
40.753361
-73.963997
40.763458
N
652
0.015541
0.010098
0.018533
4
12
22
id2352683
1
2016-04-09 03:34:27
2016-04-09 03:41:30
1
-73.995865
40.758812
-73.993324
40.740322
N
423
0.002541
0.018490
0.018664
4
3
23
id1603037
1
2016-06-25 10:36:26
2016-06-25 10:55:49
1
-73.993553
40.747173
-74.006142
40.704384
N
1163
0.012589
0.042789
0.044603
6
10
24
id3321406
2
2016-06-03 08:15:05
2016-06-03 08:56:30
1
-73.955231
40.777134
-73.788750
40.641472
N
2485
0.166481
0.135662
0.214756
6
8
25
id0129640
2
2016-02-14 13:27:56
2016-02-14 13:49:19
1
-73.956581
40.771358
-73.974968
40.732792
N
1283
0.018387
0.038567
0.042725
2
13
26
id3587298
1
2016-02-27 21:56:01
2016-02-27 22:14:51
1
-73.983765
40.749874
-73.958832
40.800961
N
1130
0.024933
0.051086
0.056846
2
21
27
id2104175
1
2016-06-20 23:07:16
2016-06-20 23:18:50
1
-73.958435
40.713192
-73.949539
40.680252
N
694
0.008896
0.032940
0.034120
6
23
28
id3973319
2
2016-06-13 21:57:27
2016-06-13 22:12:19
1
-73.994217
40.713306
-73.982849
40.692299
N
892
0.011368
0.021008
0.023886
6
21
29
id1410897
1
2016-03-23 14:10:39
2016-03-23 14:49:30
1
-73.982117
40.756351
-73.865692
40.770988
N
2331
0.116425
0.014637
0.117341
3
14
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1458614
id2061444
2
2016-02-08 17:16:07
2016-02-08 17:21:45
1
-73.980927
40.767651
-73.965302
40.765251
N
338
0.015625
0.002399
0.015808
2
17
1458615
id3182230
1
2016-02-05 17:57:08
2016-02-05 18:11:25
1
-73.991013
40.728321
-73.966766
40.711548
N
857
0.024246
0.016773
0.029483
2
17
1458616
id2822294
1
2016-04-22 17:21:14
2016-04-22 17:29:22
1
-73.988327
40.732147
-73.999641
40.734192
N
488
0.011314
0.002045
0.011498
4
17
1458617
id0820021
2
2016-04-15 08:31:20
2016-04-15 08:34:48
1
-73.975433
40.752411
-73.973122
40.746780
N
208
0.002312
0.005630
0.006087
4
8
1458618
id1046767
2
2016-04-17 01:46:48
2016-04-17 01:52:55
1
-73.987564
40.733387
-74.001129
40.731056
N
367
0.013565
0.002331
0.013764
4
1
1458619
id1083860
2
2016-04-23 12:14:15
2016-04-23 12:26:03
1
-73.954773
40.777882
-73.980904
40.782516
N
708
0.026131
0.004635
0.026539
4
12
1458620
id0694577
2
2016-04-28 20:51:03
2016-04-28 21:10:25
1
-73.966324
40.758072
-74.006516
40.736641
N
1162
0.040192
0.021431
0.045548
4
20
1458621
id3267199
2
2016-05-09 14:33:30
2016-05-09 15:12:45
1
-73.959534
40.782749
-73.990959
40.751091
N
2355
0.031425
0.031658
0.044607
5
14
1458622
id0125435
2
2016-02-19 18:26:52
2016-02-19 18:36:04
1
-74.008408
40.721142
-74.000557
40.723911
N
552
0.007851
0.002769
0.008325
2
18
1458623
id3369208
1
2016-01-18 20:35:30
2016-01-18 20:44:44
1
-73.991081
40.737408
-73.987671
40.722622
N
554
0.003410
0.014786
0.015174
1
20
1458624
id3482902
1
2016-03-01 07:21:04
2016-03-01 07:23:36
1
-73.974693
40.756088
-73.969971
40.762115
N
152
0.004723
0.006027
0.007657
3
7
1458625
id3730733
2
2016-01-25 17:21:15
2016-01-25 17:54:37
1
-73.989655
40.740612
-73.961029
40.765366
N
2002
0.028625
0.024754
0.037844
1
17
1458626
id0155863
2
2016-01-17 17:21:11
2016-01-17 17:25:15
2
-73.954071
40.767021
-73.950340
40.778233
N
244
0.003731
0.011211
0.011816
1
17
1458627
id0439281
2
2016-06-23 10:10:28
2016-06-23 10:25:08
5
-73.981651
40.767708
-73.959183
40.777412
N
880
0.022469
0.009705
0.024475
6
10
1458628
id0986544
2
2016-05-30 03:08:19
2016-05-30 03:14:10
2
-73.988632
40.721378
-73.975548
40.728519
N
351
0.013084
0.007141
0.014906
5
3
1458629
id3109086
2
2016-06-24 10:33:51
2016-06-24 10:43:52
1
-73.959618
40.808941
-73.947922
40.830189
N
601
0.011696
0.021248
0.024254
6
10
1458630
id0287353
2
2016-06-25 03:44:32
2016-06-25 03:53:41
5
-73.991508
40.727135
-73.988136
40.740932
N
549
0.003372
0.013798
0.014204
6
3
1458631
id1724231
1
2016-05-14 23:18:23
2016-05-14 23:24:05
3
-73.958946
40.763725
-73.953156
40.780003
N
342
0.005791
0.016277
0.017277
5
23
1458632
id0469946
2
2016-03-06 11:04:48
2016-03-06 11:17:45
2
-74.015572
40.710892
-73.996620
40.743633
N
777
0.018951
0.032742
0.037831
3
11
1458633
id2432342
1
2016-03-17 19:10:16
2016-03-17 19:26:35
3
-73.979652
40.735279
-73.995522
40.759754
N
979
0.015869
0.024475
0.029170
3
19
1458634
id3445276
1
2016-04-03 13:51:25
2016-04-03 14:07:37
2
-73.989075
40.730465
-73.963882
40.773739
N
972
0.025192
0.043274
0.050073
4
13
1458635
id3027038
2
2016-05-19 14:46:55
2016-05-19 14:50:52
1
-73.985390
40.763020
-73.989708
40.767502
N
237
0.004318
0.004482
0.006224
5
14
1458636
id0405770
2
2016-02-12 10:13:06
2016-02-12 10:26:26
1
-73.863815
40.769684
-73.864395
40.761326
N
800
0.000580
0.008358
0.008378
2
10
1458637
id1920898
1
2016-04-17 18:48:16
2016-04-17 19:00:56
1
-73.975357
40.751705
-73.949478
40.776764
N
760
0.025879
0.025059
0.036023
4
18
1458638
id1454193
2
2016-02-02 00:39:39
2016-02-02 00:46:33
5
-73.988823
40.736553
-73.989166
40.757393
N
414
0.000343
0.020840
0.020843
2
0
1458639
id2376096
2
2016-04-08 13:31:04
2016-04-08 13:44:02
4
-73.982201
40.745522
-73.994911
40.740170
N
778
0.012711
0.005352
0.013791
4
13
1458640
id1049543
1
2016-01-10 07:35:15
2016-01-10 07:46:10
1
-74.000946
40.747379
-73.970184
40.796547
N
655
0.030762
0.049168
0.057998
1
7
1458641
id2304944
2
2016-04-22 06:57:41
2016-04-22 07:10:25
1
-73.959129
40.768799
-74.004433
40.707371
N
764
0.045303
0.061428
0.076327
4
6
1458642
id2714485
1
2016-01-05 15:56:26
2016-01-05 16:02:39
1
-73.982079
40.749062
-73.974632
40.757107
N
373
0.007446
0.008045
0.010962
1
15
1458643
id1209952
1
2016-04-05 14:44:25
2016-04-05 14:47:43
1
-73.979538
40.781750
-73.972809
40.790585
N
198
0.006729
0.008835
0.011106
4
14
1458644 rows × 16 columns
In [13]:
from datetime import datetime
pickup_datetime_test = df_test_copy['pickup_datetime'].tolist()
month = []
hour = []
for i in range(len(df_test_copy)):
t = datetime.strptime(pickup_datetime_test[i],'%Y-%m-%d %H:%M:%S')
month.append(t.month)
hour.append(t.hour)
if i % 100000 == 0:
print('already: ', i)
df_test_copy['month'] = month
df_test_copy['hour'] = hour
already: 0
already: 10000
already: 20000
already: 30000
already: 40000
already: 50000
already: 60000
already: 70000
already: 80000
already: 90000
already: 100000
already: 110000
already: 120000
already: 130000
already: 140000
already: 150000
already: 160000
already: 170000
already: 180000
already: 190000
already: 200000
already: 210000
already: 220000
already: 230000
already: 240000
already: 250000
already: 260000
already: 270000
already: 280000
already: 290000
already: 300000
already: 310000
already: 320000
already: 330000
already: 340000
already: 350000
already: 360000
already: 370000
already: 380000
already: 390000
already: 400000
already: 410000
already: 420000
already: 430000
already: 440000
already: 450000
already: 460000
already: 470000
already: 480000
already: 490000
already: 500000
already: 510000
already: 520000
already: 530000
already: 540000
already: 550000
already: 560000
already: 570000
already: 580000
already: 590000
already: 600000
already: 610000
already: 620000
In [14]:
df_test_copy
Out[14]:
id
vendor_id
pickup_datetime
passenger_count
pickup_longitude
pickup_latitude
dropoff_longitude
dropoff_latitude
store_and_fwd_flag
delta_long
delta_la
dist
month
hour
0
id3004672
1
2016-06-30 23:59:58
1
-73.988129
40.732029
-73.990173
40.756680
N
0.002045
0.024651
0.024735
6
23
1
id3505355
1
2016-06-30 23:59:53
1
-73.964203
40.679993
-73.959808
40.655403
N
0.004395
0.024590
0.024979
6
23
2
id1217141
1
2016-06-30 23:59:47
1
-73.997437
40.737583
-73.986160
40.729523
N
0.011276
0.008060
0.013861
6
23
3
id2150126
2
2016-06-30 23:59:41
1
-73.956070
40.771900
-73.986427
40.730469
N
0.030357
0.041431
0.051363
6
23
4
id1598245
1
2016-06-30 23:59:33
1
-73.970215
40.761475
-73.961510
40.755890
N
0.008705
0.005585
0.010343
6
23
5
id0668992
1
2016-06-30 23:59:30
1
-73.991302
40.749798
-73.980515
40.786549
N
0.010788
0.036751
0.038301
6
23
6
id1765014
1
2016-06-30 23:59:15
1
-73.978310
40.741550
-73.952072
40.717003
N
0.026237
0.024548
0.035930
6
23
7
id0898117
1
2016-06-30 23:59:09
2
-74.012711
40.701527
-73.986481
40.719509
N
0.026230
0.017982
0.031802
6
23
8
id3905224
2
2016-06-30 23:58:55
2
-73.992332
40.730511
-73.875618
40.875214
N
0.116714
0.144703
0.185906
6
23
9
id1543102
2
2016-06-30 23:58:46
1
-73.993179
40.748760
-73.979309
40.761311
N
0.013870
0.012550
0.018705
6
23
10
id3024712
1
2016-06-30 23:58:32
4
-73.968529
40.678432
-73.966591
40.635712
N
0.001938
0.042721
0.042765
6
23
11
id3665810
2
2016-06-30 23:58:05
1
-73.982773
40.756908
-73.974693
40.753330
N
0.008080
0.003578
0.008836
6
23
12
id1836461
1
2016-06-30 23:58:01
1
-73.921104
40.767292
-73.936859
40.774044
N
0.015755
0.006752
0.017141
6
23
13
id3457080
2
2016-06-30 23:57:57
1
-73.986801
40.734917
-73.975899
40.756893
N
0.010902
0.021976
0.024532
6
23
14
id3376065
1
2016-06-30 23:57:25
1
-73.996346
40.748161
-73.950829
40.782825
N
0.045517
0.034664
0.057214
6
23
15
id3008739
1
2016-06-30 23:57:22
1
-73.968025
40.762283
-73.934792
40.797436
N
0.033234
0.035152
0.048375
6
23
16
id0902216
2
2016-06-30 23:56:44
1
-74.007713
40.740681
-73.968811
40.753860
N
0.038902
0.013180
0.041074
6
23
17
id3564824
2
2016-06-30 23:55:36
5
-73.984299
40.724983
-73.981819
40.740597
N
0.002480
0.015614
0.015809
6
23
18
id0820280
2
2016-06-30 23:55:28
1
-73.952599
40.768322
-73.948555
40.773724
N
0.004044
0.005402
0.006747
6
23
19
id0775088
2
2016-06-30 23:55:20
1
-73.966690
40.794090
-73.920776
40.830059
N
0.045914
0.035969
0.058325
6
23
20
id1468488
2
2016-06-30 23:55:13
1
-73.994690
40.725819
-73.987160
40.729259
N
0.007530
0.003441
0.008279
6
23
21
id2657479
1
2016-06-30 23:55:12
1
-73.965950
40.758068
-73.977524
40.742527
N
0.011574
0.015541
0.019377
6
23
22
id1262719
2
2016-06-30 23:55:04
5
-73.986382
40.762001
-73.966148
40.762089
N
0.020233
0.000088
0.020233
6
23
23
id1345524
2
2016-06-30 23:54:55
1
-73.955986
40.714069
-73.980682
40.675735
N
0.024696
0.038334
0.045600
6
23
24
id2911638
1
2016-06-30 23:54:45
1
-73.984100
40.742760
-73.956001
40.784809
N
0.028099
0.042049
0.050574
6
23
25
id2849512
2
2016-06-30 23:54:16
3
-73.872993
40.773979
-73.962440
40.774712
N
0.089447
0.000732
0.089450
6
23
26
id0236829
1
2016-06-30 23:53:06
1
-73.967621
40.762856
-73.952301
40.782181
N
0.015320
0.019325
0.024661
6
23
27
id2905906
1
2016-06-30 23:52:34
1
-73.985359
40.759548
-73.973267
40.763294
Y
0.012093
0.003746
0.012660
6
23
28
id3737939
1
2016-06-30 23:52:15
2
-73.987526
40.765511
-73.938713
40.849827
N
0.048813
0.084316
0.097427
6
23
29
id0766179
2
2016-06-30 23:51:42
2
-73.985344
40.747356
-73.978378
40.675571
N
0.006966
0.071785
0.072122
6
23
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
625104
id0120169
1
2016-01-01 00:13:21
1
-73.968399
40.799793
-73.974541
40.787251
N
0.006142
0.012543
0.013966
1
0
625105
id0386349
2
2016-01-01 00:11:38
5
-73.988403
40.737289
-73.992073
40.749142
N
0.003670
0.011852
0.012407
1
0
625106
id1962532
2
2016-01-01 00:11:33
1
-74.005394
40.739971
-73.989204
40.723003
N
0.016190
0.016968
0.023452
1
0
625107
id0335207
2
2016-01-01 00:11:10
2
-73.982239
40.771309
-73.963661
40.774353
N
0.018578
0.003044
0.018825
1
0
625108
id0273508
2
2016-01-01 00:10:40
3
-74.006516
40.744549
-73.938103
40.792294
N
0.068413
0.047745
0.083426
1
0
625109
id2936770
2
2016-01-01 00:10:15
1
-73.976822
40.751690
-73.955719
40.776470
N
0.021103
0.024780
0.032548
1
0
625110
id0044526
2
2016-01-01 00:10:09
1
-73.976547
40.750626
-73.975777
40.745041
N
0.000771
0.005585
0.005638
1
0
625111
id3605431
2
2016-01-01 00:10:02
1
-74.001259
40.747192
-73.978333
40.779781
N
0.022926
0.032589
0.039845
1
0
625112
id2681896
1
2016-01-01 00:09:44
1
-73.998657
40.739952
-73.983231
40.734612
N
0.015427
0.005341
0.016325
1
0
625113
id3308448
2
2016-01-01 00:09:40
5
-73.976784
40.774750
-73.973106
40.756870
N
0.003677
0.017879
0.018254
1
0
625114
id2108525
1
2016-01-01 00:09:17
1
-73.976669
40.765736
-73.983246
40.749500
N
0.006577
0.016235
0.017517
1
0
625115
id3952220
2
2016-01-01 00:08:38
1
-73.992180
40.759155
-73.958916
40.618134
N
0.033264
0.141022
0.144892
1
0
625116
id2771348
1
2016-01-01 00:08:36
1
-73.976189
40.765678
-73.982147
40.744007
N
0.005959
0.021671
0.022476
1
0
625117
id3065313
2
2016-01-01 00:08:30
1
-74.002258
40.745335
-73.995331
40.663185
N
0.006927
0.082150
0.082441
1
0
625118
id2332834
2
2016-01-01 00:07:20
2
-73.999954
40.728645
-73.936699
40.750092
N
0.063255
0.021446
0.066792
1
0
625119
id3495407
1
2016-01-01 00:07:13
2
-73.997025
40.720509
-73.998932
40.734013
N
0.001907
0.013504
0.013638
1
0
625120
id3811106
2
2016-01-01 00:06:57
6
-73.990585
40.740227
-74.000893
40.729008
N
0.010307
0.011219
0.015235
1
0
625121
id2693698
1
2016-01-01 00:06:00
1
-73.999481
40.748959
-74.008507
40.745422
N
0.009026
0.003536
0.009694
1
0
625122
id2884571
2
2016-01-01 00:05:39
2
-74.000259
40.730247
-73.999969
40.741158
N
0.000290
0.010910
0.010914
1
0
625123
id2790343
2
2016-01-01 00:05:14
1
-73.956131
40.778793
-74.001259
40.736439
N
0.045128
0.042355
0.061891
1
0
625124
id1901191
1
2016-01-01 00:05:12
1
-73.988899
40.718777
-73.972511
40.743450
N
0.016388
0.024673
0.029620
1
0
625125
id0664662
1
2016-01-01 00:05:02
1
-73.969040
40.790852
-73.967690
40.761066
N
0.001350
0.029785
0.029816
1
0
625126
id2073829
1
2016-01-01 00:05:01
1
-74.002586
40.733627
-73.998955
40.744518
N
0.003632
0.010891
0.011480
1
0
625127
id0328287
1
2016-01-01 00:03:38
2
-73.952110
40.777416
-73.958450
40.764320
N
0.006340
0.013096
0.014550
1
0
625128
id1340822
1
2016-01-01 00:03:00
1
-73.973167
40.764042
-73.974464
40.757187
N
0.001297
0.006855
0.006977
1
0
625129
id3008929
1
2016-01-01 00:02:52
1
-74.003464
40.725105
-74.001251
40.733643
N
0.002213
0.008537
0.008819
1
0
625130
id3700764
1
2016-01-01 00:01:52
1
-74.006363
40.743782
-73.953407
40.782467
N
0.052956
0.038685
0.065581
1
0
625131
id2568735
1
2016-01-01 00:01:24
2
-73.972267
40.759865
-73.876602
40.748665
N
0.095665
0.011200
0.096318
1
0
625132
id1384355
1
2016-01-01 00:00:28
1
-73.976501
40.733562
-73.854263
40.891788
N
0.122238
0.158226
0.199944
1
0
625133
id0621643
2
2016-01-01 00:00:22
2
-73.981850
40.716881
-73.969330
40.769379
N
0.012520
0.052498
0.053970
1
0
625134 rows × 14 columns
In [15]:
df_train_para = df_train_copy[['vendor_id', 'passenger_count','store_and_fwd_flag',
'trip_duration','dist','month','hour']]
df_test_para = df_test_copy[['vendor_id', 'passenger_count','store_and_fwd_flag',
'dist','month','hour']]
In [22]:
df_train_para['month'].value_counts()
Out[22]:
3 256189
4 251645
5 248487
2 238300
6 234316
1 229707
Name: month, dtype: int64
In [25]:
sns.boxplot(df_train_para['hour'], df_train_para['trip_duration'])
plt.show()
In [23]:
df_test_para['month'].value_counts()
Out[23]:
3 109697
5 107570
4 107432
2 102314
6 100445
1 97676
Name: month, dtype: int64
In [31]:
df_train_s = df_train_para[(df_train_para.trip_duration < 2500) &
(df_train_para.dist < 0.04) &
(df_train_para.dist > 0.01) ]
In [32]:
import matplotlib.pyplot as plt
plt.scatter(df_train_s.dist, df_train_s.trip_duration, alpha=0.01)
plt.show()
In [33]:
sns.boxplot(df_train_s['hour'], df_train_s['trip_duration'])
plt.show()
In [34]:
sns.boxplot(df_train_s['month'], df_train_s['trip_duration'])
plt.show()
In [53]:
sns.boxplot(df_train_s['passenger_count'], df_train_s['trip_duration'])
plt.show()
In [54]:
df_train_s['passenger_count'].value_counts()
Out[54]:
1 620561
2 124453
5 46437
3 35956
6 29012
4 16811
0 5
Name: passenger_count, dtype: int64
In [35]:
df_train_s['trip_duration'].describe()
Out[35]:
count 873235.000000
mean 675.163420
std 344.399329
min 4.000000
25% 425.000000
50% 606.000000
75% 850.000000
max 2499.000000
Name: trip_duration, dtype: float64
In [36]:
df_train_s['dist'].describe()
Out[36]:
count 873235.000000
mean 0.021175
std 0.007947
min 0.010000
25% 0.014457
50% 0.019651
75% 0.026826
max 0.040000
Name: dist, dtype: float64
In [38]:
df_test_para
Out[38]:
vendor_id
passenger_count
store_and_fwd_flag
dist
month
hour
0
1
1
N
0.024735
6
23
1
1
1
N
0.024979
6
23
2
1
1
N
0.013861
6
23
3
2
1
N
0.051363
6
23
4
1
1
N
0.010343
6
23
5
1
1
N
0.038301
6
23
6
1
1
N
0.035930
6
23
7
1
2
N
0.031802
6
23
8
2
2
N
0.185906
6
23
9
2
1
N
0.018705
6
23
10
1
4
N
0.042765
6
23
11
2
1
N
0.008836
6
23
12
1
1
N
0.017141
6
23
13
2
1
N
0.024532
6
23
14
1
1
N
0.057214
6
23
15
1
1
N
0.048375
6
23
16
2
1
N
0.041074
6
23
17
2
5
N
0.015809
6
23
18
2
1
N
0.006747
6
23
19
2
1
N
0.058325
6
23
20
2
1
N
0.008279
6
23
21
1
1
N
0.019377
6
23
22
2
5
N
0.020233
6
23
23
2
1
N
0.045600
6
23
24
1
1
N
0.050574
6
23
25
2
3
N
0.089450
6
23
26
1
1
N
0.024661
6
23
27
1
1
Y
0.012660
6
23
28
1
2
N
0.097427
6
23
29
2
2
N
0.072122
6
23
...
...
...
...
...
...
...
625104
1
1
N
0.013966
1
0
625105
2
5
N
0.012407
1
0
625106
2
1
N
0.023452
1
0
625107
2
2
N
0.018825
1
0
625108
2
3
N
0.083426
1
0
625109
2
1
N
0.032548
1
0
625110
2
1
N
0.005638
1
0
625111
2
1
N
0.039845
1
0
625112
1
1
N
0.016325
1
0
625113
2
5
N
0.018254
1
0
625114
1
1
N
0.017517
1
0
625115
2
1
N
0.144892
1
0
625116
1
1
N
0.022476
1
0
625117
2
1
N
0.082441
1
0
625118
2
2
N
0.066792
1
0
625119
1
2
N
0.013638
1
0
625120
2
6
N
0.015235
1
0
625121
1
1
N
0.009694
1
0
625122
2
2
N
0.010914
1
0
625123
2
1
N
0.061891
1
0
625124
1
1
N
0.029620
1
0
625125
1
1
N
0.029816
1
0
625126
1
1
N
0.011480
1
0
625127
1
2
N
0.014550
1
0
625128
1
1
N
0.006977
1
0
625129
1
1
N
0.008819
1
0
625130
1
1
N
0.065581
1
0
625131
1
2
N
0.096318
1
0
625132
1
1
N
0.199944
1
0
625133
2
2
N
0.053970
1
0
625134 rows × 6 columns
In [40]:
df_test_para['vendor_id'].value_counts()
Out[40]:
2 334374
1 290760
Name: vendor_id, dtype: int64
In [41]:
df_test_para['store_and_fwd_flag'].value_counts()
Out[41]:
N 621704
Y 3430
Name: store_and_fwd_flag, dtype: int64
In [42]:
df_test_para['dist'].describe()
Out[42]:
count 625134.000000
mean 0.035400
std 0.045585
min 0.000000
25% 0.012590
50% 0.021216
75% 0.038454
max 10.385000
Name: dist, dtype: float64
In [43]:
df_ans.head()
Out[43]:
id
trip_duration
0
id3004672
959
1
id3505355
959
2
id1217141
959
3
id2150126
959
4
id1598245
959
In [245]:
df_ans[df_ans.id == 0]
Out[245]:
id
trip_duration
2892
0
1.0
2991
0
1.0
9325
0
1.0
9414
0
1.0
9419
0
1.0
9440
0
1.0
9441
0
1.0
12383
0
1.0
12473
0
1.0
14891
0
1.0
14971
0
1.0
15044
0
1.0
18436
0
1.0
18524
0
1.0
18528
0
1.0
25860
0
1.0
25921
0
1.0
25959
0
1.0
25963
0
1.0
29396
0
1.0
32738
0
1.0
32751
0
1.0
32795
0
1.0
35791
0
1.0
35800
0
1.0
35807
0
1.0
35818
0
1.0
41592
0
1.0
41616
0
1.0
41643
0
1.0
...
...
...
596217
0
1.0
596343
0
1.0
596459
0
1.0
599914
0
1.0
599927
0
1.0
599994
0
1.0
600087
0
1.0
600186
0
1.0
603741
0
1.0
603805
0
1.0
603845
0
1.0
603868
0
1.0
607095
0
1.0
607141
0
1.0
610400
0
1.0
610447
0
1.0
613501
0
1.0
613512
0
1.0
616376
0
1.0
618614
0
1.0
618755
0
1.0
618819
0
1.0
621561
0
1.0
621626
0
1.0
621733
0
1.0
624091
0
1.0
624096
0
1.0
624116
0
1.0
624261
0
1.0
624343
0
1.0
459 rows × 2 columns
In [55]:
df_train_feature = df_train_s[['vendor_id','store_and_fwd_flag','dist','month','hour']]
df_train_label = df_train_s['trip_duration']
df_test_feature = df_test_para[['vendor_id','store_and_fwd_flag','dist','month','hour']]
In [56]:
df_train_feature = pd.get_dummies(data=df_train_feature,
columns=['vendor_id','store_and_fwd_flag',
'month','hour'])
df_test_feature = pd.get_dummies(data=df_test_feature,
columns=['vendor_id','store_and_fwd_flag',
'month','hour'])
In [57]:
df_train_feature.head()
Out[57]:
dist
vendor_id_1
vendor_id_2
store_and_fwd_flag_N
store_and_fwd_flag_Y
month_1
month_2
month_3
month_4
month_5
...
hour_14
hour_15
hour_16
hour_17
hour_18
hour_19
hour_20
hour_21
hour_22
hour_23
0
0.017680
0
1
1
0
0
0
1
0
0
...
0
0
0
1
0
0
0
0
0
0
1
0.020456
1
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
3
0.013438
0
1
1
0
0
0
0
1
0
...
0
0
0
0
0
1
0
0
0
0
4
0.010690
0
1
1
0
0
0
1
0
0
...
0
0
0
0
0
0
0
0
0
0
5
0.011572
0
1
1
0
1
0
0
0
0
...
0
0
0
0
0
0
0
0
1
0
5 rows × 35 columns
In [58]:
df_train_feature.columns
Out[58]:
Index(['dist', 'vendor_id_1', 'vendor_id_2', 'store_and_fwd_flag_N',
'store_and_fwd_flag_Y', 'month_1', 'month_2', 'month_3', 'month_4',
'month_5', 'month_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],
dtype='object')
In [62]:
df_train_feature['dist'].describe()
Out[62]:
count 873235.000000
mean 0.021175
std 0.007947
min 0.010000
25% 0.014457
50% 0.019651
75% 0.026826
max 0.040000
Name: dist, dtype: float64
In [59]:
df_test_feature.head()
Out[59]:
dist
vendor_id_1
vendor_id_2
store_and_fwd_flag_N
store_and_fwd_flag_Y
month_1
month_2
month_3
month_4
month_5
...
hour_14
hour_15
hour_16
hour_17
hour_18
hour_19
hour_20
hour_21
hour_22
hour_23
0
0.024735
1
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
1
1
0.024979
1
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
1
2
0.013861
1
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
1
3
0.051363
0
1
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
1
4
0.010343
1
0
1
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
1
5 rows × 35 columns
In [60]:
df_test_feature.columns
Out[60]:
Index(['dist', 'vendor_id_1', 'vendor_id_2', 'store_and_fwd_flag_N',
'store_and_fwd_flag_Y', 'month_1', 'month_2', 'month_3', 'month_4',
'month_5', 'month_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],
dtype='object')
In [61]:
df_test_feature['dist'].describe()
Out[61]:
count 625134.000000
mean 0.035400
std 0.045585
min 0.000000
25% 0.012590
50% 0.021216
75% 0.038454
max 10.385000
Name: dist, dtype: float64
In [141]:
train_feature = df_train_feature.values
test_feature = df_test_feature.values
train_label = df_train_label.values
In [130]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
train_feature_trans = min_max_scaler.fit_transform(train_feature.reshape(-1,35))
test_feature_trans = min_max_scaler.fit_transform(test_feature.reshape(-1,35))
In [123]:
train_feature_trans[0]
Out[123]:
array([ 0.25598384, 0. , 1. , 1. , 0. ,
0. , 0. , 1. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 1. , 0. ,
0. , 0. , 0. , 0. , 0. ])
In [118]:
train_feature.shape
Out[118]:
(873235, 35)
In [132]:
test_feature_trans[0]
Out[132]:
array([ 0.00238182, 1. , 0. , 1. , 0. ,
0. , 0. , 0. , 0. , 0. ,
1. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 1. ])
In [67]:
train_label = train_label.reshape(-1,1)
In [68]:
train_label
Out[68]:
array([[455],
[663],
[429],
...,
[778],
[373],
[198]])
In [190]:
import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
plt.plot(train_history.history[train])
plt.plot(train_history.history[validation])
plt.title('Train History')
plt.ylabel(train)
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
######################### 建立模型
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import load_model
model = Sequential()
model.add(Dense(units=100,
input_dim=35,
kernel_initializer='uniform'))
model.add(Dropout(0.5))
model.add(Dense(units=100,
kernel_initializer='uniform'))
model.add(Dropout(0.5))
model.add(Dense(units=50,
kernel_initializer='uniform'))
model.add(Dropout(0.5))
model.add(Dense(units=1))
print(model.summary())
#可以反覆訓練fine tune
weights_path = 'Savemodels/TaxiDuration(Kaggles)_MLP.h5'
model.load_weights(weights_path)
model.compile(loss='mean_squared_logarithmic_error',
optimizer='adam', metrics=['accuracy'])
train_history = model.fit(train_feature, train_label,
validation_split=0.2, epochs=20, batch_size=50000, verbose=2)
######################### 訓練過程視覺化
show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')
#儲存訓練結果
model.save_weights("Savemodels/TaxiDuration(Kaggles)_MLP.h5")
print('model saved to disk')
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_73 (Dense) (None, 100) 3600
_________________________________________________________________
dropout_54 (Dropout) (None, 100) 0
_________________________________________________________________
dense_74 (Dense) (None, 100) 10100
_________________________________________________________________
dropout_55 (Dropout) (None, 100) 0
_________________________________________________________________
dense_75 (Dense) (None, 50) 5050
_________________________________________________________________
dropout_56 (Dropout) (None, 50) 0
_________________________________________________________________
dense_76 (Dense) (None, 1) 51
=================================================================
Total params: 18,801
Trainable params: 18,801
Non-trainable params: 0
_________________________________________________________________
None
Train on 698588 samples, validate on 174647 samples
Epoch 1/20
11s - loss: 0.1971 - acc: 0.0016 - val_loss: 0.1609 - val_acc: 0.0016
Epoch 2/20
11s - loss: 0.1878 - acc: 0.0017 - val_loss: 0.1546 - val_acc: 0.0019
Epoch 3/20
12s - loss: 0.1865 - acc: 0.0017 - val_loss: 0.1549 - val_acc: 0.0018
Epoch 4/20
13s - loss: 0.1860 - acc: 0.0017 - val_loss: 0.1536 - val_acc: 0.0016
Epoch 5/20
13s - loss: 0.1859 - acc: 0.0017 - val_loss: 0.1537 - val_acc: 0.0018
Epoch 6/20
13s - loss: 0.1857 - acc: 0.0016 - val_loss: 0.1541 - val_acc: 0.0019
Epoch 7/20
13s - loss: 0.1858 - acc: 0.0017 - val_loss: 0.1541 - val_acc: 0.0019
Epoch 8/20
13s - loss: 0.1854 - acc: 0.0017 - val_loss: 0.1541 - val_acc: 0.0019
Epoch 9/20
14s - loss: 0.1857 - acc: 0.0016 - val_loss: 0.1539 - val_acc: 0.0021
Epoch 10/20
13s - loss: 0.1859 - acc: 0.0016 - val_loss: 0.1538 - val_acc: 0.0017
Epoch 11/20
13s - loss: 0.1855 - acc: 0.0016 - val_loss: 0.1538 - val_acc: 0.0018
Epoch 12/20
14s - loss: 0.1856 - acc: 0.0017 - val_loss: 0.1538 - val_acc: 0.0018
Epoch 13/20
13s - loss: 0.1859 - acc: 0.0018 - val_loss: 0.1539 - val_acc: 0.0020
Epoch 14/20
13s - loss: 0.1859 - acc: 0.0017 - val_loss: 0.1539 - val_acc: 0.0019
Epoch 15/20
14s - loss: 0.1855 - acc: 0.0017 - val_loss: 0.1539 - val_acc: 0.0019
Epoch 16/20
14s - loss: 0.1855 - acc: 0.0017 - val_loss: 0.1537 - val_acc: 0.0018
Epoch 17/20
14s - loss: 0.1857 - acc: 0.0016 - val_loss: 0.1543 - val_acc: 0.0019
Epoch 18/20
17s - loss: 0.1854 - acc: 0.0018 - val_loss: 0.1538 - val_acc: 0.0018
Epoch 19/20
14s - loss: 0.1856 - acc: 0.0016 - val_loss: 0.1538 - val_acc: 0.0017
Epoch 20/20
14s - loss: 0.1855 - acc: 0.0018 - val_loss: 0.1538 - val_acc: 0.0019
model saved to disk
In [233]:
prediction = model.predict(test_feature)
In [234]:
import matplotlib.pyplot as plt
plt.hist(prediction, range=(0,1500), bins = 20)
plt.show()
In [235]:
prediction[0]
Out[235]:
array([ 658.69891357], dtype=float32)
In [236]:
prediction<0
Out[236]:
array([[False],
[False],
[False],
...,
[False],
[False],
[False]], dtype=bool)
In [237]:
df_ans['trip_duration'] = prediction
In [238]:
df_ans
Out[238]:
id
trip_duration
0
id3004672
658.698914
1
id3505355
664.361084
2
id1217141
406.257385
3
id2150126
1276.813354
4
id1598245
324.580902
5
id0668992
973.631470
6
id1765014
918.587402
7
id0898117
822.753113
8
id3905224
4400.172363
9
id1543102
518.692566
10
id3024712
1077.243774
11
id3665810
289.587524
12
id1836461
482.393799
13
id3457080
653.956177
14
id3376065
1412.667114
15
id3008739
1207.490112
16
id0902216
1037.971436
17
id3564824
451.457336
18
id0820280
241.093140
19
id0775088
1438.441162
20
id1468488
276.649902
21
id2657479
534.316345
22
id1262719
554.161133
23
id1345524
1143.043579
24
id2911638
1258.527466
25
id2849512
2160.988525
26
id0236829
656.974670
27
id2905906
390.893555
28
id3737939
2346.187988
29
id0766179
1758.731079
...
...
...
625104
id0120169
355.541199
625105
id0386349
319.337341
625106
id1962532
575.737732
625107
id0335207
468.326477
625108
id0273508
1967.992188
625109
id2936770
786.898804
625110
id0044526
162.181168
625111
id3605431
956.296326
625112
id2681896
410.309784
625113
id3308448
455.057373
625114
id2108525
437.978241
625115
id3952220
3394.890869
625116
id2771348
553.093018
625117
id3065313
1945.131592
625118
id2332834
1581.847168
625119
id3495407
347.935974
625120
id3811106
384.980469
625121
id2693698
256.367401
625122
id2884571
284.666626
625123
id2790343
1468.061279
625124
id1901191
718.947998
625125
id0664662
723.492249
625126
id2073829
297.848938
625127
id0328287
369.102020
625128
id1340822
193.294464
625129
id3008929
236.071793
625130
id3700764
1553.753784
625131
id2568735
2267.314209
625132
id1384355
4672.929199
625133
id0621643
1284.193237
625134 rows × 2 columns
In [239]:
df_ans['trip_duration'].describe()
Out[239]:
count 625134.000000
mean 961.263977
std 1055.356079
min -75.861900
25% 443.087776
50% 641.375122
75% 1030.174469
max 241286.125000
Name: trip_duration, dtype: float64
In [240]:
df_ans[df_ans['trip_duration']>900]
Out[240]:
id
trip_duration
3
id2150126
1276.813354
5
id0668992
973.631470
6
id1765014
918.587402
8
id3905224
4400.172363
10
id3024712
1077.243774
14
id3376065
1412.667114
15
id3008739
1207.490112
16
id0902216
1037.971436
19
id0775088
1438.441162
23
id1345524
1143.043579
24
id2911638
1258.527466
25
id2849512
2160.988525
28
id3737939
2346.187988
29
id0766179
1758.731079
31
id3864673
1758.454224
34
id1686925
1563.794434
36
id1860783
1036.201660
39
id2844603
1836.709595
41
id1408427
2745.732422
42
id1712395
1009.190430
53
id0364901
1067.034424
55
id0814840
4428.150391
56
id3159174
3708.820801
57
id3650870
2752.457520
63
id0527242
944.176575
64
id1890566
1265.187622
65
id0882843
5028.599609
66
id2126329
3596.704834
75
id1707352
1759.266968
76
id1650458
940.201111
...
...
...
625037
id0628544
1175.407837
625039
id2955883
1221.528198
625044
id1186878
1486.027954
625047
id1674078
2840.673340
625049
id3578600
1203.533813
625050
id3125631
1460.024414
625054
id0332493
2740.439697
625056
id2411893
985.152283
625062
id2980593
915.679504
625064
id0885135
1126.859253
625066
id3219550
1307.898926
625070
id2346166
2727.847656
625071
id3076640
2169.414062
625076
id0689827
3979.948242
625080
id1027135
1369.531738
625081
id0256919
1077.124146
625082
id3919531
1476.884155
625090
id2282537
1087.634033
625095
id0876134
1374.796387
625099
id1670163
1120.195190
625108
id0273508
1967.992188
625111
id3605431
956.296326
625115
id3952220
3394.890869
625117
id3065313
1945.131592
625118
id2332834
1581.847168
625123
id2790343
1468.061279
625130
id3700764
1553.753784
625131
id2568735
2267.314209
625132
id1384355
4672.929199
625133
id0621643
1284.193237
191269 rows × 2 columns
In [242]:
df_ans[df_ans.trip_duration<0]
Out[242]:
id
trip_duration
2892
0
-12.690598
2991
0
-31.010937
9325
0
-10.982977
9414
0
-41.040585
9419
0
-39.292061
9440
0
-3.148615
9441
0
-30.874996
12383
0
-12.909513
12473
0
-41.011528
14891
0
-13.086611
14971
0
-31.073009
15044
0
-31.073009
18436
0
-12.888595
18524
0
-31.102066
18528
0
-30.512924
25860
0
-9.407928
25921
0
-12.191135
25959
0
-36.496025
25963
0
-41.011528
29396
0
-30.006660
32738
0
-13.115671
32751
0
-12.196654
32795
0
-1.484802
35791
0
-13.086611
35800
0
-21.641125
35807
0
-41.011528
35818
0
-1.253344
41592
0
-36.011440
41616
0
-30.676970
41643
0
-29.884901
...
...
...
596217
0
-52.376923
596343
0
-18.180151
596459
0
-9.101417
599914
0
-46.859680
599927
0
-30.613188
599994
0
-65.952454
600087
0
-17.885429
600186
0
-7.884256
603741
0
-47.223053
603805
0
-74.487122
603845
0
-63.456509
603868
0
-7.730523
607095
0
-15.979280
607141
0
-75.663872
610400
0
-29.581120
610447
0
-65.923409
613501
0
-11.531940
613512
0
-28.689337
616376
0
-41.309258
618614
0
-65.952454
618755
0
-9.101417
618819
0
-9.072357
621561
0
-47.327480
621626
0
-65.587334
621733
0
-9.072357
624091
0
-27.420218
624096
0
-58.307053
624116
0
-16.993511
624261
0
-7.612906
624343
0
-7.217021
459 rows × 2 columns
In [243]:
ans = df_ans['trip_duration'].tolist()
for i in range(len(ans)):
if ans[i] < 0:
ans[i] = 1
df_ans['trip_duration'] = ans
In [244]:
df_ans[df_ans['id']==0]
Out[244]:
id
trip_duration
2892
0
1.0
2991
0
1.0
9325
0
1.0
9414
0
1.0
9419
0
1.0
9440
0
1.0
9441
0
1.0
12383
0
1.0
12473
0
1.0
14891
0
1.0
14971
0
1.0
15044
0
1.0
18436
0
1.0
18524
0
1.0
18528
0
1.0
25860
0
1.0
25921
0
1.0
25959
0
1.0
25963
0
1.0
29396
0
1.0
32738
0
1.0
32751
0
1.0
32795
0
1.0
35791
0
1.0
35800
0
1.0
35807
0
1.0
35818
0
1.0
41592
0
1.0
41616
0
1.0
41643
0
1.0
...
...
...
596217
0
1.0
596343
0
1.0
596459
0
1.0
599914
0
1.0
599927
0
1.0
599994
0
1.0
600087
0
1.0
600186
0
1.0
603741
0
1.0
603805
0
1.0
603845
0
1.0
603868
0
1.0
607095
0
1.0
607141
0
1.0
610400
0
1.0
610447
0
1.0
613501
0
1.0
613512
0
1.0
616376
0
1.0
618614
0
1.0
618755
0
1.0
618819
0
1.0
621561
0
1.0
621626
0
1.0
621733
0
1.0
624091
0
1.0
624096
0
1.0
624116
0
1.0
624261
0
1.0
624343
0
1.0
459 rows × 2 columns
In [246]:
df_ans.to_csv('TaxiDuration_ans.csv',mode = 'w', index=False)
In [ ]:
Content source: Pytoddler/Kaggle-competition
Similar notebooks:
notebook.community | gallery | about