資料預處理:能使用list(文字)和array(計算)就不要用pandas,太慢囉

觀察


In [1]:
import numpy as np
import pandas as pd
import os

filepath = '/Users/mac/Desktop/Kaggle_datasets/Taxi_Duration/'
filename01 = 'train.csv'
filename02 = 'test.csv'
filename03 = 'sample_submission.csv'

df_train = pd.read_csv(os.path.join(filepath, filename01))
df_test = pd.read_csv(os.path.join(filepath, filename02))
df_ans = pd.read_csv(os.path.join(filepath, filename03))

In [2]:
df_train_copy = df_train
df_test_copy = df_test

In [3]:
df_train_copy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null object
dropoff_datetime      1458644 non-null object
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB

In [4]:
df_train_copy


Out[4]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N 663
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N 2124
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 N 429
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 N 435
5 id0801584 2 2016-01-30 22:01:40 2016-01-30 22:09:03 6 -73.982857 40.742195 -73.992081 40.749184 N 443
6 id1813257 1 2016-06-17 22:34:59 2016-06-17 22:40:40 4 -73.969017 40.757839 -73.957405 40.765896 N 341
7 id1324603 2 2016-05-21 07:54:58 2016-05-21 08:20:49 1 -73.969276 40.797779 -73.922470 40.760559 N 1551
8 id1301050 1 2016-05-27 23:12:23 2016-05-27 23:16:38 1 -73.999481 40.738400 -73.985786 40.732815 N 255
9 id0012891 2 2016-03-10 21:45:01 2016-03-10 22:05:26 1 -73.981049 40.744339 -73.973000 40.789989 N 1225
10 id1436371 2 2016-05-10 22:08:41 2016-05-10 22:29:55 1 -73.982651 40.763840 -74.002228 40.732990 N 1274
11 id1299289 2 2016-05-15 11:16:11 2016-05-15 11:34:59 4 -73.991531 40.749439 -73.956543 40.770630 N 1128
12 id1187965 2 2016-02-19 09:52:46 2016-02-19 10:11:20 2 -73.962982 40.756680 -73.984406 40.760719 N 1114
13 id0799785 2 2016-06-01 20:58:29 2016-06-01 21:02:49 1 -73.956306 40.767941 -73.966110 40.763000 N 260
14 id2900608 2 2016-05-27 00:43:36 2016-05-27 01:07:10 1 -73.992195 40.727226 -73.974655 40.783070 N 1414
15 id3319787 1 2016-05-16 15:29:02 2016-05-16 15:32:33 1 -73.955513 40.768593 -73.948761 40.771545 N 211
16 id3379579 2 2016-04-11 17:29:50 2016-04-11 18:08:26 1 -73.991165 40.755562 -73.999290 40.725353 N 2316
17 id1154431 1 2016-04-14 08:48:26 2016-04-14 09:00:37 1 -73.994255 40.745804 -73.999657 40.723343 N 731
18 id3552682 1 2016-06-27 09:55:13 2016-06-27 10:17:10 1 -74.003983 40.713013 -73.979195 40.749924 N 1317
19 id3390316 2 2016-06-05 13:47:23 2016-06-05 13:51:34 1 -73.983887 40.738197 -73.991203 40.727871 N 251
20 id2070428 1 2016-02-28 02:23:02 2016-02-28 02:31:08 1 -73.980370 40.742420 -73.962852 40.760635 N 486
21 id0809232 2 2016-04-01 12:12:25 2016-04-01 12:23:17 1 -73.979538 40.753361 -73.963997 40.763458 N 652
22 id2352683 1 2016-04-09 03:34:27 2016-04-09 03:41:30 1 -73.995865 40.758812 -73.993324 40.740322 N 423
23 id1603037 1 2016-06-25 10:36:26 2016-06-25 10:55:49 1 -73.993553 40.747173 -74.006142 40.704384 N 1163
24 id3321406 2 2016-06-03 08:15:05 2016-06-03 08:56:30 1 -73.955231 40.777134 -73.788750 40.641472 N 2485
25 id0129640 2 2016-02-14 13:27:56 2016-02-14 13:49:19 1 -73.956581 40.771358 -73.974968 40.732792 N 1283
26 id3587298 1 2016-02-27 21:56:01 2016-02-27 22:14:51 1 -73.983765 40.749874 -73.958832 40.800961 N 1130
27 id2104175 1 2016-06-20 23:07:16 2016-06-20 23:18:50 1 -73.958435 40.713192 -73.949539 40.680252 N 694
28 id3973319 2 2016-06-13 21:57:27 2016-06-13 22:12:19 1 -73.994217 40.713306 -73.982849 40.692299 N 892
29 id1410897 1 2016-03-23 14:10:39 2016-03-23 14:49:30 1 -73.982117 40.756351 -73.865692 40.770988 N 2331
... ... ... ... ... ... ... ... ... ... ... ...
1458614 id2061444 2 2016-02-08 17:16:07 2016-02-08 17:21:45 1 -73.980927 40.767651 -73.965302 40.765251 N 338
1458615 id3182230 1 2016-02-05 17:57:08 2016-02-05 18:11:25 1 -73.991013 40.728321 -73.966766 40.711548 N 857
1458616 id2822294 1 2016-04-22 17:21:14 2016-04-22 17:29:22 1 -73.988327 40.732147 -73.999641 40.734192 N 488
1458617 id0820021 2 2016-04-15 08:31:20 2016-04-15 08:34:48 1 -73.975433 40.752411 -73.973122 40.746780 N 208
1458618 id1046767 2 2016-04-17 01:46:48 2016-04-17 01:52:55 1 -73.987564 40.733387 -74.001129 40.731056 N 367
1458619 id1083860 2 2016-04-23 12:14:15 2016-04-23 12:26:03 1 -73.954773 40.777882 -73.980904 40.782516 N 708
1458620 id0694577 2 2016-04-28 20:51:03 2016-04-28 21:10:25 1 -73.966324 40.758072 -74.006516 40.736641 N 1162
1458621 id3267199 2 2016-05-09 14:33:30 2016-05-09 15:12:45 1 -73.959534 40.782749 -73.990959 40.751091 N 2355
1458622 id0125435 2 2016-02-19 18:26:52 2016-02-19 18:36:04 1 -74.008408 40.721142 -74.000557 40.723911 N 552
1458623 id3369208 1 2016-01-18 20:35:30 2016-01-18 20:44:44 1 -73.991081 40.737408 -73.987671 40.722622 N 554
1458624 id3482902 1 2016-03-01 07:21:04 2016-03-01 07:23:36 1 -73.974693 40.756088 -73.969971 40.762115 N 152
1458625 id3730733 2 2016-01-25 17:21:15 2016-01-25 17:54:37 1 -73.989655 40.740612 -73.961029 40.765366 N 2002
1458626 id0155863 2 2016-01-17 17:21:11 2016-01-17 17:25:15 2 -73.954071 40.767021 -73.950340 40.778233 N 244
1458627 id0439281 2 2016-06-23 10:10:28 2016-06-23 10:25:08 5 -73.981651 40.767708 -73.959183 40.777412 N 880
1458628 id0986544 2 2016-05-30 03:08:19 2016-05-30 03:14:10 2 -73.988632 40.721378 -73.975548 40.728519 N 351
1458629 id3109086 2 2016-06-24 10:33:51 2016-06-24 10:43:52 1 -73.959618 40.808941 -73.947922 40.830189 N 601
1458630 id0287353 2 2016-06-25 03:44:32 2016-06-25 03:53:41 5 -73.991508 40.727135 -73.988136 40.740932 N 549
1458631 id1724231 1 2016-05-14 23:18:23 2016-05-14 23:24:05 3 -73.958946 40.763725 -73.953156 40.780003 N 342
1458632 id0469946 2 2016-03-06 11:04:48 2016-03-06 11:17:45 2 -74.015572 40.710892 -73.996620 40.743633 N 777
1458633 id2432342 1 2016-03-17 19:10:16 2016-03-17 19:26:35 3 -73.979652 40.735279 -73.995522 40.759754 N 979
1458634 id3445276 1 2016-04-03 13:51:25 2016-04-03 14:07:37 2 -73.989075 40.730465 -73.963882 40.773739 N 972
1458635 id3027038 2 2016-05-19 14:46:55 2016-05-19 14:50:52 1 -73.985390 40.763020 -73.989708 40.767502 N 237
1458636 id0405770 2 2016-02-12 10:13:06 2016-02-12 10:26:26 1 -73.863815 40.769684 -73.864395 40.761326 N 800
1458637 id1920898 1 2016-04-17 18:48:16 2016-04-17 19:00:56 1 -73.975357 40.751705 -73.949478 40.776764 N 760
1458638 id1454193 2 2016-02-02 00:39:39 2016-02-02 00:46:33 5 -73.988823 40.736553 -73.989166 40.757393 N 414
1458639 id2376096 2 2016-04-08 13:31:04 2016-04-08 13:44:02 4 -73.982201 40.745522 -73.994911 40.740170 N 778
1458640 id1049543 1 2016-01-10 07:35:15 2016-01-10 07:46:10 1 -74.000946 40.747379 -73.970184 40.796547 N 655
1458641 id2304944 2 2016-04-22 06:57:41 2016-04-22 07:10:25 1 -73.959129 40.768799 -74.004433 40.707371 N 764
1458642 id2714485 1 2016-01-05 15:56:26 2016-01-05 16:02:39 1 -73.982079 40.749062 -73.974632 40.757107 N 373
1458643 id1209952 1 2016-04-05 14:44:25 2016-04-05 14:47:43 1 -73.979538 40.781750 -73.972809 40.790585 N 198

1458644 rows × 11 columns


In [5]:
df_train_copy['delta_long'] = abs(df_train_copy['pickup_longitude']-df_train_copy['dropoff_longitude']) 
df_train_copy['delta_la'] = abs(df_train_copy['pickup_latitude']-df_train_copy['dropoff_latitude'])
df_train_copy['dist'] = np.sqrt(df_train_copy['delta_long']**2 + df_train_copy['delta_la']**2)

df_test_copy['delta_long'] = abs(df_test_copy['pickup_longitude']-df_test_copy['dropoff_longitude']) 
df_test_copy['delta_la'] = abs(df_test_copy['pickup_latitude']-df_test_copy['dropoff_latitude'])
df_test_copy['dist'] = np.sqrt(df_test_copy['delta_long']**2 + df_test_copy['delta_la']**2)

In [6]:
import matplotlib.pyplot as plt
plt.scatter(df_train_copy.dist, df_train_copy.trip_duration, alpha=0.1)
plt.show()



In [7]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

sns.jointplot(df_train_copy.dist, df_train_copy.trip_duration , data = df_train_copy)
plt.show()



In [17]:
df_train_copy[df_train_copy.dist>2]


Out[17]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration delta_long delta_la dist
184925 id2306955 1 2016-05-07 18:58:53 2016-05-07 19:12:05 1 -72.809669 51.881084 -73.987228 40.750599 N 792 1.177559 11.130486 11.192603
275644 id0978162 1 2016-02-24 16:20:59 2016-02-24 16:35:34 4 -75.354332 34.712234 -73.834923 32.181141 N 875 1.519409 2.531094 2.952124
377068 id0116374 1 2016-04-02 20:33:19 2016-04-02 20:38:01 1 -74.007095 40.717113 -76.963242 38.946033 N 282 2.956146 1.771080 3.446088
397526 id0982904 1 2016-04-28 13:32:14 2016-04-28 14:14:09 2 -73.870682 40.773598 -79.817978 38.963852 N 2515 5.947296 1.809746 6.216551
644165 id0401529 2 2016-06-02 15:19:35 2016-06-02 15:32:59 1 -73.980751 40.757111 -74.240051 38.478298 N 804 0.259300 2.278812 2.293518
910072 id1146400 1 2016-02-15 18:57:32 2016-02-15 19:02:35 2 -73.989914 40.756634 -70.346077 36.398121 N 303 3.643837 4.358513 5.681037
923793 id1001696 1 2016-02-24 21:02:32 2016-02-24 21:25:57 1 -73.972366 40.758633 -79.553535 43.674000 N 1405 5.581169 2.915367 6.296730
974378 id1510552 2 2016-01-06 20:40:52 2016-01-06 20:51:03 5 -71.799896 35.081532 -79.352837 40.436329 N 611 7.552940 5.354797 9.258551
1013474 id3626673 1 2016-05-05 18:02:50 2016-05-05 18:18:23 1 -73.978912 40.756763 -79.338699 41.427902 N 933 5.359787 0.671139 5.401643
1060807 id0838705 1 2016-02-26 19:50:03 2016-02-26 20:08:54 1 -66.972160 44.371944 -69.048019 43.147583 N 1131 2.075859 1.224361 2.410031
1100676 id2644780 1 2016-05-03 16:24:07 2016-05-03 17:18:34 2 -73.991325 40.750023 -79.518616 43.921028 N 3267 5.527290 3.171005 6.372300
1301396 id1216866 1 2016-03-26 22:01:54 2016-03-27 00:47:16 1 -73.981491 40.773251 -76.135719 40.243626 N 9922 2.154228 0.529625 2.218378

In [19]:
df_train_copy[df_train_copy.trip_duration>3000]


Out[19]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration delta_long delta_la dist
55 id3827863 2 2016-04-19 11:29:08 2016-04-19 12:27:56 3 -73.792892 40.657879 -73.809189 40.690182 N 3528 0.016296 0.032303 0.036181
354 id3402983 2 2016-06-30 15:48:06 2016-06-30 17:31:13 1 -73.781898 40.644772 -73.985893 40.760159 N 6187 0.203995 0.115387 0.234367
403 id2693863 1 2016-03-18 08:22:10 2016-03-18 09:47:19 1 -73.777184 40.646500 -73.985001 40.760918 N 5109 0.207817 0.114418 0.237233
531 id3307903 2 2016-02-20 04:03:06 2016-02-21 03:33:00 3 -74.008102 40.741489 -74.009956 40.714611 N 84594 0.001854 0.026878 0.026942
563 id3607196 2 2016-01-26 11:22:27 2016-01-26 12:20:57 1 -74.017250 40.708477 -73.979927 40.761356 N 3510 0.037323 0.052879 0.064724
861 id2029339 2 2016-01-22 14:13:46 2016-01-22 15:15:21 1 -73.873360 40.774109 -73.958115 40.775558 N 3695 0.084755 0.001450 0.084767
976 id3579210 2 2016-01-25 21:05:42 2016-01-25 22:01:52 1 -73.782089 40.644650 -73.974243 40.789761 N 3370 0.192154 0.145111 0.240791
1031 id1211472 1 2016-05-12 14:11:19 2016-05-12 15:03:16 1 -73.951576 40.791344 -73.789146 40.641678 Y 3117 0.162430 0.149666 0.220869
1057 id0631822 2 2016-05-17 14:17:48 2016-05-17 15:26:06 1 -73.995583 40.716949 -73.789101 40.642448 N 4098 0.206482 0.074501 0.219511
1101 id3913560 2 2016-01-11 22:48:55 2016-01-11 23:40:20 2 -74.012657 40.702179 -74.307762 40.567341 N 3085 0.295105 0.134838 0.324451
1113 id3893063 2 2016-06-02 17:32:41 2016-06-02 18:42:43 1 -73.948578 40.778080 -73.781792 40.646790 N 4202 0.166786 0.131290 0.212261
1134 id1091477 2 2016-05-07 18:36:22 2016-05-08 18:32:11 1 -73.990242 40.750919 -73.976280 40.750889 N 86149 0.013962 0.000031 0.013962
1160 id1040844 1 2016-06-03 15:10:40 2016-06-03 16:36:32 3 -73.872971 40.774124 -73.975449 40.765354 N 5152 0.102478 0.008770 0.102853
1248 id2553024 1 2016-04-23 13:43:33 2016-04-23 14:47:19 4 -73.781555 40.644749 -73.989708 40.758984 N 3826 0.208153 0.114235 0.237439
1280 id0896335 1 2016-01-25 11:07:10 2016-01-25 12:04:58 1 -73.776794 40.645473 -73.958183 40.673565 N 3468 0.181389 0.028091 0.183551
1297 id0306216 2 2016-02-08 16:49:18 2016-02-08 17:44:53 2 -73.975739 40.758381 -73.783272 40.643829 N 3335 0.192467 0.114552 0.223977
1331 id3126063 2 2016-03-22 14:10:31 2016-03-22 15:01:06 1 -73.982307 40.723171 -73.807510 40.655022 N 3035 0.174797 0.068150 0.187612
1378 id2295021 2 2016-04-10 17:39:50 2016-04-10 18:48:17 6 -73.903511 40.639160 -73.903107 40.639210 N 4107 0.000404 0.000050 0.000407
1387 id2054300 1 2016-03-25 15:40:55 2016-03-25 16:33:23 1 -73.790184 40.646782 -73.962723 40.758728 N 3148 0.172539 0.111946 0.205673
1395 id3286220 1 2016-03-21 16:36:56 2016-03-21 17:42:45 1 -73.948822 40.773197 -73.782997 40.643906 N 3949 0.165825 0.129292 0.210272
1487 id0067459 2 2016-04-22 11:55:21 2016-04-22 12:50:38 1 -73.945900 40.786091 -73.885216 40.679943 N 3317 0.060684 0.106148 0.122270
1521 id1629416 1 2016-02-29 10:04:11 2016-02-29 10:59:56 1 -73.786102 40.639755 -74.003876 40.716850 N 3345 0.217773 0.077095 0.231017
1531 id0297286 2 2016-03-24 22:33:54 2016-03-24 23:25:02 5 -73.983299 40.766411 -73.936111 40.697361 N 3068 0.047188 0.069050 0.083634
1610 id0324418 2 2016-04-18 11:01:11 2016-04-18 11:52:40 1 -73.782066 40.644733 -73.939980 40.841206 N 3089 0.157913 0.196472 0.252067
1613 id3057665 1 2016-05-20 00:04:32 2016-05-20 01:40:36 1 -73.982430 40.756622 -73.915901 40.618385 N 5764 0.066528 0.138237 0.153413
1618 id0717835 1 2016-05-26 12:52:19 2016-05-26 13:43:45 1 -73.985916 40.736004 -73.789841 40.647041 N 3086 0.196075 0.088963 0.215314
1717 id1297383 1 2016-05-06 23:59:25 2016-05-07 00:50:42 1 -74.005775 40.733086 -73.813446 40.710449 N 3077 0.192329 0.022636 0.193657
1718 id2509513 1 2016-02-12 15:20:26 2016-02-12 16:11:35 1 -73.870949 40.773811 -74.008217 40.733921 N 3069 0.137268 0.039890 0.142947
1810 id3289160 2 2016-06-11 13:16:55 2016-06-11 14:09:27 1 -73.981949 40.761753 -73.981300 40.760811 N 3152 0.000648 0.000942 0.001144
1852 id1212821 2 2016-05-13 15:33:52 2016-05-13 16:35:02 1 -73.872917 40.774200 -73.982277 40.739410 N 3670 0.109360 0.034790 0.114760
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1456758 id2468950 1 2016-06-20 06:26:56 2016-06-20 07:35:47 3 -73.779076 40.647453 -73.984070 40.695400 N 4131 0.204994 0.047947 0.210527
1456769 id1594834 2 2016-04-01 13:52:37 2016-04-01 14:47:41 1 -74.008499 40.746078 -73.873009 40.774288 N 3304 0.135490 0.028210 0.138396
1456832 id3237194 1 2016-05-26 17:41:28 2016-05-26 18:32:41 1 -73.863472 40.770042 -73.988480 40.769516 N 3073 0.125008 0.000526 0.125009
1456907 id2902176 1 2016-06-03 13:55:02 2016-06-03 15:11:11 2 -73.807365 40.655212 -73.994232 40.745899 N 4569 0.186867 0.090687 0.207710
1456972 id2240253 2 2016-05-08 21:14:49 2016-05-08 22:06:39 1 -74.000961 40.746410 -73.916878 40.842361 N 3110 0.084084 0.095951 0.127580
1457118 id1737779 2 2016-04-11 14:58:26 2016-04-11 16:04:17 3 -73.781799 40.644886 -73.986588 40.769478 N 3951 0.204788 0.124592 0.239711
1457207 id1910950 2 2016-06-08 16:29:12 2016-06-09 16:11:03 1 -73.958061 40.800869 -73.964119 40.805515 N 85311 0.006058 0.004646 0.007634
1457307 id0949868 2 2016-06-30 17:44:48 2016-06-30 19:04:10 5 -73.997070 40.746990 -73.782570 40.644119 N 4762 0.214500 0.102871 0.237893
1457326 id0329809 2 2016-04-13 11:14:24 2016-04-13 12:09:45 6 -73.873009 40.774052 -73.974945 40.759502 N 3321 0.101936 0.014549 0.102969
1457659 id2081814 1 2016-05-13 06:57:13 2016-05-13 08:03:19 1 -73.781868 40.644672 -74.010361 40.718979 N 3966 0.228493 0.074306 0.240271
1457752 id1215198 2 2016-02-02 11:31:10 2016-02-03 11:29:44 6 -73.972069 40.794220 -73.961014 40.806728 N 86314 0.011055 0.012508 0.016693
1457871 id1549362 2 2016-02-18 15:54:26 2016-02-18 16:51:07 5 -73.782013 40.646851 -73.982254 40.755718 N 3401 0.200241 0.108868 0.227922
1457877 id1093816 1 2016-03-13 17:19:21 2016-03-13 18:11:55 1 -73.788727 40.641460 -73.991020 40.731461 N 3154 0.202293 0.090000 0.221411
1457965 id0452167 1 2016-02-10 07:46:50 2016-02-10 08:50:18 1 -73.943726 40.835865 -73.972069 40.762417 N 3808 0.028343 0.073448 0.078727
1458011 id0169298 2 2016-05-19 10:12:23 2016-05-19 11:07:13 5 -73.863136 40.769482 -73.997101 40.750431 N 3290 0.133965 0.019051 0.135312
1458034 id2290196 1 2016-04-01 20:16:44 2016-04-01 21:08:32 1 -73.780212 40.645531 -73.981400 40.763084 N 3108 0.201187 0.117554 0.233013
1458053 id0087401 2 2016-04-19 14:14:15 2016-04-19 15:06:46 4 -73.863487 40.769901 -73.985809 40.759800 N 3151 0.122322 0.010101 0.122738
1458058 id1648084 1 2016-05-06 16:06:33 2016-05-06 16:57:54 1 -73.863701 40.769882 -73.980873 40.764423 N 3081 0.117172 0.005459 0.117299
1458076 id2332290 2 2016-04-24 00:58:34 2016-04-25 00:56:16 1 -73.990479 40.760876 -73.983025 40.764557 N 86262 0.007454 0.003681 0.008313
1458142 id1140321 1 2016-06-21 09:19:51 2016-06-21 10:11:00 1 -73.988449 40.769115 -73.782249 40.644234 N 3069 0.206200 0.124882 0.241068
1458223 id0644773 1 2016-04-21 18:34:36 2016-04-21 19:24:51 4 -73.776634 40.645344 -73.976517 40.672150 N 3015 0.199883 0.026806 0.201672
1458247 id3078288 2 2016-01-07 14:40:39 2016-01-07 16:09:39 1 -73.974297 40.742706 -73.881180 40.766731 N 5340 0.093117 0.024025 0.096166
1458263 id3934738 2 2016-06-09 08:37:03 2016-06-09 09:27:50 5 -73.870918 40.773754 -73.998497 40.737244 N 3047 0.127579 0.036510 0.132700
1458275 id2979452 1 2016-05-25 12:57:52 2016-05-25 13:49:37 2 -73.866280 40.767677 -73.984802 40.768402 N 3105 0.118523 0.000725 0.118525
1458310 id1325943 1 2016-05-25 15:08:13 2016-05-25 15:58:24 1 -73.993568 40.724503 -73.955742 40.779602 N 3011 0.037827 0.055099 0.066834
1458328 id2824253 1 2016-03-03 08:09:29 2016-03-03 09:04:10 1 -73.961922 40.800533 -74.177269 40.691124 N 3281 0.215347 0.109409 0.241547
1458329 id0067309 1 2016-06-02 12:31:02 2016-06-02 13:31:49 4 -73.873878 40.773800 -73.973000 40.755688 N 3647 0.099121 0.018112 0.100762
1458333 id1758713 2 2016-04-17 14:56:46 2016-04-17 15:46:59 1 -73.782722 40.644966 -73.974808 40.750660 N 3013 0.192085 0.105694 0.219244
1458550 id2976426 1 2016-06-06 10:48:13 2016-06-06 11:40:31 3 -73.784454 40.648521 -73.972176 40.757133 N 3138 0.187721 0.108612 0.216877
1458600 id0995846 2 2016-05-09 17:26:56 2016-05-09 18:30:37 2 -73.789543 40.647099 -73.960320 40.798180 N 3821 0.170776 0.151081 0.228013

23823 rows × 14 columns


In [8]:
df_train_copy['vendor_id'].value_counts()


Out[8]:
2    780302
1    678342
Name: vendor_id, dtype: int64

In [9]:
df_train_copy['store_and_fwd_flag'].value_counts()


Out[9]:
N    1450599
Y       8045
Name: store_and_fwd_flag, dtype: int64

In [10]:
df_train_copy.columns


Out[10]:
Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'delta_long', 'delta_la', 'dist'],
      dtype='object')

In [11]:
df_train_copy['trip_duration'].describe()


Out[11]:
count    1.458644e+06
mean     9.594923e+02
std      5.237432e+03
min      1.000000e+00
25%      3.970000e+02
50%      6.620000e+02
75%      1.075000e+03
max      3.526282e+06
Name: trip_duration, dtype: float64

In [14]:
df_train_copy['dist'].describe()


Out[14]:
count    1.458644e+06
mean     3.548232e-02
std      4.596807e-02
min      0.000000e+00
25%      1.258443e-02
50%      2.121671e-02
75%      3.840884e-02
max      1.119260e+01
Name: dist, dtype: float64

把上車時間轉換成數字,只取月份和小時

血淚教訓:List速度>>>pandas


In [8]:
from datetime import datetime

t = datetime.strptime(df_train_copy['pickup_datetime'][0], '%Y-%m-%d %H:%M:%S')
df_train_copy['month'][0] = t.month
df_train_copy['hour'][0] = t.hour

print(df_train_copy['month'][0])
print(df_train_copy['hour'][0])


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
//anaconda/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2133             try:
-> 2134                 return self._engine.get_loc(key)
   2135             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'month'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-8-aa0342a008b4> in <module>()
      2 
      3 t = datetime.strptime(df_train_copy['pickup_datetime'][0], '%Y-%m-%d %H:%M:%S')
----> 4 df_train_copy['month'][0] = t.month
      5 df_train_copy['hour'][0] = t.hour
      6 

//anaconda/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

//anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

//anaconda/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

//anaconda/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3541 
   3542             if not isnull(item):
-> 3543                 loc = self.items.get_loc(item)
   3544             else:
   3545                 indexer = np.arange(len(self.items))[isnull(self.items)]

//anaconda/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'month'

In [10]:
from datetime import datetime

pickup_datetime = df_train_copy['pickup_datetime'].tolist()
month = []
hour = []

for i in range(len(df_train_copy)):
    t = datetime.strptime(pickup_datetime[i],'%Y-%m-%d %H:%M:%S')
    month.append(t.month)
    hour.append(t.hour)
    
    if i % 100000 == 0:
        print('already: ', i)

df_train_copy['month'] = month
df_train_copy['hour'] = hour


already:  0
already:  10000
already:  20000
already:  30000
already:  40000
already:  50000
already:  60000
already:  70000
already:  80000
already:  90000
already:  100000
already:  110000
already:  120000
already:  130000
already:  140000
already:  150000
already:  160000
already:  170000
already:  180000
already:  190000
already:  200000
already:  210000
already:  220000
already:  230000
already:  240000
already:  250000
already:  260000
already:  270000
already:  280000
already:  290000
already:  300000
already:  310000
already:  320000
already:  330000
already:  340000
already:  350000
already:  360000
already:  370000
already:  380000
already:  390000
already:  400000
already:  410000
already:  420000
already:  430000
already:  440000
already:  450000
already:  460000
already:  470000
already:  480000
already:  490000
already:  500000
already:  510000
already:  520000
already:  530000
already:  540000
already:  550000
already:  560000
already:  570000
already:  580000
already:  590000
already:  600000
already:  610000
already:  620000
already:  630000
already:  640000
already:  650000
already:  660000
already:  670000
already:  680000
already:  690000
already:  700000
already:  710000
already:  720000
already:  730000
already:  740000
already:  750000
already:  760000
already:  770000
already:  780000
already:  790000
already:  800000
already:  810000
already:  820000
already:  830000
already:  840000
already:  850000
already:  860000
already:  870000
already:  880000
already:  890000
already:  900000
already:  910000
already:  920000
already:  930000
already:  940000
already:  950000
already:  960000
already:  970000
already:  980000
already:  990000
already:  1000000
already:  1010000
already:  1020000
already:  1030000
already:  1040000
already:  1050000
already:  1060000
already:  1070000
already:  1080000
already:  1090000
already:  1100000
already:  1110000
already:  1120000
already:  1130000
already:  1140000
already:  1150000
already:  1160000
already:  1170000
already:  1180000
already:  1190000
already:  1200000
already:  1210000
already:  1220000
already:  1230000
already:  1240000
already:  1250000
already:  1260000
already:  1270000
already:  1280000
already:  1290000
already:  1300000
already:  1310000
already:  1320000
already:  1330000
already:  1340000
already:  1350000
already:  1360000
already:  1370000
already:  1380000
already:  1390000
already:  1400000
already:  1410000
already:  1420000
already:  1430000
already:  1440000
already:  1450000

In [11]:
df_train_copy


Out[11]:
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration delta_long delta_la dist month hour
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455 0.017525 0.002335 0.017680 3 17
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N 663 0.019066 0.007412 0.020456 6 0
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N 2124 0.026306 0.053852 0.059934 1 11
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 N 429 0.002228 0.013252 0.013438 4 19
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 N 435 0.000130 0.010689 0.010690 3 13
5 id0801584 2 2016-01-30 22:01:40 2016-01-30 22:09:03 6 -73.982857 40.742195 -73.992081 40.749184 N 443 0.009224 0.006989 0.011572 1 22
6 id1813257 1 2016-06-17 22:34:59 2016-06-17 22:40:40 4 -73.969017 40.757839 -73.957405 40.765896 N 341 0.011612 0.008057 0.014133 6 22
7 id1324603 2 2016-05-21 07:54:58 2016-05-21 08:20:49 1 -73.969276 40.797779 -73.922470 40.760559 N 1551 0.046806 0.037220 0.059801 5 7
8 id1301050 1 2016-05-27 23:12:23 2016-05-27 23:16:38 1 -73.999481 40.738400 -73.985786 40.732815 N 255 0.013695 0.005585 0.014790 5 23
9 id0012891 2 2016-03-10 21:45:01 2016-03-10 22:05:26 1 -73.981049 40.744339 -73.973000 40.789989 N 1225 0.008049 0.045650 0.046355 3 21
10 id1436371 2 2016-05-10 22:08:41 2016-05-10 22:29:55 1 -73.982651 40.763840 -74.002228 40.732990 N 1274 0.019577 0.030849 0.036537 5 22
11 id1299289 2 2016-05-15 11:16:11 2016-05-15 11:34:59 4 -73.991531 40.749439 -73.956543 40.770630 N 1128 0.034988 0.021191 0.040905 5 11
12 id1187965 2 2016-02-19 09:52:46 2016-02-19 10:11:20 2 -73.962982 40.756680 -73.984406 40.760719 N 1114 0.021423 0.004040 0.021801 2 9
13 id0799785 2 2016-06-01 20:58:29 2016-06-01 21:02:49 1 -73.956306 40.767941 -73.966110 40.763000 N 260 0.009804 0.004940 0.010978 6 20
14 id2900608 2 2016-05-27 00:43:36 2016-05-27 01:07:10 1 -73.992195 40.727226 -73.974655 40.783070 N 1414 0.017540 0.055843 0.058533 5 0
15 id3319787 1 2016-05-16 15:29:02 2016-05-16 15:32:33 1 -73.955513 40.768593 -73.948761 40.771545 N 211 0.006752 0.002953 0.007369 5 15
16 id3379579 2 2016-04-11 17:29:50 2016-04-11 18:08:26 1 -73.991165 40.755562 -73.999290 40.725353 N 2316 0.008125 0.030209 0.031282 4 17
17 id1154431 1 2016-04-14 08:48:26 2016-04-14 09:00:37 1 -73.994255 40.745804 -73.999657 40.723343 N 731 0.005402 0.022461 0.023101 4 8
18 id3552682 1 2016-06-27 09:55:13 2016-06-27 10:17:10 1 -74.003983 40.713013 -73.979195 40.749924 N 1317 0.024788 0.036911 0.044462 6 9
19 id3390316 2 2016-06-05 13:47:23 2016-06-05 13:51:34 1 -73.983887 40.738197 -73.991203 40.727871 N 251 0.007317 0.010326 0.012656 6 13
20 id2070428 1 2016-02-28 02:23:02 2016-02-28 02:31:08 1 -73.980370 40.742420 -73.962852 40.760635 N 486 0.017517 0.018215 0.025271 2 2
21 id0809232 2 2016-04-01 12:12:25 2016-04-01 12:23:17 1 -73.979538 40.753361 -73.963997 40.763458 N 652 0.015541 0.010098 0.018533 4 12
22 id2352683 1 2016-04-09 03:34:27 2016-04-09 03:41:30 1 -73.995865 40.758812 -73.993324 40.740322 N 423 0.002541 0.018490 0.018664 4 3
23 id1603037 1 2016-06-25 10:36:26 2016-06-25 10:55:49 1 -73.993553 40.747173 -74.006142 40.704384 N 1163 0.012589 0.042789 0.044603 6 10
24 id3321406 2 2016-06-03 08:15:05 2016-06-03 08:56:30 1 -73.955231 40.777134 -73.788750 40.641472 N 2485 0.166481 0.135662 0.214756 6 8
25 id0129640 2 2016-02-14 13:27:56 2016-02-14 13:49:19 1 -73.956581 40.771358 -73.974968 40.732792 N 1283 0.018387 0.038567 0.042725 2 13
26 id3587298 1 2016-02-27 21:56:01 2016-02-27 22:14:51 1 -73.983765 40.749874 -73.958832 40.800961 N 1130 0.024933 0.051086 0.056846 2 21
27 id2104175 1 2016-06-20 23:07:16 2016-06-20 23:18:50 1 -73.958435 40.713192 -73.949539 40.680252 N 694 0.008896 0.032940 0.034120 6 23
28 id3973319 2 2016-06-13 21:57:27 2016-06-13 22:12:19 1 -73.994217 40.713306 -73.982849 40.692299 N 892 0.011368 0.021008 0.023886 6 21
29 id1410897 1 2016-03-23 14:10:39 2016-03-23 14:49:30 1 -73.982117 40.756351 -73.865692 40.770988 N 2331 0.116425 0.014637 0.117341 3 14
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1458614 id2061444 2 2016-02-08 17:16:07 2016-02-08 17:21:45 1 -73.980927 40.767651 -73.965302 40.765251 N 338 0.015625 0.002399 0.015808 2 17
1458615 id3182230 1 2016-02-05 17:57:08 2016-02-05 18:11:25 1 -73.991013 40.728321 -73.966766 40.711548 N 857 0.024246 0.016773 0.029483 2 17
1458616 id2822294 1 2016-04-22 17:21:14 2016-04-22 17:29:22 1 -73.988327 40.732147 -73.999641 40.734192 N 488 0.011314 0.002045 0.011498 4 17
1458617 id0820021 2 2016-04-15 08:31:20 2016-04-15 08:34:48 1 -73.975433 40.752411 -73.973122 40.746780 N 208 0.002312 0.005630 0.006087 4 8
1458618 id1046767 2 2016-04-17 01:46:48 2016-04-17 01:52:55 1 -73.987564 40.733387 -74.001129 40.731056 N 367 0.013565 0.002331 0.013764 4 1
1458619 id1083860 2 2016-04-23 12:14:15 2016-04-23 12:26:03 1 -73.954773 40.777882 -73.980904 40.782516 N 708 0.026131 0.004635 0.026539 4 12
1458620 id0694577 2 2016-04-28 20:51:03 2016-04-28 21:10:25 1 -73.966324 40.758072 -74.006516 40.736641 N 1162 0.040192 0.021431 0.045548 4 20
1458621 id3267199 2 2016-05-09 14:33:30 2016-05-09 15:12:45 1 -73.959534 40.782749 -73.990959 40.751091 N 2355 0.031425 0.031658 0.044607 5 14
1458622 id0125435 2 2016-02-19 18:26:52 2016-02-19 18:36:04 1 -74.008408 40.721142 -74.000557 40.723911 N 552 0.007851 0.002769 0.008325 2 18
1458623 id3369208 1 2016-01-18 20:35:30 2016-01-18 20:44:44 1 -73.991081 40.737408 -73.987671 40.722622 N 554 0.003410 0.014786 0.015174 1 20
1458624 id3482902 1 2016-03-01 07:21:04 2016-03-01 07:23:36 1 -73.974693 40.756088 -73.969971 40.762115 N 152 0.004723 0.006027 0.007657 3 7
1458625 id3730733 2 2016-01-25 17:21:15 2016-01-25 17:54:37 1 -73.989655 40.740612 -73.961029 40.765366 N 2002 0.028625 0.024754 0.037844 1 17
1458626 id0155863 2 2016-01-17 17:21:11 2016-01-17 17:25:15 2 -73.954071 40.767021 -73.950340 40.778233 N 244 0.003731 0.011211 0.011816 1 17
1458627 id0439281 2 2016-06-23 10:10:28 2016-06-23 10:25:08 5 -73.981651 40.767708 -73.959183 40.777412 N 880 0.022469 0.009705 0.024475 6 10
1458628 id0986544 2 2016-05-30 03:08:19 2016-05-30 03:14:10 2 -73.988632 40.721378 -73.975548 40.728519 N 351 0.013084 0.007141 0.014906 5 3
1458629 id3109086 2 2016-06-24 10:33:51 2016-06-24 10:43:52 1 -73.959618 40.808941 -73.947922 40.830189 N 601 0.011696 0.021248 0.024254 6 10
1458630 id0287353 2 2016-06-25 03:44:32 2016-06-25 03:53:41 5 -73.991508 40.727135 -73.988136 40.740932 N 549 0.003372 0.013798 0.014204 6 3
1458631 id1724231 1 2016-05-14 23:18:23 2016-05-14 23:24:05 3 -73.958946 40.763725 -73.953156 40.780003 N 342 0.005791 0.016277 0.017277 5 23
1458632 id0469946 2 2016-03-06 11:04:48 2016-03-06 11:17:45 2 -74.015572 40.710892 -73.996620 40.743633 N 777 0.018951 0.032742 0.037831 3 11
1458633 id2432342 1 2016-03-17 19:10:16 2016-03-17 19:26:35 3 -73.979652 40.735279 -73.995522 40.759754 N 979 0.015869 0.024475 0.029170 3 19
1458634 id3445276 1 2016-04-03 13:51:25 2016-04-03 14:07:37 2 -73.989075 40.730465 -73.963882 40.773739 N 972 0.025192 0.043274 0.050073 4 13
1458635 id3027038 2 2016-05-19 14:46:55 2016-05-19 14:50:52 1 -73.985390 40.763020 -73.989708 40.767502 N 237 0.004318 0.004482 0.006224 5 14
1458636 id0405770 2 2016-02-12 10:13:06 2016-02-12 10:26:26 1 -73.863815 40.769684 -73.864395 40.761326 N 800 0.000580 0.008358 0.008378 2 10
1458637 id1920898 1 2016-04-17 18:48:16 2016-04-17 19:00:56 1 -73.975357 40.751705 -73.949478 40.776764 N 760 0.025879 0.025059 0.036023 4 18
1458638 id1454193 2 2016-02-02 00:39:39 2016-02-02 00:46:33 5 -73.988823 40.736553 -73.989166 40.757393 N 414 0.000343 0.020840 0.020843 2 0
1458639 id2376096 2 2016-04-08 13:31:04 2016-04-08 13:44:02 4 -73.982201 40.745522 -73.994911 40.740170 N 778 0.012711 0.005352 0.013791 4 13
1458640 id1049543 1 2016-01-10 07:35:15 2016-01-10 07:46:10 1 -74.000946 40.747379 -73.970184 40.796547 N 655 0.030762 0.049168 0.057998 1 7
1458641 id2304944 2 2016-04-22 06:57:41 2016-04-22 07:10:25 1 -73.959129 40.768799 -74.004433 40.707371 N 764 0.045303 0.061428 0.076327 4 6
1458642 id2714485 1 2016-01-05 15:56:26 2016-01-05 16:02:39 1 -73.982079 40.749062 -73.974632 40.757107 N 373 0.007446 0.008045 0.010962 1 15
1458643 id1209952 1 2016-04-05 14:44:25 2016-04-05 14:47:43 1 -73.979538 40.781750 -73.972809 40.790585 N 198 0.006729 0.008835 0.011106 4 14

1458644 rows × 16 columns


In [13]:
from datetime import datetime

pickup_datetime_test = df_test_copy['pickup_datetime'].tolist()
month = []
hour = []

for i in range(len(df_test_copy)):
    t = datetime.strptime(pickup_datetime_test[i],'%Y-%m-%d %H:%M:%S')
    month.append(t.month)
    hour.append(t.hour)
    
    if i % 100000 == 0:
        print('already: ', i)

df_test_copy['month'] = month
df_test_copy['hour'] = hour


already:  0
already:  10000
already:  20000
already:  30000
already:  40000
already:  50000
already:  60000
already:  70000
already:  80000
already:  90000
already:  100000
already:  110000
already:  120000
already:  130000
already:  140000
already:  150000
already:  160000
already:  170000
already:  180000
already:  190000
already:  200000
already:  210000
already:  220000
already:  230000
already:  240000
already:  250000
already:  260000
already:  270000
already:  280000
already:  290000
already:  300000
already:  310000
already:  320000
already:  330000
already:  340000
already:  350000
already:  360000
already:  370000
already:  380000
already:  390000
already:  400000
already:  410000
already:  420000
already:  430000
already:  440000
already:  450000
already:  460000
already:  470000
already:  480000
already:  490000
already:  500000
already:  510000
already:  520000
already:  530000
already:  540000
already:  550000
already:  560000
already:  570000
already:  580000
already:  590000
already:  600000
already:  610000
already:  620000

In [14]:
df_test_copy


Out[14]:
id vendor_id pickup_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag delta_long delta_la dist month hour
0 id3004672 1 2016-06-30 23:59:58 1 -73.988129 40.732029 -73.990173 40.756680 N 0.002045 0.024651 0.024735 6 23
1 id3505355 1 2016-06-30 23:59:53 1 -73.964203 40.679993 -73.959808 40.655403 N 0.004395 0.024590 0.024979 6 23
2 id1217141 1 2016-06-30 23:59:47 1 -73.997437 40.737583 -73.986160 40.729523 N 0.011276 0.008060 0.013861 6 23
3 id2150126 2 2016-06-30 23:59:41 1 -73.956070 40.771900 -73.986427 40.730469 N 0.030357 0.041431 0.051363 6 23
4 id1598245 1 2016-06-30 23:59:33 1 -73.970215 40.761475 -73.961510 40.755890 N 0.008705 0.005585 0.010343 6 23
5 id0668992 1 2016-06-30 23:59:30 1 -73.991302 40.749798 -73.980515 40.786549 N 0.010788 0.036751 0.038301 6 23
6 id1765014 1 2016-06-30 23:59:15 1 -73.978310 40.741550 -73.952072 40.717003 N 0.026237 0.024548 0.035930 6 23
7 id0898117 1 2016-06-30 23:59:09 2 -74.012711 40.701527 -73.986481 40.719509 N 0.026230 0.017982 0.031802 6 23
8 id3905224 2 2016-06-30 23:58:55 2 -73.992332 40.730511 -73.875618 40.875214 N 0.116714 0.144703 0.185906 6 23
9 id1543102 2 2016-06-30 23:58:46 1 -73.993179 40.748760 -73.979309 40.761311 N 0.013870 0.012550 0.018705 6 23
10 id3024712 1 2016-06-30 23:58:32 4 -73.968529 40.678432 -73.966591 40.635712 N 0.001938 0.042721 0.042765 6 23
11 id3665810 2 2016-06-30 23:58:05 1 -73.982773 40.756908 -73.974693 40.753330 N 0.008080 0.003578 0.008836 6 23
12 id1836461 1 2016-06-30 23:58:01 1 -73.921104 40.767292 -73.936859 40.774044 N 0.015755 0.006752 0.017141 6 23
13 id3457080 2 2016-06-30 23:57:57 1 -73.986801 40.734917 -73.975899 40.756893 N 0.010902 0.021976 0.024532 6 23
14 id3376065 1 2016-06-30 23:57:25 1 -73.996346 40.748161 -73.950829 40.782825 N 0.045517 0.034664 0.057214 6 23
15 id3008739 1 2016-06-30 23:57:22 1 -73.968025 40.762283 -73.934792 40.797436 N 0.033234 0.035152 0.048375 6 23
16 id0902216 2 2016-06-30 23:56:44 1 -74.007713 40.740681 -73.968811 40.753860 N 0.038902 0.013180 0.041074 6 23
17 id3564824 2 2016-06-30 23:55:36 5 -73.984299 40.724983 -73.981819 40.740597 N 0.002480 0.015614 0.015809 6 23
18 id0820280 2 2016-06-30 23:55:28 1 -73.952599 40.768322 -73.948555 40.773724 N 0.004044 0.005402 0.006747 6 23
19 id0775088 2 2016-06-30 23:55:20 1 -73.966690 40.794090 -73.920776 40.830059 N 0.045914 0.035969 0.058325 6 23
20 id1468488 2 2016-06-30 23:55:13 1 -73.994690 40.725819 -73.987160 40.729259 N 0.007530 0.003441 0.008279 6 23
21 id2657479 1 2016-06-30 23:55:12 1 -73.965950 40.758068 -73.977524 40.742527 N 0.011574 0.015541 0.019377 6 23
22 id1262719 2 2016-06-30 23:55:04 5 -73.986382 40.762001 -73.966148 40.762089 N 0.020233 0.000088 0.020233 6 23
23 id1345524 2 2016-06-30 23:54:55 1 -73.955986 40.714069 -73.980682 40.675735 N 0.024696 0.038334 0.045600 6 23
24 id2911638 1 2016-06-30 23:54:45 1 -73.984100 40.742760 -73.956001 40.784809 N 0.028099 0.042049 0.050574 6 23
25 id2849512 2 2016-06-30 23:54:16 3 -73.872993 40.773979 -73.962440 40.774712 N 0.089447 0.000732 0.089450 6 23
26 id0236829 1 2016-06-30 23:53:06 1 -73.967621 40.762856 -73.952301 40.782181 N 0.015320 0.019325 0.024661 6 23
27 id2905906 1 2016-06-30 23:52:34 1 -73.985359 40.759548 -73.973267 40.763294 Y 0.012093 0.003746 0.012660 6 23
28 id3737939 1 2016-06-30 23:52:15 2 -73.987526 40.765511 -73.938713 40.849827 N 0.048813 0.084316 0.097427 6 23
29 id0766179 2 2016-06-30 23:51:42 2 -73.985344 40.747356 -73.978378 40.675571 N 0.006966 0.071785 0.072122 6 23
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
625104 id0120169 1 2016-01-01 00:13:21 1 -73.968399 40.799793 -73.974541 40.787251 N 0.006142 0.012543 0.013966 1 0
625105 id0386349 2 2016-01-01 00:11:38 5 -73.988403 40.737289 -73.992073 40.749142 N 0.003670 0.011852 0.012407 1 0
625106 id1962532 2 2016-01-01 00:11:33 1 -74.005394 40.739971 -73.989204 40.723003 N 0.016190 0.016968 0.023452 1 0
625107 id0335207 2 2016-01-01 00:11:10 2 -73.982239 40.771309 -73.963661 40.774353 N 0.018578 0.003044 0.018825 1 0
625108 id0273508 2 2016-01-01 00:10:40 3 -74.006516 40.744549 -73.938103 40.792294 N 0.068413 0.047745 0.083426 1 0
625109 id2936770 2 2016-01-01 00:10:15 1 -73.976822 40.751690 -73.955719 40.776470 N 0.021103 0.024780 0.032548 1 0
625110 id0044526 2 2016-01-01 00:10:09 1 -73.976547 40.750626 -73.975777 40.745041 N 0.000771 0.005585 0.005638 1 0
625111 id3605431 2 2016-01-01 00:10:02 1 -74.001259 40.747192 -73.978333 40.779781 N 0.022926 0.032589 0.039845 1 0
625112 id2681896 1 2016-01-01 00:09:44 1 -73.998657 40.739952 -73.983231 40.734612 N 0.015427 0.005341 0.016325 1 0
625113 id3308448 2 2016-01-01 00:09:40 5 -73.976784 40.774750 -73.973106 40.756870 N 0.003677 0.017879 0.018254 1 0
625114 id2108525 1 2016-01-01 00:09:17 1 -73.976669 40.765736 -73.983246 40.749500 N 0.006577 0.016235 0.017517 1 0
625115 id3952220 2 2016-01-01 00:08:38 1 -73.992180 40.759155 -73.958916 40.618134 N 0.033264 0.141022 0.144892 1 0
625116 id2771348 1 2016-01-01 00:08:36 1 -73.976189 40.765678 -73.982147 40.744007 N 0.005959 0.021671 0.022476 1 0
625117 id3065313 2 2016-01-01 00:08:30 1 -74.002258 40.745335 -73.995331 40.663185 N 0.006927 0.082150 0.082441 1 0
625118 id2332834 2 2016-01-01 00:07:20 2 -73.999954 40.728645 -73.936699 40.750092 N 0.063255 0.021446 0.066792 1 0
625119 id3495407 1 2016-01-01 00:07:13 2 -73.997025 40.720509 -73.998932 40.734013 N 0.001907 0.013504 0.013638 1 0
625120 id3811106 2 2016-01-01 00:06:57 6 -73.990585 40.740227 -74.000893 40.729008 N 0.010307 0.011219 0.015235 1 0
625121 id2693698 1 2016-01-01 00:06:00 1 -73.999481 40.748959 -74.008507 40.745422 N 0.009026 0.003536 0.009694 1 0
625122 id2884571 2 2016-01-01 00:05:39 2 -74.000259 40.730247 -73.999969 40.741158 N 0.000290 0.010910 0.010914 1 0
625123 id2790343 2 2016-01-01 00:05:14 1 -73.956131 40.778793 -74.001259 40.736439 N 0.045128 0.042355 0.061891 1 0
625124 id1901191 1 2016-01-01 00:05:12 1 -73.988899 40.718777 -73.972511 40.743450 N 0.016388 0.024673 0.029620 1 0
625125 id0664662 1 2016-01-01 00:05:02 1 -73.969040 40.790852 -73.967690 40.761066 N 0.001350 0.029785 0.029816 1 0
625126 id2073829 1 2016-01-01 00:05:01 1 -74.002586 40.733627 -73.998955 40.744518 N 0.003632 0.010891 0.011480 1 0
625127 id0328287 1 2016-01-01 00:03:38 2 -73.952110 40.777416 -73.958450 40.764320 N 0.006340 0.013096 0.014550 1 0
625128 id1340822 1 2016-01-01 00:03:00 1 -73.973167 40.764042 -73.974464 40.757187 N 0.001297 0.006855 0.006977 1 0
625129 id3008929 1 2016-01-01 00:02:52 1 -74.003464 40.725105 -74.001251 40.733643 N 0.002213 0.008537 0.008819 1 0
625130 id3700764 1 2016-01-01 00:01:52 1 -74.006363 40.743782 -73.953407 40.782467 N 0.052956 0.038685 0.065581 1 0
625131 id2568735 1 2016-01-01 00:01:24 2 -73.972267 40.759865 -73.876602 40.748665 N 0.095665 0.011200 0.096318 1 0
625132 id1384355 1 2016-01-01 00:00:28 1 -73.976501 40.733562 -73.854263 40.891788 N 0.122238 0.158226 0.199944 1 0
625133 id0621643 2 2016-01-01 00:00:22 2 -73.981850 40.716881 -73.969330 40.769379 N 0.012520 0.052498 0.053970 1 0

625134 rows × 14 columns

整理所需參數


In [15]:
df_train_para = df_train_copy[['vendor_id', 'passenger_count','store_and_fwd_flag',
                               'trip_duration','dist','month','hour']]

df_test_para = df_test_copy[['vendor_id', 'passenger_count','store_and_fwd_flag',
                             'dist','month','hour']]

In [22]:
df_train_para['month'].value_counts()


Out[22]:
3    256189
4    251645
5    248487
2    238300
6    234316
1    229707
Name: month, dtype: int64

In [25]:
sns.boxplot(df_train_para['hour'], df_train_para['trip_duration'])
plt.show()



In [23]:
df_test_para['month'].value_counts()


Out[23]:
3    109697
5    107570
4    107432
2    102314
6    100445
1     97676
Name: month, dtype: int64

剔除outlier


In [31]:
df_train_s = df_train_para[(df_train_para.trip_duration < 2500) &
                           (df_train_para.dist < 0.04) &
                           (df_train_para.dist > 0.01) ]

In [32]:
import matplotlib.pyplot as plt
plt.scatter(df_train_s.dist, df_train_s.trip_duration, alpha=0.01)
plt.show()



In [33]:
sns.boxplot(df_train_s['hour'], df_train_s['trip_duration'])
plt.show()



In [34]:
sns.boxplot(df_train_s['month'], df_train_s['trip_duration'])
plt.show()



In [53]:
sns.boxplot(df_train_s['passenger_count'], df_train_s['trip_duration'])
plt.show()



In [54]:
df_train_s['passenger_count'].value_counts()


Out[54]:
1    620561
2    124453
5     46437
3     35956
6     29012
4     16811
0         5
Name: passenger_count, dtype: int64

In [35]:
df_train_s['trip_duration'].describe()


Out[35]:
count    873235.000000
mean        675.163420
std         344.399329
min           4.000000
25%         425.000000
50%         606.000000
75%         850.000000
max        2499.000000
Name: trip_duration, dtype: float64

In [36]:
df_train_s['dist'].describe()


Out[36]:
count    873235.000000
mean          0.021175
std           0.007947
min           0.010000
25%           0.014457
50%           0.019651
75%           0.026826
max           0.040000
Name: dist, dtype: float64

In [38]:
df_test_para


Out[38]:
vendor_id passenger_count store_and_fwd_flag dist month hour
0 1 1 N 0.024735 6 23
1 1 1 N 0.024979 6 23
2 1 1 N 0.013861 6 23
3 2 1 N 0.051363 6 23
4 1 1 N 0.010343 6 23
5 1 1 N 0.038301 6 23
6 1 1 N 0.035930 6 23
7 1 2 N 0.031802 6 23
8 2 2 N 0.185906 6 23
9 2 1 N 0.018705 6 23
10 1 4 N 0.042765 6 23
11 2 1 N 0.008836 6 23
12 1 1 N 0.017141 6 23
13 2 1 N 0.024532 6 23
14 1 1 N 0.057214 6 23
15 1 1 N 0.048375 6 23
16 2 1 N 0.041074 6 23
17 2 5 N 0.015809 6 23
18 2 1 N 0.006747 6 23
19 2 1 N 0.058325 6 23
20 2 1 N 0.008279 6 23
21 1 1 N 0.019377 6 23
22 2 5 N 0.020233 6 23
23 2 1 N 0.045600 6 23
24 1 1 N 0.050574 6 23
25 2 3 N 0.089450 6 23
26 1 1 N 0.024661 6 23
27 1 1 Y 0.012660 6 23
28 1 2 N 0.097427 6 23
29 2 2 N 0.072122 6 23
... ... ... ... ... ... ...
625104 1 1 N 0.013966 1 0
625105 2 5 N 0.012407 1 0
625106 2 1 N 0.023452 1 0
625107 2 2 N 0.018825 1 0
625108 2 3 N 0.083426 1 0
625109 2 1 N 0.032548 1 0
625110 2 1 N 0.005638 1 0
625111 2 1 N 0.039845 1 0
625112 1 1 N 0.016325 1 0
625113 2 5 N 0.018254 1 0
625114 1 1 N 0.017517 1 0
625115 2 1 N 0.144892 1 0
625116 1 1 N 0.022476 1 0
625117 2 1 N 0.082441 1 0
625118 2 2 N 0.066792 1 0
625119 1 2 N 0.013638 1 0
625120 2 6 N 0.015235 1 0
625121 1 1 N 0.009694 1 0
625122 2 2 N 0.010914 1 0
625123 2 1 N 0.061891 1 0
625124 1 1 N 0.029620 1 0
625125 1 1 N 0.029816 1 0
625126 1 1 N 0.011480 1 0
625127 1 2 N 0.014550 1 0
625128 1 1 N 0.006977 1 0
625129 1 1 N 0.008819 1 0
625130 1 1 N 0.065581 1 0
625131 1 2 N 0.096318 1 0
625132 1 1 N 0.199944 1 0
625133 2 2 N 0.053970 1 0

625134 rows × 6 columns


In [40]:
df_test_para['vendor_id'].value_counts()


Out[40]:
2    334374
1    290760
Name: vendor_id, dtype: int64

In [41]:
df_test_para['store_and_fwd_flag'].value_counts()


Out[41]:
N    621704
Y      3430
Name: store_and_fwd_flag, dtype: int64

In [42]:
df_test_para['dist'].describe()


Out[42]:
count    625134.000000
mean          0.035400
std           0.045585
min           0.000000
25%           0.012590
50%           0.021216
75%           0.038454
max          10.385000
Name: dist, dtype: float64

In [43]:
df_ans.head()


Out[43]:
id trip_duration
0 id3004672 959
1 id3505355 959
2 id1217141 959
3 id2150126 959
4 id1598245 959

In [245]:
df_ans[df_ans.id == 0]


Out[245]:
id trip_duration
2892 0 1.0
2991 0 1.0
9325 0 1.0
9414 0 1.0
9419 0 1.0
9440 0 1.0
9441 0 1.0
12383 0 1.0
12473 0 1.0
14891 0 1.0
14971 0 1.0
15044 0 1.0
18436 0 1.0
18524 0 1.0
18528 0 1.0
25860 0 1.0
25921 0 1.0
25959 0 1.0
25963 0 1.0
29396 0 1.0
32738 0 1.0
32751 0 1.0
32795 0 1.0
35791 0 1.0
35800 0 1.0
35807 0 1.0
35818 0 1.0
41592 0 1.0
41616 0 1.0
41643 0 1.0
... ... ...
596217 0 1.0
596343 0 1.0
596459 0 1.0
599914 0 1.0
599927 0 1.0
599994 0 1.0
600087 0 1.0
600186 0 1.0
603741 0 1.0
603805 0 1.0
603845 0 1.0
603868 0 1.0
607095 0 1.0
607141 0 1.0
610400 0 1.0
610447 0 1.0
613501 0 1.0
613512 0 1.0
616376 0 1.0
618614 0 1.0
618755 0 1.0
618819 0 1.0
621561 0 1.0
621626 0 1.0
621733 0 1.0
624091 0 1.0
624096 0 1.0
624116 0 1.0
624261 0 1.0
624343 0 1.0

459 rows × 2 columns

製作feature和labels:vendor,flag,month,hour做onehot_encoding


In [55]:
df_train_feature = df_train_s[['vendor_id','store_and_fwd_flag','dist','month','hour']]
df_train_label = df_train_s['trip_duration']

df_test_feature = df_test_para[['vendor_id','store_and_fwd_flag','dist','month','hour']]

In [56]:
df_train_feature = pd.get_dummies(data=df_train_feature, 
                                  columns=['vendor_id','store_and_fwd_flag',
                                           'month','hour'])

df_test_feature = pd.get_dummies(data=df_test_feature, 
                                  columns=['vendor_id','store_and_fwd_flag',
                                           'month','hour'])

In [57]:
df_train_feature.head()


Out[57]:
dist vendor_id_1 vendor_id_2 store_and_fwd_flag_N store_and_fwd_flag_Y month_1 month_2 month_3 month_4 month_5 ... hour_14 hour_15 hour_16 hour_17 hour_18 hour_19 hour_20 hour_21 hour_22 hour_23
0 0.017680 0 1 1 0 0 0 1 0 0 ... 0 0 0 1 0 0 0 0 0 0
1 0.020456 1 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0.013438 0 1 1 0 0 0 0 1 0 ... 0 0 0 0 0 1 0 0 0 0
4 0.010690 0 1 1 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0.011572 0 1 1 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0

5 rows × 35 columns


In [58]:
df_train_feature.columns


Out[58]:
Index(['dist', 'vendor_id_1', 'vendor_id_2', 'store_and_fwd_flag_N',
       'store_and_fwd_flag_Y', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object')

In [62]:
df_train_feature['dist'].describe()


Out[62]:
count    873235.000000
mean          0.021175
std           0.007947
min           0.010000
25%           0.014457
50%           0.019651
75%           0.026826
max           0.040000
Name: dist, dtype: float64

In [59]:
df_test_feature.head()


Out[59]:
dist vendor_id_1 vendor_id_2 store_and_fwd_flag_N store_and_fwd_flag_Y month_1 month_2 month_3 month_4 month_5 ... hour_14 hour_15 hour_16 hour_17 hour_18 hour_19 hour_20 hour_21 hour_22 hour_23
0 0.024735 1 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1 0.024979 1 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2 0.013861 1 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 0.051363 0 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
4 0.010343 1 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 35 columns


In [60]:
df_test_feature.columns


Out[60]:
Index(['dist', 'vendor_id_1', 'vendor_id_2', 'store_and_fwd_flag_N',
       'store_and_fwd_flag_Y', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',
       'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',
       'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object')

In [61]:
df_test_feature['dist'].describe()


Out[61]:
count    625134.000000
mean          0.035400
std           0.045585
min           0.000000
25%           0.012590
50%           0.021216
75%           0.038454
max          10.385000
Name: dist, dtype: float64

In [141]:
train_feature = df_train_feature.values
test_feature = df_test_feature.values
train_label = df_train_label.values

In [130]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
train_feature_trans = min_max_scaler.fit_transform(train_feature.reshape(-1,35))
test_feature_trans = min_max_scaler.fit_transform(test_feature.reshape(-1,35))

In [123]:
train_feature_trans[0]


Out[123]:
array([ 0.25598384,  0.        ,  1.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [118]:
train_feature.shape


Out[118]:
(873235, 35)

In [132]:
test_feature_trans[0]


Out[132]:
array([ 0.00238182,  1.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ])

In [67]:
train_label = train_label.reshape(-1,1)

In [68]:
train_label


Out[68]:
array([[455],
       [663],
       [429],
       ..., 
       [778],
       [373],
       [198]])

跑模型囉!!


In [190]:
import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


######################### 建立模型
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import load_model

model = Sequential()

model.add(Dense(units=100, 
                input_dim=35,
                kernel_initializer='uniform'))
model.add(Dropout(0.5))

model.add(Dense(units=100,
                kernel_initializer='uniform'))
model.add(Dropout(0.5))

model.add(Dense(units=50,
                kernel_initializer='uniform'))
model.add(Dropout(0.5))

model.add(Dense(units=1))

print(model.summary())

#可以反覆訓練fine tune
weights_path = 'Savemodels/TaxiDuration(Kaggles)_MLP.h5'
model.load_weights(weights_path)

model.compile(loss='mean_squared_logarithmic_error',   
              optimizer='adam', metrics=['accuracy'])

train_history = model.fit(train_feature, train_label,
                          validation_split=0.2, epochs=20, batch_size=50000, verbose=2) 


######################### 訓練過程視覺化
show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')

#儲存訓練結果
model.save_weights("Savemodels/TaxiDuration(Kaggles)_MLP.h5")
print('model saved to disk')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_73 (Dense)             (None, 100)               3600      
_________________________________________________________________
dropout_54 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_74 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_55 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_75 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_56 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_76 (Dense)             (None, 1)                 51        
=================================================================
Total params: 18,801
Trainable params: 18,801
Non-trainable params: 0
_________________________________________________________________
None
Train on 698588 samples, validate on 174647 samples
Epoch 1/20
11s - loss: 0.1971 - acc: 0.0016 - val_loss: 0.1609 - val_acc: 0.0016
Epoch 2/20
11s - loss: 0.1878 - acc: 0.0017 - val_loss: 0.1546 - val_acc: 0.0019
Epoch 3/20
12s - loss: 0.1865 - acc: 0.0017 - val_loss: 0.1549 - val_acc: 0.0018
Epoch 4/20
13s - loss: 0.1860 - acc: 0.0017 - val_loss: 0.1536 - val_acc: 0.0016
Epoch 5/20
13s - loss: 0.1859 - acc: 0.0017 - val_loss: 0.1537 - val_acc: 0.0018
Epoch 6/20
13s - loss: 0.1857 - acc: 0.0016 - val_loss: 0.1541 - val_acc: 0.0019
Epoch 7/20
13s - loss: 0.1858 - acc: 0.0017 - val_loss: 0.1541 - val_acc: 0.0019
Epoch 8/20
13s - loss: 0.1854 - acc: 0.0017 - val_loss: 0.1541 - val_acc: 0.0019
Epoch 9/20
14s - loss: 0.1857 - acc: 0.0016 - val_loss: 0.1539 - val_acc: 0.0021
Epoch 10/20
13s - loss: 0.1859 - acc: 0.0016 - val_loss: 0.1538 - val_acc: 0.0017
Epoch 11/20
13s - loss: 0.1855 - acc: 0.0016 - val_loss: 0.1538 - val_acc: 0.0018
Epoch 12/20
14s - loss: 0.1856 - acc: 0.0017 - val_loss: 0.1538 - val_acc: 0.0018
Epoch 13/20
13s - loss: 0.1859 - acc: 0.0018 - val_loss: 0.1539 - val_acc: 0.0020
Epoch 14/20
13s - loss: 0.1859 - acc: 0.0017 - val_loss: 0.1539 - val_acc: 0.0019
Epoch 15/20
14s - loss: 0.1855 - acc: 0.0017 - val_loss: 0.1539 - val_acc: 0.0019
Epoch 16/20
14s - loss: 0.1855 - acc: 0.0017 - val_loss: 0.1537 - val_acc: 0.0018
Epoch 17/20
14s - loss: 0.1857 - acc: 0.0016 - val_loss: 0.1543 - val_acc: 0.0019
Epoch 18/20
17s - loss: 0.1854 - acc: 0.0018 - val_loss: 0.1538 - val_acc: 0.0018
Epoch 19/20
14s - loss: 0.1856 - acc: 0.0016 - val_loss: 0.1538 - val_acc: 0.0017
Epoch 20/20
14s - loss: 0.1855 - acc: 0.0018 - val_loss: 0.1538 - val_acc: 0.0019
model saved to disk

In [233]:
prediction = model.predict(test_feature)

In [234]:
import matplotlib.pyplot as plt
plt.hist(prediction, range=(0,1500), bins = 20)
plt.show()



In [235]:
prediction[0]


Out[235]:
array([ 658.69891357], dtype=float32)

In [236]:
prediction<0


Out[236]:
array([[False],
       [False],
       [False],
       ..., 
       [False],
       [False],
       [False]], dtype=bool)

In [237]:
df_ans['trip_duration'] = prediction

In [238]:
df_ans


Out[238]:
id trip_duration
0 id3004672 658.698914
1 id3505355 664.361084
2 id1217141 406.257385
3 id2150126 1276.813354
4 id1598245 324.580902
5 id0668992 973.631470
6 id1765014 918.587402
7 id0898117 822.753113
8 id3905224 4400.172363
9 id1543102 518.692566
10 id3024712 1077.243774
11 id3665810 289.587524
12 id1836461 482.393799
13 id3457080 653.956177
14 id3376065 1412.667114
15 id3008739 1207.490112
16 id0902216 1037.971436
17 id3564824 451.457336
18 id0820280 241.093140
19 id0775088 1438.441162
20 id1468488 276.649902
21 id2657479 534.316345
22 id1262719 554.161133
23 id1345524 1143.043579
24 id2911638 1258.527466
25 id2849512 2160.988525
26 id0236829 656.974670
27 id2905906 390.893555
28 id3737939 2346.187988
29 id0766179 1758.731079
... ... ...
625104 id0120169 355.541199
625105 id0386349 319.337341
625106 id1962532 575.737732
625107 id0335207 468.326477
625108 id0273508 1967.992188
625109 id2936770 786.898804
625110 id0044526 162.181168
625111 id3605431 956.296326
625112 id2681896 410.309784
625113 id3308448 455.057373
625114 id2108525 437.978241
625115 id3952220 3394.890869
625116 id2771348 553.093018
625117 id3065313 1945.131592
625118 id2332834 1581.847168
625119 id3495407 347.935974
625120 id3811106 384.980469
625121 id2693698 256.367401
625122 id2884571 284.666626
625123 id2790343 1468.061279
625124 id1901191 718.947998
625125 id0664662 723.492249
625126 id2073829 297.848938
625127 id0328287 369.102020
625128 id1340822 193.294464
625129 id3008929 236.071793
625130 id3700764 1553.753784
625131 id2568735 2267.314209
625132 id1384355 4672.929199
625133 id0621643 1284.193237

625134 rows × 2 columns


In [239]:
df_ans['trip_duration'].describe()


Out[239]:
count    625134.000000
mean        961.263977
std        1055.356079
min         -75.861900
25%         443.087776
50%         641.375122
75%        1030.174469
max      241286.125000
Name: trip_duration, dtype: float64

In [240]:
df_ans[df_ans['trip_duration']>900]


Out[240]:
id trip_duration
3 id2150126 1276.813354
5 id0668992 973.631470
6 id1765014 918.587402
8 id3905224 4400.172363
10 id3024712 1077.243774
14 id3376065 1412.667114
15 id3008739 1207.490112
16 id0902216 1037.971436
19 id0775088 1438.441162
23 id1345524 1143.043579
24 id2911638 1258.527466
25 id2849512 2160.988525
28 id3737939 2346.187988
29 id0766179 1758.731079
31 id3864673 1758.454224
34 id1686925 1563.794434
36 id1860783 1036.201660
39 id2844603 1836.709595
41 id1408427 2745.732422
42 id1712395 1009.190430
53 id0364901 1067.034424
55 id0814840 4428.150391
56 id3159174 3708.820801
57 id3650870 2752.457520
63 id0527242 944.176575
64 id1890566 1265.187622
65 id0882843 5028.599609
66 id2126329 3596.704834
75 id1707352 1759.266968
76 id1650458 940.201111
... ... ...
625037 id0628544 1175.407837
625039 id2955883 1221.528198
625044 id1186878 1486.027954
625047 id1674078 2840.673340
625049 id3578600 1203.533813
625050 id3125631 1460.024414
625054 id0332493 2740.439697
625056 id2411893 985.152283
625062 id2980593 915.679504
625064 id0885135 1126.859253
625066 id3219550 1307.898926
625070 id2346166 2727.847656
625071 id3076640 2169.414062
625076 id0689827 3979.948242
625080 id1027135 1369.531738
625081 id0256919 1077.124146
625082 id3919531 1476.884155
625090 id2282537 1087.634033
625095 id0876134 1374.796387
625099 id1670163 1120.195190
625108 id0273508 1967.992188
625111 id3605431 956.296326
625115 id3952220 3394.890869
625117 id3065313 1945.131592
625118 id2332834 1581.847168
625123 id2790343 1468.061279
625130 id3700764 1553.753784
625131 id2568735 2267.314209
625132 id1384355 4672.929199
625133 id0621643 1284.193237

191269 rows × 2 columns


In [242]:
df_ans[df_ans.trip_duration<0]


Out[242]:
id trip_duration
2892 0 -12.690598
2991 0 -31.010937
9325 0 -10.982977
9414 0 -41.040585
9419 0 -39.292061
9440 0 -3.148615
9441 0 -30.874996
12383 0 -12.909513
12473 0 -41.011528
14891 0 -13.086611
14971 0 -31.073009
15044 0 -31.073009
18436 0 -12.888595
18524 0 -31.102066
18528 0 -30.512924
25860 0 -9.407928
25921 0 -12.191135
25959 0 -36.496025
25963 0 -41.011528
29396 0 -30.006660
32738 0 -13.115671
32751 0 -12.196654
32795 0 -1.484802
35791 0 -13.086611
35800 0 -21.641125
35807 0 -41.011528
35818 0 -1.253344
41592 0 -36.011440
41616 0 -30.676970
41643 0 -29.884901
... ... ...
596217 0 -52.376923
596343 0 -18.180151
596459 0 -9.101417
599914 0 -46.859680
599927 0 -30.613188
599994 0 -65.952454
600087 0 -17.885429
600186 0 -7.884256
603741 0 -47.223053
603805 0 -74.487122
603845 0 -63.456509
603868 0 -7.730523
607095 0 -15.979280
607141 0 -75.663872
610400 0 -29.581120
610447 0 -65.923409
613501 0 -11.531940
613512 0 -28.689337
616376 0 -41.309258
618614 0 -65.952454
618755 0 -9.101417
618819 0 -9.072357
621561 0 -47.327480
621626 0 -65.587334
621733 0 -9.072357
624091 0 -27.420218
624096 0 -58.307053
624116 0 -16.993511
624261 0 -7.612906
624343 0 -7.217021

459 rows × 2 columns


In [243]:
ans = df_ans['trip_duration'].tolist()

for i in range(len(ans)):
    if ans[i] < 0:
        ans[i] = 1
        
df_ans['trip_duration'] = ans

In [244]:
df_ans[df_ans['id']==0]


Out[244]:
id trip_duration
2892 0 1.0
2991 0 1.0
9325 0 1.0
9414 0 1.0
9419 0 1.0
9440 0 1.0
9441 0 1.0
12383 0 1.0
12473 0 1.0
14891 0 1.0
14971 0 1.0
15044 0 1.0
18436 0 1.0
18524 0 1.0
18528 0 1.0
25860 0 1.0
25921 0 1.0
25959 0 1.0
25963 0 1.0
29396 0 1.0
32738 0 1.0
32751 0 1.0
32795 0 1.0
35791 0 1.0
35800 0 1.0
35807 0 1.0
35818 0 1.0
41592 0 1.0
41616 0 1.0
41643 0 1.0
... ... ...
596217 0 1.0
596343 0 1.0
596459 0 1.0
599914 0 1.0
599927 0 1.0
599994 0 1.0
600087 0 1.0
600186 0 1.0
603741 0 1.0
603805 0 1.0
603845 0 1.0
603868 0 1.0
607095 0 1.0
607141 0 1.0
610400 0 1.0
610447 0 1.0
613501 0 1.0
613512 0 1.0
616376 0 1.0
618614 0 1.0
618755 0 1.0
618819 0 1.0
621561 0 1.0
621626 0 1.0
621733 0 1.0
624091 0 1.0
624096 0 1.0
624116 0 1.0
624261 0 1.0
624343 0 1.0

459 rows × 2 columns


In [246]:
df_ans.to_csv('TaxiDuration_ans.csv',mode = 'w', index=False)

In [ ]: