In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from statsmodels.distributions.empirical_distribution import ECDF
import datetime as dt
plt.style.use('seaborn-whitegrid')
plt.rcParams['image.cmap'] = 'Blues'
#sns.set_context('notebook',font_scale=2)
sns.set_style("whitegrid")
%matplotlib inline
labelsize = 22
mpl.rcParams.update({'font.size': labelsize})
mpl.rcParams.update({'figure.figsize': (20,10)})
mpl.rcParams.update({'axes.titlesize': 'large'})
mpl.rcParams.update({'axes.labelsize': 'large'})
mpl.rcParams.update({'xtick.labelsize': labelsize})
mpl.rcParams.update({'ytick.labelsize': labelsize})
In [2]:
!cd data && ls
Use bash to check the available data files. =)
In [3]:
data = pd.read_csv('data/bike_oneweekfrom20140505.csv', index_col=0, parse_dates=True)
In [4]:
data.info()
In [5]:
data.columns
Out[5]:
Rename the columns so that they are consistent with the taxi dataset.
In [6]:
new_column_names = ['trip_time', 'pickup_datetime', 'dropoff_datetime', 'start_station_id',
'start_station_name', 'pickup_latitude',
'pickup_longitude', 'end_station_id', 'end_station_name',
'dropoff_latitude', 'dropoff_longitude', 'bikeid', 'usertype',
'birth year', 'gender']
In [7]:
data.columns = new_column_names
In [8]:
data.describe()
Out[8]:
Parse the dates.
In [9]:
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
data['dropoff_datetime'] = pd.to_datetime(data['dropoff_datetime'], format='%Y-%m-%d %H:%M:%S')
data['trip_time'] = pd.to_timedelta(data['trip_time'], unit='s')
In [10]:
data.describe().transpose()
Out[10]:
In [11]:
data.head()
Out[11]:
In [12]:
data.info()
Have a look at every station. How many pickups are there per station?
In [13]:
group_by_start = data.groupby(data.start_station_id)
usage_freq = group_by_start.trip_time.count()
print('Number of stations: ' + str(len(usage_freq)))
plt.hist(usage_freq, bins=50)
plt.title('Frequency of starts from start stations')
Out[13]:
Looking at the map, it is not surprising that the station at Grand Central Terminal is the most frequently used one.
In [14]:
group_by_start_name = data.groupby(data.start_station_name, )
usage_freq_name = group_by_start_name.trip_time.count()
usage_freq_name.sort_values().keys()[-10:]
Out[14]:
In [15]:
usage_freq_name.sort_values().keys()
Out[15]:
So we have 326 different start stations.
In [16]:
group_by_end_name = data.groupby(data.end_station_name, )
usage_freq_name = group_by_end_name.trip_time.count()
usage_freq_name.sort_values().keys()
Out[16]:
We have 326 different end stations, too. But are all end stations contained in the start stations and vice versa?
In [17]:
group_by_start_id = data.groupby(data.start_station_id, )
start_ids = group_by_start_id.trip_time.count().sort_values().keys()
group_by_end_id = data.groupby(data.end_station_id, )
end_ids = group_by_end_id.trip_time.count().sort_values().keys()
len(set(start_ids).intersection(end_ids))
Out[17]:
This confirms that every station is used both as a start and as an end station.
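A stricter check than counting the intersection is to compare the two ID sets directly (a minimal sketch reusing the start_ids and end_ids computed above):
In [ ]:
print(set(start_ids) == set(end_ids))  # True only if every station appears on both sides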
In [18]:
data.isnull().sum()
Out[18]:
So there is not much missing data. That is somewhat surprising, so it may be worth double-checking.
In [19]:
(data==0).sum()
Out[19]:
So only the gender feature contains many zeros. Gender can take three values (0 = unknown, 1 = male, 2 = female).
In [20]:
from collections import Counter
print(Counter(data.gender))
Counter(data['birth year'])
Out[20]:
So we also have missing values in "gender" and "birth year". But since we are not interested in these features, we can ignore the missing values at this point.
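For readability, the gender codes could also be counted with labels instead of raw numbers (a small sketch, assuming the coding 0 = unknown, 1 = male, 2 = female stated above; `gender_labels` is just an illustrative name):
In [ ]:
gender_labels = {0: 'unknown', 1: 'male', 2: 'female'}
print(data.gender.map(gender_labels).value_counts())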
A quick overview of the trip times.
In [21]:
plt.hist((data['trip_time'] / np.timedelta64(1, 'm')), bins=30, range=[0, 100])
plt.title('Distribution of trip_time in minutes')
plt.xlabel('trip_time')
plt.ylabel('frequency')
plt.savefig('figures/bike_trip_time.png', format='png', dpi=300)
We are interested in the trip time in minutes.
In [22]:
sns.boxplot((data['trip_time'] / np.timedelta64(1, 'm')))
Out[22]:
As we can see, we have many outliers.
In [23]:
data.trip_time.quantile([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
Out[23]:
In [24]:
len(data.trip_time.value_counts().values)
Out[24]:
Identify the cases without geo data and remove them from the data to be processed.
In [25]:
# .copy() so that the flag column can be added below without a SettingWithCopyWarning
anomaly = data.loc[(data['dropoff_longitude'].isnull()) | (data['dropoff_latitude'].isnull()) |
                   (data['pickup_longitude'].isnull()) | (data['pickup_latitude'].isnull())].copy()
data = data.drop(anomaly.index)
In [26]:
anomaly['flag'] = 'geo_NA'
In [27]:
data.isnull().sum()
Out[27]:
So what percentage of the data is left to be processed?
In [28]:
len(data)/(len(data)+len(anomaly))
Out[28]:
In [29]:
anomaly.tail()
Out[29]:
In [30]:
plt.hist(data.trip_time.values / np.timedelta64(1, 'm'), bins=50, range=[0,100])
Out[30]:
Calculate the distance of each trip from the pickup and dropoff longitude / latitude coordinates, converted to meters.
In [31]:
data['trip_dist'] = -1.0  # init trip_dist (as float, since the distances in meters are floats)
Be aware that this operation is quite slow.
In [32]:
# input for vincenty: (location.latitude, location.longitude)
from geopy.distance import vincenty
for i in range(len(data)):  # iterate over all rows, not len(data)-1
    pickup = (data.iloc[i]['pickup_latitude'], data.iloc[i]['pickup_longitude'])
    dropoff = (data.iloc[i]['dropoff_latitude'], data.iloc[i]['dropoff_longitude'])
    data.set_value(data.index[i], 'trip_dist', vincenty(pickup, dropoff).meters)  # use the index label, not the position
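The row-wise iloc lookups are what make the loop above slow. A vectorized great-circle (haversine) formula is a much faster approximation, at the cost of ignoring the ellipsoid correction that vincenty applies; `haversine_meters` is an illustrative helper, not the function used in the rest of this notebook:
In [ ]:
def haversine_meters(lat1, lon1, lat2, lon2, radius=6371000.0):
    # vectorized great-circle distance in meters on a spherical Earth
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = np.sin((lat2 - lat1) / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2)**2
    return 2 * radius * np.arcsin(np.sqrt(a))

# data['trip_dist'] = haversine_meters(data.pickup_latitude, data.pickup_longitude,
#                                      data.dropoff_latitude, data.dropoff_longitude)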
In [33]:
data.trip_dist
Out[33]:
We cannot use miles because vincenty ceils / floors the results
In [34]:
data.columns
Out[34]:
Check that the new trip_dist column is present.
In [35]:
# data.to_csv('data/bike_20140505_with_dist.csv')
In [36]:
data['avg_velocity'] = data.trip_dist.values / (data.trip_time / np.timedelta64(1, 'h'))  # meters per hour
In [37]:
data.avg_velocity
Out[37]:
Convert to km/h.
In [38]:
data.avg_velocity = data.avg_velocity / 1000  # m/h -> km/h
In [39]:
data.avg_velocity
Out[39]:
Convert avg_velocity from km/h to miles/h. (1 km = 0.621371 miles)
In [40]:
data['avg_velocity'] = data.avg_velocity * 0.621371  # km/h -> mph
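The three steps above (meters per hour, then km/h, then mph) can also be collapsed into one expression; a compact equivalent sketch (1 mile = 1609.344 meters):
In [ ]:
METERS_PER_MILE = 1609.344
# average velocity in mph, computed directly from meters and hours
data['avg_velocity'] = (data.trip_dist / METERS_PER_MILE) / (data.trip_time / np.timedelta64(1, 'h'))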
In [41]:
# data.to_csv('data/bike_20140505_with_dist_and_avg_velo.csv')
In [42]:
plt.hist(data.avg_velocity, bins=60, range=[0,15])
plt.title('Distribution of avg_velocity in mph')
plt.xlabel('avg_velocity')
plt.ylabel('frequency')
plt.savefig('figures/bike_avg_vel.png', format='png', dpi=300)
In [43]:
data.avg_velocity.describe() # in mph
Out[43]:
In [44]:
data.head()
Out[44]:
In [45]:
data.avg_velocity.quantile([.0001,.01, .5, .75, .95, .975, .99, .995])
Out[45]:
In [46]:
lb = 5
ub = 15
# note: this overwrites the geo_NA anomalies from above; append (concat) instead if all flags should be kept together
anomaly = data.loc[(data['avg_velocity'] < lb) | (data['avg_velocity'] > ub)].copy()
anomaly.loc[data.loc[data['avg_velocity'] < lb].index, 'flag'] = 'too_slow'
anomaly.loc[data.loc[data['avg_velocity'] > ub].index, 'flag'] = 'too_fast'
data = data.drop(anomaly.index, errors='ignore')  # ignore labels that are no longer contained
print(1 - len(data) / (len(data) + len(anomaly)))
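As the comment above notes, the velocity outliers overwrite the earlier geo_NA anomalies. If all flagged trips should be kept together, the frames could be concatenated instead (a sketch; `geo_anomaly` is a hypothetical copy that would have to be saved before `anomaly` is reassigned above):
In [ ]:
# geo_anomaly = anomaly.copy()                        # taken right after the geo_NA flagging
# all_anomalies = pd.concat([geo_anomaly, anomaly])   # geo_NA, too_slow and too_fast together
# all_anomalies.flag.value_counts()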
In [47]:
anomaly.head()
Out[47]:
In [48]:
data.avg_velocity.describe()
Out[48]:
In [49]:
anomaly.tail()
Out[49]:
In [50]:
x = data.pickup_longitude
y = data.pickup_latitude
bins = 100
H, xedges, yedges = np.histogram2d(x, y, bins=bins)
plt.jet()
fig = plt.figure(figsize=(20, 10))
#ax.set_title('pcolormesh: exact bin edges')
#mesh = plt.pcolormesh(X, Y, H)
plt.hist2d(x,y,bins=bins)
plt.colorbar()
plt.scatter(xedges[79], yedges[61], marker='x')
ax = fig.gca()
ax.grid(False)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Pickup Frequency')
#ax.set_aspect('equal')
#plt.savefig('figure.pdf', format='pdf')
#plt.savefig('figure.png', format='png')
plt.savefig('figures/bike_stations.png', format='png', dpi=300)
Where is the pickup maximum?
In [51]:
np.amax(H)
Out[51]:
In [52]:
np.where(H == H.max())  # indices of the bin with the maximum pickups
Out[52]:
The area with the maximum pickups is around the Grand Central Terminal. Not very surprising.
In [53]:
yedges[61], xedges[79]  # (latitude, longitude) of the bin with the most pickups
Out[53]:
In [54]:
type(H)
Out[54]:
In [55]:
print(H.shape)
print(H.size)
print(H.max())
In [56]:
print('Approximate bin size (diagonal of the first bin) in meters:')
print(vincenty((yedges[0], xedges[0]), (yedges[1], xedges[1])).meters)  # vincenty expects (latitude, longitude)
In [57]:
distances = (-1) * np.ones(len(xedges) - 1)
for x in range(len(xedges) - 1):
    distances[x] = vincenty((yedges[x], xedges[x]), (yedges[x + 1], xedges[x + 1])).meters  # (lat, lon) order
In [58]:
print('Sizes (diagonals) of all bins in meters:')
distances
Out[58]:
In [59]:
(H==0).sum()/H.size
Out[59]:
So about 97% of the bins contain zero pickups. This was expected, because pickups happen only at the fixed stations.
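A quick sanity check: with 326 fixed stations spread over 100 x 100 = 10,000 bins, at most about 3% of the bins can be non-empty, which matches the roughly 97% empty bins observed above:
In [ ]:
n_stations = data.start_station_id.nunique()
print(n_stations / H.size)           # upper bound on the share of non-empty bins (~0.033)
print(1 - (H == 0).sum() / H.size)   # actual share of non-empty bins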
In [60]:
jfk_geodata = (40.641547, -73.778118)
ridgefield_geodata = (40.856406, -74.020642)
data_in_box = data.loc[(data['dropoff_latitude'] > jfk_geodata[0]) &
(data['dropoff_longitude'] < jfk_geodata[1]) &
(data['dropoff_latitude'] < ridgefield_geodata[0]) &
(data['dropoff_longitude'] > ridgefield_geodata[1]) &
(data['pickup_latitude'] > jfk_geodata[0]) &
(data['pickup_longitude'] < jfk_geodata[1]) &
(data['pickup_latitude'] < ridgefield_geodata[0]) &
(data['pickup_longitude'] > ridgefield_geodata[1])
]
# taxidata = taxidata.drop(anomaly.index)
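The same bounding box can be expressed as a small helper, which keeps the filter readable if it is reused; `in_box` is an illustrative name, equivalent to the condition above (jfk_geodata is the south-east corner, ridgefield_geodata the north-west corner):
In [ ]:
def in_box(lat, lon, south_east=jfk_geodata, north_west=ridgefield_geodata):
    # True where the point lies strictly inside the box spanned by the two corners
    return (lat > south_east[0]) & (lat < north_west[0]) & (lon < south_east[1]) & (lon > north_west[1])

# data_in_box = data[in_box(data.pickup_latitude, data.pickup_longitude) &
#                    in_box(data.dropoff_latitude, data.dropoff_longitude)]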
In [61]:
len(data_in_box)/len(data)
Out[61]:
In [62]:
data_in_box.head()
Out[62]:
Let's take a first look at the distribution of the cleaned target variable which we want to estimate:
In [63]:
h = data_in_box.avg_velocity.values
plt.figure(figsize=(20,10))
plt.hist(h, normed=False, bins=150)
#, histtype='stepfilled')
#plt.yscale('log')
#plt.ylabel('log(freq x)', fontsize=40)
#plt.xlabel('x = avg_amount_per_minute', fontsize=40)
#print('Min:' + str(min(h)) + '\nMax:' + str(max(h)))
#plt.locator_params(axis = 'x', nbins = 20)
plt.show()
In [64]:
data_in_box.head()
Out[64]:
In [65]:
time_regression_df = pd.DataFrame([ #data_in_box['pickup_datetime'].dt.day, # leave this out
data_in_box['pickup_datetime'].dt.dayofweek,
data_in_box['pickup_datetime'].dt.hour,
data_in_box['pickup_latitude'],
data_in_box['pickup_longitude'],
data_in_box['dropoff_latitude'],
data_in_box['dropoff_longitude'],
np.ceil(data_in_box['avg_velocity'])
]).T
In [66]:
time_regression_df.columns = [#'pickup_datetime_day',
'pickup_datetime_dayofweek', 'pickup_datetime_hour',
'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
'avg_velocity_mph']
Predict the average velocity rounded up to whole mph (the np.ceil above). These coarse classes are definitely more robust to predict than the raw continuous values.
In [67]:
time_regression_df.head()
Out[67]:
In [68]:
time_regression_df.tail()
Out[68]:
In [69]:
time_regression_df.iloc[:, 0:7].describe()
Out[69]:
In [70]:
print(time_regression_df.avg_velocity_mph.value_counts())
print(len(time_regression_df.avg_velocity_mph.value_counts()))
So we have 10 different velocity classes to predict.
In [71]:
from sklearn import cross_validation as cv
y = time_regression_df['avg_velocity_mph']
X = time_regression_df.iloc[:, 0:6]
X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.05, random_state=0)
In [72]:
from sklearn import cross_validation as cv
time_regression_df_train, time_regression_df_test = cv.train_test_split(time_regression_df, test_size=0.05, random_state=99)
y_train = time_regression_df_train['avg_velocity_mph']
x_train = time_regression_df_train.iloc[:, 0:6]
y_test = time_regression_df_test['avg_velocity_mph']
x_test = time_regression_df_test.iloc[:, 0:6]
In [73]:
x_test.head()
Out[73]:
In [74]:
y_test.head()
Out[74]:
In [75]:
xy_test = pd.concat([x_test, y_test], axis=1)
In [76]:
xy_test.head()
Out[76]:
If you want to export something...
In [77]:
#Xy_test.to_csv('taxi_tree_test_Xy_20130506-12.csv')
#X_test.to_csv('taxi_tree_test_X_20130506-12.csv')
#y_test.to_csv('taxi_tree_test_y_20130506-12.csv')
# Xy_test.to_csv('bike_tree_test_Xy_20140505-11.csv')
# X_test.to_csv('bike_tree_test_X_20140505-11.csv')
# y_test.to_csv('bike_tree_test_y_20140506-11.csv')
In [78]:
# Xy_test_sample = Xy_test.sample(10000, random_state=99)
In [79]:
# Xy_test_sample.to_csv('taxi_tree_test_Xy_sample.csv')
In [80]:
# Xy_test_sample.head()
Just to be sure, check the shapes.
In [81]:
print(x_train.shape)
print(x_train.size)
print(x_test.shape)
print(X.shape)
print(x_train.shape[0]+x_test.shape[0])
In [82]:
import time
# Import the necessary modules and libraries
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import matplotlib.pyplot as plt
In [83]:
regtree = DecisionTreeRegressor(min_samples_split=100, random_state=99, max_depth=20)  # formerly max_depth=15; 15 is also reasonable
# random states: 99
regtree.fit(x_train, y_train)
regtree.score(x_test, y_test)
Out[83]:
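The score above comes from a single 5% hold-out split; a quick cross-validated check shows how stable that R² is (a sketch with the same tree settings, using the X and y defined earlier):
In [ ]:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(DecisionTreeRegressor(min_samples_split=100, random_state=99, max_depth=20),
                         X, y, cv=5)
print(scores.mean(), scores.std())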
In [84]:
y_pred = regtree.predict(x_test)
diff_regtree = (y_pred-y_test)
# plt.figure(figsize=(12,10)) # not needed. set values globally
plt.hist(diff_regtree.values, bins=100, range=[-6,6])
print('Percentiles (%): ', [1,5,10,15,25,50,75,90,95,99], '\n', np.percentile(diff_regtree.values, [1,5,10,15,25,50,75,90,95,99]))
print('Sum of absolute deviations (in thousands of mph): ', sum(abs(diff_regtree))/1000)
plt.title('Simple Decision Tree Regressor')
plt.xlabel('deviation in mph')
plt.ylabel('frequency')
plt.savefig('figures/bike_tree.png', format='png', dpi=300)
In [85]:
diff_regtree.describe()
Out[85]:
In [86]:
from sklearn.ensemble import RandomForestRegressor
n_est_list = list(range(2, 32, 8))
min_sam_leaf_list = [2, 50, 100, 150]
max_depth_list = list(range(2, 32, 8))
results = np.empty([0, 4])
for n_est in n_est_list:
    for min_sam_leaf in min_sam_leaf_list:
        for max_depth in max_depth_list:
            rd_regtree = RandomForestRegressor(n_estimators=n_est, n_jobs=6, min_samples_leaf=min_sam_leaf,
                                               random_state=99, max_depth=max_depth)
            rd_regtree.fit(x_train, y_train)
            score = rd_regtree.score(x_test, y_test)
            results = np.vstack((results, [n_est, min_sam_leaf, max_depth, score]))
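The nested loops above are a manual grid search; the same sweep could be written with sklearn's grid search utility, which does the bookkeeping and uses cross-validation on the training data instead of the fixed test split (a sketch, not the run whose results are analyzed below):
In [ ]:
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer versions
param_grid = {'n_estimators': n_est_list,
              'min_samples_leaf': min_sam_leaf_list,
              'max_depth': max_depth_list}
search = GridSearchCV(RandomForestRegressor(random_state=99, n_jobs=6), param_grid, cv=3)
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)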
In [87]:
best = np.where(results[:, 3] == results[:, 3].max())[0]  # row(s) with the best score
results[best, :]
Out[87]:
Look at the ten best results.
In [88]:
results[np.argsort(results[:, 3])][-10:,:]
Out[88]:
In [89]:
results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))
#results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))
#results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))
results[:,3]
Out[89]:
In [90]:
from sklearn.ensemble import RandomForestRegressor
#rd_regtree = RandomForestRegressor(n_estimators=10,n_jobs=6,min_samples_leaf=4, random_state=99, max_depth=20)
rd_regtree = RandomForestRegressor(n_estimators=26,n_jobs=6,min_samples_leaf=2, random_state=99, max_depth=26)
#total sum of diff: 1132
#rd_regtree = RandomForestRegressor(n_estimators=40,n_jobs=-1,min_samples_split=3, random_state=99, max_depth=11)
#total sum of diff: 1129
rd_regtree.fit(x_train, y_train)
print('R²: ', rd_regtree.score(x_test, y_test))
In [91]:
rd_regtree.feature_importances_
Out[91]:
In [92]:
tree_list = rd_regtree.estimators_
for i in range(len(tree_list)):
    print(tree_list[i].feature_importances_)
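The raw arrays are hard to read without the column names; pairing the importances of the full forest with the feature names makes the comparison easier (a small sketch):
In [ ]:
pd.Series(rd_regtree.feature_importances_, index=x_train.columns).sort_values(ascending=False)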
In [93]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(results[:,0],results[:,2],results[:,3], marker='o')
plt.show()
So this plot is not particularly useful in this case, but it hints at how the results could be visualized.
In [94]:
y_pred = rd_regtree.predict(x_test)
np.linalg.norm(np.ceil(y_pred)-y_test)
diff_rd = (y_pred-y_test)
# plt.figure(figsize=(12,10)) # not needed. set values globally
plt.hist(diff_rd.values, bins=100, range=[-6,6])
print('Percentiles (%): ', [1,5,10,15,25,50,75,90,95,99], '\n', np.percentile(diff_rd.values, [1,5,10,15,25,50,75,90,95,99]))
print('Sum of absolute deviations (in thousands of mph): ', sum(abs(diff_rd))/1000)
plt.title('Random Forest Regressor')
plt.xlabel('deviation in mph')
plt.ylabel('frequency')
plt.savefig('figures/bike_randomforest.png', format='png', dpi=150)
In [95]:
diff_rd.describe()
Out[95]:
In [99]:
from sklearn.externals import joblib
joblib.dump(rd_regtree, 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl', protocol=2)
Out[99]:
In [100]:
! cd randforlib && zip bike_regtree_26x_depth_26_mss_2_PY27.zip bike_regtree_26x_depth_26_mss_2_PY27.pkl*
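A quick round-trip check that the persisted model loads and predicts as before (a sketch; the path is the one written above):
In [ ]:
restored = joblib.load('randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl')
print(restored.score(x_test, y_test))  # should match the R² reported above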