In [1]:
import os as os
import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from statsmodels.distributions.empirical_distribution import ECDF
import datetime as dt
plt.style.use('seaborn-whitegrid')

plt.rcParams['image.cmap'] = 'blue'

#sns.set_context('notebook',font_scale=2)
sns.set_style("whitegrid")
% matplotlib inline
labelsize = 22
mpl.rcParams.update({'font.size': labelsize})
mpl.rcParams.update({'figure.figsize': (20,10)})
mpl.rcParams.update({'axes.titlesize': 'large'})
mpl.rcParams.update({'axes.labelsize': 'large'})
mpl.rcParams.update({'xtick.labelsize': labelsize})
mpl.rcParams.update({'ytick.labelsize': labelsize})

In [2]:
!cd data && ls


2014-05 - Citi Bike trip data.csv
bike_20140505_with_dist_and_avg_velo.csv
bike_20140505_with_dist.csv
bike_20140505_with_dist_velo.csv
bike_oneweekfrom20140505.csv
data_jan13.csv
README.txt
Taxi_from_2013-05-06_to_2013-05-13.csv
Taxi_from_2013-05-06_to_2013-05-13_testset.csv
Taxi_from_2013-05-06_to_2013-05-13testset.csv
Taxi_from_2013-05-06_to_2013-05-13_xtrain.csv
Taxi_from_2013-05-06_to_2013-05-13_ytrain.csv
taxi_oneweekfrom20130107.csv
taxi_oneweekfrom20130506.csv
taxi_oneweekfrom20140505.csv
taxi_tree_test_X_20130506-12.csv
taxi_tree_test_X.csv
taxi_tree_test_Xy_20130506-12.csv
taxi_tree_test_Xy.csv
taxi_tree_test_Xy_sample.csv
taxi_tree_test_y_20130506-12.csv
taxi_tree_test_y.csv
_testset.csv
yellow_tripdata_2013-05.csv
yellow_tripdata_2014-05.csv

Use the bash =)


In [3]:
data = pd.read_csv('data/bike_oneweekfrom20140505.csv', index_col=0, parse_dates=True)

In [4]:
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 185744 entries, 0 to 185743
Data columns (total 15 columns):
tripduration               185744 non-null int64
starttime                  185744 non-null object
stoptime                   185744 non-null object
start station id           185744 non-null int64
start station name         185744 non-null object
start station latitude     185744 non-null float64
start station longitude    185744 non-null float64
end station id             185744 non-null int64
end station name           185744 non-null object
end station latitude       185744 non-null float64
end station longitude      185744 non-null float64
bikeid                     185744 non-null int64
usertype                   185744 non-null object
birth year                 185744 non-null object
gender                     185744 non-null int64
dtypes: float64(4), int64(5), object(6)
memory usage: 22.7+ MB

In [5]:
data.columns


Out[5]:
Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender'],
      dtype='object')

Rename the columns. They're now streamlined with the taxi input.


In [6]:
new_column_names = ['trip_time', 'pickup_datetime', 'dropoff_datetime', 'start_station_id',
       'start_station_name', 'pickup_latitude',
       'pickup_longitude', 'end_station_id', 'end_station_name',
       'dropoff_latitude', 'dropoff_longitude', 'bikeid', 'usertype',
       'birth year', 'gender']

In [7]:
data.columns = new_column_names

In [8]:
data.describe()


Out[8]:
trip_time start_station_id pickup_latitude pickup_longitude end_station_id dropoff_latitude dropoff_longitude bikeid gender
count 185744.000000 185744.000000 185744.000000 185744.000000 185744.000000 185744.000000 185744.000000 185744.000000 185744.000000
mean 869.523107 450.726511 40.734630 -73.991079 451.404142 40.734387 -73.991147 18157.530305 1.090598
std 974.098663 370.232259 0.019489 0.012186 375.448276 0.019530 0.012298 2119.989210 0.550519
min 60.000000 72.000000 40.680342 -74.017134 72.000000 40.680342 -74.017134 14529.000000 0.000000
25% 401.000000 304.000000 40.721655 -73.999947 303.000000 40.721101 -74.000040 16315.000000 1.000000
50% 640.000000 405.000000 40.736494 -73.990765 404.000000 40.736197 -73.990931 18164.000000 1.000000
75% 1059.000000 490.000000 40.750073 -73.982050 489.000000 40.749718 -73.982050 20005.000000 1.000000
max 21549.000000 3002.000000 40.771522 -73.953809 3002.000000 40.771522 -73.953809 21679.000000 2.000000

Parse the dates.


In [9]:
data['pickup_datetime'] =pd.to_datetime(data['pickup_datetime'], format = '%Y-%m-%d %H:%M:%S')
data['dropoff_datetime'] =pd.to_datetime(data['dropoff_datetime'], format = '%Y-%m-%d %H:%M:%S')
data['trip_time'] = pd.to_timedelta(data['trip_time'], 's')

In [10]:
data.describe().transpose()


Out[10]:
count mean std min 25% 50% 75% max
trip_time 185744 0 days 00:14:29.523107 0 days 00:16:14.098662 0 days 00:01:00 0 days 00:06:41 0 days 00:10:40 0 days 00:17:39 0 days 05:59:09
start_station_id 185744 450.727 370.232 72 304 405 490 3002
pickup_latitude 185744 40.7346 0.0194894 40.6803 40.7217 40.7365 40.7501 40.7715
pickup_longitude 185744 -73.9911 0.012186 -74.0171 -73.9999 -73.9908 -73.9821 -73.9538
end_station_id 185744 451.404 375.448 72 303 404 489 3002
dropoff_latitude 185744 40.7344 0.0195297 40.6803 40.7211 40.7362 40.7497 40.7715
dropoff_longitude 185744 -73.9911 0.0122976 -74.0171 -74 -73.9909 -73.9821 -73.9538
bikeid 185744 18157.5 2119.99 14529 16315 18164 20005 21679
gender 185744 1.0906 0.550519 0 1 1 1 2

In [11]:
data.head()


Out[11]:
trip_time pickup_datetime dropoff_datetime start_station_id start_station_name pickup_latitude pickup_longitude end_station_id end_station_name dropoff_latitude dropoff_longitude bikeid usertype birth year gender
0 00:04:49 2014-05-05 00:00:14 2014-05-05 00:05:03 495 W 47 St & 10 Ave 40.762699 -73.993012 469 Broadway & W 53 St 40.763441 -73.982681 17039 Subscriber 1986 1
1 00:17:31 2014-05-05 00:00:19 2014-05-05 00:17:50 281 Grand Army Plaza & Central Park S 40.764397 -73.973715 236 St Marks Pl & 2 Ave 40.728419 -73.987140 17875 Subscriber 1985 1
2 00:03:57 2014-05-05 00:01:05 2014-05-05 00:05:02 309 Murray St & West St 40.714979 -74.013012 3002 South End Ave & Liberty St 40.711512 -74.015756 18008 Customer \N 0
3 00:08:54 2014-05-05 00:01:12 2014-05-05 00:10:06 151 Cleveland Pl & Spring St 40.721816 -73.997203 345 W 13 St & 6 Ave 40.736494 -73.997044 18237 Subscriber 1983 1
4 00:04:35 2014-05-05 00:01:16 2014-05-05 00:05:51 173 Broadway & W 49 St 40.760647 -73.984427 449 W 52 St & 9 Ave 40.764618 -73.987895 16607 Subscriber 1985 1

In [12]:
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 185744 entries, 0 to 185743
Data columns (total 15 columns):
trip_time             185744 non-null timedelta64[ns]
pickup_datetime       185744 non-null datetime64[ns]
dropoff_datetime      185744 non-null datetime64[ns]
start_station_id      185744 non-null int64
start_station_name    185744 non-null object
pickup_latitude       185744 non-null float64
pickup_longitude      185744 non-null float64
end_station_id        185744 non-null int64
end_station_name      185744 non-null object
dropoff_latitude      185744 non-null float64
dropoff_longitude     185744 non-null float64
bikeid                185744 non-null int64
usertype              185744 non-null object
birth year            185744 non-null object
gender                185744 non-null int64
dtypes: datetime64[ns](2), float64(4), int64(4), object(4), timedelta64[ns](1)
memory usage: 22.7+ MB

Have a look at every station. How many pickups are there per station?


In [13]:
group_by_start = data.groupby(data.start_station_id, )
usage_freq = group_by_start.trip_time.count()
print('Amount of stations: ' + str(len(usage_freq)))
plt.hist(usage_freq, bins = 50)
plt.title('Frequency of starts from start stations')


Amount of stations: 326
Out[13]:
<matplotlib.text.Text at 0x7f7d1705b470>

When you take a look at the map, you won't be surprised that the Grand Central Station is the one with the most frequent usage.


In [14]:
group_by_start_name = data.groupby(data.start_station_name, )
usage_freq_name = group_by_start_name.trip_time.count()
usage_freq_name.sort_values().keys()[-10:]


Out[14]:
Index(['Greenwich Ave & 8 Ave', 'Broadway & W 24 St', '8 Ave & W 33 St',
       'Broadway & E 14 St', 'W 21 St & 6 Ave', 'West St & Chambers St',
       'E 17 St & Broadway', 'Lafayette St & E 8 St', '8 Ave & W 31 St',
       'E 42 St & Vanderbilt Ave'],
      dtype='object', name='start_station_name')

In [15]:
usage_freq_name.sort_values().keys()


Out[15]:
Index(['Clinton Ave & Flushing Ave', 'Railroad Ave & Kay Ave',
       'Hanover Pl & Livingston St', 'Park Ave & St Edwards St',
       'Monroe St & Classon Ave', 'Sands St & Navy St',
       'Bedford Ave & S 9th St', 'DeKalb Ave & Skillman St',
       'Carlton Ave & Park Ave', 'Gallatin Pl & Livingston St',
       ...
       'Greenwich Ave & 8 Ave', 'Broadway & W 24 St', '8 Ave & W 33 St',
       'Broadway & E 14 St', 'W 21 St & 6 Ave', 'West St & Chambers St',
       'E 17 St & Broadway', 'Lafayette St & E 8 St', '8 Ave & W 31 St',
       'E 42 St & Vanderbilt Ave'],
      dtype='object', name='start_station_name', length=326)

So we have 326 different start stations.


In [16]:
group_by_end_name = data.groupby(data.end_station_name, )
usage_freq_name = group_by_end_name.trip_time.count()
usage_freq_name.sort_values().keys()


Out[16]:
Index(['Railroad Ave & Kay Ave', 'Clinton Ave & Flushing Ave',
       'Greenwich St & N Moore St', 'Hanover Pl & Livingston St',
       'Park Ave & St Edwards St', 'Sands St & Navy St',
       'Monroe St & Classon Ave', 'Bedford Ave & S 9th St',
       'Carlton Ave & Park Ave', 'E 4 St & 2 Ave',
       ...
       'Broadway & W 24 St', 'Broadway & W 60 St', 'University Pl & E 14 St',
       'Greenwich Ave & 8 Ave', 'W 21 St & 6 Ave', 'E 42 St & Vanderbilt Ave',
       'West St & Chambers St', 'Lafayette St & E 8 St', 'E 17 St & Broadway',
       '8 Ave & W 31 St'],
      dtype='object', name='end_station_name', length=326)

We have 326 different end stations, too. But are all end_stations contained in start_stations and vive versa?


In [17]:
group_by_start_id = data.groupby(data.start_station_id, )
start_ids = group_by_start_id.trip_time.count().sort_values().keys()
group_by_end_id = data.groupby(data.end_station_id, )
end_ids = group_by_end_id.trip_time.count().sort_values().keys()
len(set(start_ids).intersection(end_ids))


Out[17]:
326

So this is the proof, that all stations are used as start and end stations.

Check for missing and false data


In [18]:
data.isnull().sum()


Out[18]:
trip_time             0
pickup_datetime       0
dropoff_datetime      0
start_station_id      0
start_station_name    0
pickup_latitude       0
pickup_longitude      0
end_station_id        0
end_station_name      0
dropoff_latitude      0
dropoff_longitude     0
bikeid                0
usertype              0
birth year            0
gender                0
dtype: int64

So there is not that much data missing. That's quite surprising, maybe it's wrong.


In [19]:
(data==0).sum()


Out[19]:
trip_time                 0
pickup_datetime           0
dropoff_datetime          0
start_station_id          0
start_station_name        0
pickup_latitude           0
pickup_longitude          0
end_station_id            0
end_station_name          0
dropoff_latitude          0
dropoff_longitude         0
bikeid                    0
usertype                  0
birth year                0
gender                20495
dtype: int64

So we only have many zeros in the gender-feature. Gender can have 3 values (0=unknown, 1=male, 2=female)


In [20]:
from collections import Counter
print(Counter(data.gender))
Counter(data['birth year'])


Counter({1: 127926, 2: 37323, 0: 20495})
Out[20]:
Counter({'1899': 17,
         '1900': 57,
         '1901': 20,
         '1907': 5,
         '1910': 17,
         '1921': 7,
         '1922': 2,
         '1924': 1,
         '1926': 3,
         '1927': 1,
         '1931': 1,
         '1932': 7,
         '1933': 4,
         '1934': 4,
         '1935': 25,
         '1936': 9,
         '1937': 14,
         '1938': 62,
         '1939': 23,
         '1940': 77,
         '1941': 141,
         '1942': 166,
         '1943': 107,
         '1944': 176,
         '1945': 261,
         '1946': 225,
         '1947': 349,
         '1948': 486,
         '1949': 446,
         '1950': 609,
         '1951': 884,
         '1952': 777,
         '1953': 1170,
         '1954': 1267,
         '1955': 1323,
         '1956': 1589,
         '1957': 1656,
         '1958': 1856,
         '1959': 2050,
         '1960': 2495,
         '1961': 2238,
         '1962': 2729,
         '1963': 2940,
         '1964': 3065,
         '1965': 2863,
         '1966': 3016,
         '1967': 3374,
         '1968': 3305,
         '1969': 3950,
         '1970': 4198,
         '1971': 3708,
         '1972': 3681,
         '1973': 3854,
         '1974': 4086,
         '1975': 3953,
         '1976': 4418,
         '1977': 4647,
         '1978': 5117,
         '1979': 5311,
         '1980': 5737,
         '1981': 6296,
         '1982': 6424,
         '1983': 6973,
         '1984': 7168,
         '1985': 7474,
         '1986': 6451,
         '1987': 6407,
         '1988': 5885,
         '1989': 5752,
         '1990': 4953,
         '1991': 2559,
         '1992': 1405,
         '1993': 1200,
         '1994': 684,
         '1995': 516,
         '1996': 299,
         '1997': 202,
         '1998': 27,
         '\\N': 20490})

So we see, we have also missing values in "gender" and "birth year". But we are not interested in this features, therefore we can ignore this missing values at this point.

Quick pverview about the trip_times


In [21]:
plt.hist((data['trip_time'] / np.timedelta64(1, 'm')), bins=30, range=[0, 100])
plt.title('Distribution of trip_time in minutes')
plt.xlabel('trip_time')
plt.ylabel('frequency')
plt.savefig('figures/bike_trip_time.png', format='png', dpi=300)


We are interested in the trip time in minutes.


In [22]:
sns.boxplot((data['trip_time'] / np.timedelta64(1, 'm')))


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7d188f6898>

We have lots of outliers.

So as we can see, we have many outliers.


In [23]:
data.trip_time.quantile([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])


Out[23]:
0.01          00:01:59
0.05          00:03:25
0.10          00:04:25
0.25          00:06:41
0.50          00:10:40
0.75          00:17:39
0.90          00:26:54
0.95          00:33:56
0.99   01:08:57.570000
Name: trip_time, dtype: timedelta64[ns]

In [24]:
len(data.trip_time.value_counts().values)


Out[24]:
5353

Identify the the cases without geo data and remove them from our data to be processed.


In [25]:
anomaly = data.loc[(data['dropoff_longitude'].isnull()) | (data['dropoff_latitude'].isnull()) | 
                       (data['pickup_longitude'].isnull()) | (data['pickup_latitude'].isnull())]
data = data.drop(anomaly.index)

In [26]:
anomaly['flag'] = 'geo_NA'

In [27]:
data.isnull().sum()


Out[27]:
trip_time             0
pickup_datetime       0
dropoff_datetime      0
start_station_id      0
start_station_name    0
pickup_latitude       0
pickup_longitude      0
end_station_id        0
end_station_name      0
dropoff_latitude      0
dropoff_longitude     0
bikeid                0
usertype              0
birth year            0
gender                0
dtype: int64

So how many percent of data are left to be processed?


In [28]:
len(data)/(len(data)+len(anomaly))


Out[28]:
1.0

In [29]:
anomaly.tail()


Out[29]:
trip_time pickup_datetime dropoff_datetime start_station_id start_station_name pickup_latitude pickup_longitude end_station_id end_station_name dropoff_latitude dropoff_longitude bikeid usertype birth year gender flag

So we dropped nothing because of missing geo tags. </font color>


In [30]:
plt.hist(data.trip_time.values / np.timedelta64(1, 'm'), bins=50, range=[0,100])


Out[30]:
(array([  1886.,  12429.,  23265.,  25474.,  22858.,  18987.,  14736.,
         11772.,   9414.,   7721.,   6635.,   5540.,   4620.,   3809.,
          3101.,   2403.,   1852.,   1427.,   1223.,    958.,    755.,
           558.,    395.,    337.,    307.,    247.,    231.,    180.,
           141.,    134.,    124.,    122.,    134.,     72.,     84.,
            67.,     79.,     63.,     59.,     71.,     61.,     59.,
            42.,     47.,     37.,     37.,     41.,     47.,     38.,
            43.]),
 array([   0.,    2.,    4.,    6.,    8.,   10.,   12.,   14.,   16.,
          18.,   20.,   22.,   24.,   26.,   28.,   30.,   32.,   34.,
          36.,   38.,   40.,   42.,   44.,   46.,   48.,   50.,   52.,
          54.,   56.,   58.,   60.,   62.,   64.,   66.,   68.,   70.,
          72.,   74.,   76.,   78.,   80.,   82.,   84.,   86.,   88.,
          90.,   92.,   94.,   96.,   98.,  100.]),
 <a list of 50 Patch objects>)

We sometimes have some unreasonably small trip_times.

Calculate the distance of a trip. Use the difference of lon / lat data and parse it to metrics.


In [31]:
data['trip_dist'] = -1 # Init trip_dist

Be aware, this operation is quite slow.


In [32]:
# inpout for vincenty:(location.latitude, location.longitude)
from geopy.distance import vincenty
for i in range(0, (len(data)-1)):
    pickup = (data.iloc[i]['pickup_latitude'], data.iloc[i]['pickup_longitude'])
    dropoff = (data.iloc[i]['dropoff_latitude'], data.iloc[i]['dropoff_longitude'])
    data.set_value(i, 'trip_dist', vincenty(pickup,dropoff).meters)

In [33]:
data.trip_dist


Out[33]:
0          876
1         4153
2          449
3         1630
4          529
5          760
6         1332
7         2297
8          689
9         1371
10        2358
11         558
12        4488
13        2146
14        2803
15        1291
16         446
17        1721
18        2882
19         674
20         927
21        3635
22        2168
23         887
24         810
25         706
26         810
27        1491
28        1003
29         935
          ... 
185714       0
185715     552
185716    2067
185717    1646
185718     535
185719     722
185720     366
185721     565
185722    1839
185723     809
185724       0
185725    1682
185726    1348
185727    1244
185728     739
185729    1314
185730    1154
185731    1694
185732    1896
185733    1061
185734    1208
185735     919
185736    1187
185737     285
185738     285
185739     831
185740     908
185741    1273
185742     906
185743      -1
Name: trip_dist, dtype: int64

We cannot use miles because vincenty ceils / floors the results


In [34]:
data.columns


Out[34]:
Index(['trip_time', 'pickup_datetime', 'dropoff_datetime', 'start_station_id',
       'start_station_name', 'pickup_latitude', 'pickup_longitude',
       'end_station_id', 'end_station_name', 'dropoff_latitude',
       'dropoff_longitude', 'bikeid', 'usertype', 'birth year', 'gender',
       'trip_dist'],
      dtype='object')

Check if new column is present


In [35]:
# data.to_csv('data/bike_20140505_with_dist.csv')

In [36]:
data['avg_velocity'] = data.trip_dist.values/(data.trip_time / (np.timedelta64(1, 'h')))

In [37]:
data.avg_velocity


Out[37]:
0         10912.110727
1         14225.309229
2          6820.253165
3         10988.764045
4          6925.090909
5          8658.227848
6         12787.200000
7         13713.432836
8          9119.117647
9          7466.868381
10         7867.284523
11         9611.483254
12        10785.580774
13         6966.275924
14         9628.625954
15        11199.036145
16         9122.727273
17        11388.970588
18        13963.930013
19         8543.661972
20        13565.853659
21        11061.707523
22        11086.363636
23         5357.718121
24         9112.500000
25         7241.025641
26         9286.624204
27         3029.119639
28        13035.379061
29         9402.234637
              ...     
185714        0.000000
185715     4445.637584
185716    11554.658385
185717     8417.045455
185718     8754.545455
185719     6203.341289
185720    11870.270270
185721     6647.058824
185722     8631.551499
185723     8321.142857
185724        0.000000
185725    12883.404255
185726    11637.410072
185727     8514.068441
185728     6964.397906
185729    13751.162791
185730    12861.919505
185731    11818.604651
185732    11376.000000
185733    10493.406593
185734    10090.023202
185735    10603.846154
185736    14102.970297
185737     4909.090909
185738     5428.571429
185739     7404.950495
185740    16763.076923
185741    13639.285714
185742     9678.338279
185743      -10.495627
Name: avg_velocity, dtype: float64

Parse to km/h


In [38]:
data.avg_velocity = data.avg_velocity/1000

In [39]:
data.avg_velocity


Out[39]:
0         10.912111
1         14.225309
2          6.820253
3         10.988764
4          6.925091
5          8.658228
6         12.787200
7         13.713433
8          9.119118
9          7.466868
10         7.867285
11         9.611483
12        10.785581
13         6.966276
14         9.628626
15        11.199036
16         9.122727
17        11.388971
18        13.963930
19         8.543662
20        13.565854
21        11.061708
22        11.086364
23         5.357718
24         9.112500
25         7.241026
26         9.286624
27         3.029120
28        13.035379
29         9.402235
            ...    
185714     0.000000
185715     4.445638
185716    11.554658
185717     8.417045
185718     8.754545
185719     6.203341
185720    11.870270
185721     6.647059
185722     8.631551
185723     8.321143
185724     0.000000
185725    12.883404
185726    11.637410
185727     8.514068
185728     6.964398
185729    13.751163
185730    12.861920
185731    11.818605
185732    11.376000
185733    10.493407
185734    10.090023
185735    10.603846
185736    14.102970
185737     4.909091
185738     5.428571
185739     7.404950
185740    16.763077
185741    13.639286
185742     9.678338
185743    -0.010496
Name: avg_velocity, dtype: float64

Convert avg_velocity from km/h to miles/h. (1km = 0,621371 miles)


In [40]:
data['avg_velocity'] = data.avg_velocity*0.627371

In [41]:
# data.to_csv('data/bike_20140505_with_dist_and_avg_velo.csv')

In [42]:
plt.hist(data.avg_velocity, bins=60, range=[0,15])
plt.title('Distribution of avg_velocity in mph')
plt.xlabel('avg_velocity')
plt.ylabel('frequency')
plt.savefig('figures/bike_avg_vel.png', format='png', dpi=300)



In [43]:
data.avg_velocity.describe() # in mph


Out[43]:
count    185744.000000
mean          5.149069
std           2.023218
min          -0.006585
25%           4.068203
50%           5.306131
75%           6.457503
max          29.006050
Name: avg_velocity, dtype: float64

In [44]:
data.head()


Out[44]:
trip_time pickup_datetime dropoff_datetime start_station_id start_station_name pickup_latitude pickup_longitude end_station_id end_station_name dropoff_latitude dropoff_longitude bikeid usertype birth year gender trip_dist avg_velocity
0 00:04:49 2014-05-05 00:00:14 2014-05-05 00:05:03 495 W 47 St & 10 Ave 40.762699 -73.993012 469 Broadway & W 53 St 40.763441 -73.982681 17039 Subscriber 1986 1 876 6.845942
1 00:17:31 2014-05-05 00:00:19 2014-05-05 00:17:50 281 Grand Army Plaza & Central Park S 40.764397 -73.973715 236 St Marks Pl & 2 Ave 40.728419 -73.987140 17875 Subscriber 1985 1 4153 8.924546
2 00:03:57 2014-05-05 00:01:05 2014-05-05 00:05:02 309 Murray St & West St 40.714979 -74.013012 3002 South End Ave & Liberty St 40.711512 -74.015756 18008 Customer \N 0 449 4.278829
3 00:08:54 2014-05-05 00:01:12 2014-05-05 00:10:06 151 Cleveland Pl & Spring St 40.721816 -73.997203 345 W 13 St & 6 Ave 40.736494 -73.997044 18237 Subscriber 1983 1 1630 6.894032
4 00:04:35 2014-05-05 00:01:16 2014-05-05 00:05:51 173 Broadway & W 49 St 40.760647 -73.984427 449 W 52 St & 9 Ave 40.764618 -73.987895 16607 Subscriber 1985 1 529 4.344601

In [45]:
data.avg_velocity.quantile([.0001,.01, .5, .75, .95, .975, .99, .995])


Out[45]:
0.0001     0.000000
0.0100     0.000000
0.5000     5.306131
0.7500     6.457503
0.9500     8.198881
0.9750     8.802942
0.9900     9.514220
0.9950    10.012335
Name: avg_velocity, dtype: float64

Be aware: We calculated the avg_velocity based on the vincenty distance (equals direct connection). A real driver would never drive that fast. We take all avg_velos >= 5mph (approx. >5%-quantile) into account. We use an upper bound, too. It might not be possible to drive "too fast". If someone drives too fast, the technology has had a malfunction.


In [46]:
lb = 5
ub = 15
anomaly = data.loc[(data['avg_velocity'] < lb) | (data['avg_velocity'] > ub)] 

# be careful! Maybe adjust to append.
#anomaly.loc[
anomaly.loc[data.loc[(data['avg_velocity'] < lb)].index,'flag'] = 'too_slow'
anomaly.loc[data.loc[(data['avg_velocity'] > ub)].index,'flag'] = 'too_fast'
data = data.drop(anomaly.index, errors='ignore') # ignore uncontained labels / indices
print(1-len(data)/(len(data)+len(anomaly)))


0.42908519252304245
/home/niklas/anaconda3/lib/python3.5/site-packages/pandas/core/indexing.py:284: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
/home/niklas/anaconda3/lib/python3.5/site-packages/pandas/core/indexing.py:461: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s

In [47]:
anomaly.head()


Out[47]:
trip_time pickup_datetime dropoff_datetime start_station_id start_station_name pickup_latitude pickup_longitude end_station_id end_station_name dropoff_latitude dropoff_longitude bikeid usertype birth year gender trip_dist avg_velocity flag
2 00:03:57 2014-05-05 00:01:05 2014-05-05 00:05:02 309 Murray St & West St 40.714979 -74.013012 3002 South End Ave & Liberty St 40.711512 -74.015756 18008 Customer \N 0 449 4.278829 too_slow
4 00:04:35 2014-05-05 00:01:16 2014-05-05 00:05:51 173 Broadway & W 49 St 40.760647 -73.984427 449 W 52 St & 9 Ave 40.764618 -73.987895 16607 Subscriber 1985 1 529 4.344601 too_slow
9 00:11:01 2014-05-05 00:02:24 2014-05-05 00:13:25 493 W 45 St & 6 Ave 40.756800 -73.982912 546 E 30 St & Park Ave S 40.744449 -73.983035 15186 Subscriber 1944 1 1371 4.684497 too_slow
10 00:17:59 2014-05-05 00:02:25 2014-05-05 00:20:24 393 E 5 St & Avenue C 40.722992 -73.979955 223 W 13 St & 7 Ave 40.737815 -73.999947 15038 Subscriber 1942 1 2358 4.935706 too_slow
13 00:18:29 2014-05-05 00:02:46 2014-05-05 00:21:15 394 E 9 St & Avenue C 40.725213 -73.977688 128 MacDougal St & Prince St 40.727103 -74.002971 18886 Subscriber 1966 1 2146 4.370439 too_slow

We dropped about 42% of the data!


In [48]:
data.avg_velocity.describe()


Out[48]:
count    106044.000000
mean          6.502750
std           1.133101
min           5.000057
25%           5.616168
50%           6.269582
75%           7.142086
max          14.099118
Name: avg_velocity, dtype: float64

In [49]:
anomaly.tail()


Out[49]:
trip_time pickup_datetime dropoff_datetime start_station_id start_station_name pickup_latitude pickup_longitude end_station_id end_station_name dropoff_latitude dropoff_longitude bikeid usertype birth year gender trip_dist avg_velocity flag
185728 00:06:22 2014-05-11 23:47:22 2014-05-11 23:53:44 308 St James Pl & Oliver St 40.713079 -73.998512 307 Canal St & Rutgers St 40.714275 -73.989900 19916 Subscriber 1983 1 739 4.369261 too_slow
185737 00:03:29 2014-05-11 23:50:54 2014-05-11 23:54:23 526 E 33 St & 5 Ave 40.747659 -73.984907 498 Broadway & W 32 St 40.748549 -73.988084 16197 Customer \N 0 285 3.079821 too_slow
185738 00:03:09 2014-05-11 23:51:22 2014-05-11 23:54:31 526 E 33 St & 5 Ave 40.747659 -73.984907 498 Broadway & W 32 St 40.748549 -73.988084 14714 Customer \N 0 285 3.405728 too_slow
185739 00:06:44 2014-05-11 23:52:29 2014-05-11 23:59:13 369 Washington Pl & 6 Ave 40.732241 -74.000264 293 Lafayette St & E 8 St 40.730287 -73.990765 14617 Subscriber 1962 1 831 4.645651 too_slow
185743 00:05:43 2014-05-11 23:53:44 2014-05-11 23:59:27 439 E 4 St & 2 Ave 40.726281 -73.989780 393 E 5 St & Avenue C 40.722992 -73.979955 15825 Customer \N 0 -1 -0.006585 too_slow

A little drawing


In [50]:
x = data.pickup_longitude
y = data.pickup_latitude

bins = 100;
H, xedges, yedges = np.histogram2d(x, y, bins=bins)

plt.jet()
fig = plt.figure(figsize=(20, 10))
#ax.set_title('pcolormesh: exact bin edges')
#mesh = plt.pcolormesh(X, Y, H)
plt.hist2d(x,y,bins=bins)
plt.colorbar()
plt.scatter(xedges[79], yedges[61], marker='x')
ax = fig.gca()
ax.grid(False)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Pickup Frequency')
#ax.set_aspect('equal')
#plt.savefig('figure.pdf', format='pdf')
#plt.savefig('figure.png', format='png')
plt.savefig('figures/bike_stations.png', format='png', dpi=300)


<matplotlib.figure.Figure at 0x7f7d17d57fd0>

Where is a pickup maximum?


In [51]:
np.amax(H)


Out[51]:
2001.0

In [52]:
np.where(H==2001)


Out[52]:
(array([61]), array([79]))

The area with the maximum pickups is around the Grand Central Terminal. Not very surprising.


In [53]:
yedges[79],xedges[61],


Out[53]:
(40.752374288829998, -73.978505949899997)

In [54]:
type(H)


Out[54]:
numpy.ndarray

In [55]:
print(H.shape)
print(H.size)
print(H.max())


(100, 100)
10000
2001.0

In [56]:
print('Current bin width:')
print(vincenty((xedges[0], yedges[0]), (xedges[1], yedges[1])).meters)


Current bin width:
76.03394282270413

In [57]:
distances = (-1)*np.ones(len(xedges)-1)
for x in range(0,len(xedges)-1, 1):
    distances[x] = vincenty((xedges[x], yedges[x]), (xedges[x+1], yedges[x+1])).meters

In [58]:
print('Sizes of all bins in meters:')
distances


Sizes of all bins in meters:
Out[58]:
array([ 76.03394282,  76.03433763,  76.03473245,  76.03512728,
        76.03552212,  76.03591697,  76.03631184,  76.03670672,
        76.03710161,  76.03749651,  76.03789143,  76.03828635,
        76.03868129,  76.03907624,  76.0394712 ,  76.03986618,
        76.04026116,  76.04065616,  76.04105117,  76.04144619,
        76.04184122,  76.04223627,  76.04263132,  76.04302639,
        76.04342148,  76.04381656,  76.04421167,  76.04460679,
        76.04500191,  76.04539706,  76.04579221,  76.04618737,
        76.04658255,  76.04697773,  76.04737294,  76.04776815,
        76.04816337,  76.04855861,  76.04895385,  76.04934911,
        76.04974438,  76.05013966,  76.05053496,  76.05093027,
        76.05132558,  76.05172091,  76.05211626,  76.05251161,
        76.05290698,  76.05330235,  76.05369774,  76.05409315,
        76.05448856,  76.05488398,  76.05527942,  76.05567487,
        76.05607033,  76.05646581,  76.05686129,  76.05725679,
        76.05765229,  76.05804782,  76.05844335,  76.05883889,
        76.05923445,  76.05963002,  76.0600256 ,  76.06042119,
        76.06081679,  76.06121241,  76.06160804,  76.06200367,
        76.06239932,  76.06279499,  76.06319066,  76.06358635,
        76.06398205,  76.06437776,  76.06477348,  76.06516922,
        76.06556496,  76.06596072,  76.06635649,  76.06675227,
        76.06714807,  76.06754387,  76.06793969,  76.06833552,
        76.06873136,  76.06912721,  76.06952308,  76.06991895,
        76.07031484,  76.07071074,  76.07110665,  76.07150258,
        76.07189851,  76.07229446,  76.07269042,  76.07308639])

In [59]:
(H==0).sum()/H.size


Out[59]:
0.96750000000000003

So we know that about 97% of the bins have 0 pickups in it. This was expected, because we have static stations only.

Only look at trips in a given bounding box


In [60]:
jfk_geodata = (40.641547, -73.778118)
ridgefield_geodata = (40.856406, -74.020642)
data_in_box = data.loc[(data['dropoff_latitude'] > jfk_geodata[0]) & 
                       (data['dropoff_longitude'] < jfk_geodata[1]) &
                       (data['dropoff_latitude'] < ridgefield_geodata[0]) & 
                       (data['dropoff_longitude'] > ridgefield_geodata[1]) & 
                       (data['pickup_latitude'] > jfk_geodata[0]) & 
                       (data['pickup_longitude'] < jfk_geodata[1]) &
                       (data['pickup_latitude'] < ridgefield_geodata[0]) & 
                       (data['pickup_longitude'] > ridgefield_geodata[1])         
                       ]
# taxidata = taxidata.drop(anomaly.index)

In [61]:
len(data_in_box)/len(data)


Out[61]:
1.0

We lost no trips through the bounding box.


In [62]:
data_in_box.head()


Out[62]:
trip_time pickup_datetime dropoff_datetime start_station_id start_station_name pickup_latitude pickup_longitude end_station_id end_station_name dropoff_latitude dropoff_longitude bikeid usertype birth year gender trip_dist avg_velocity
0 00:04:49 2014-05-05 00:00:14 2014-05-05 00:05:03 495 W 47 St & 10 Ave 40.762699 -73.993012 469 Broadway & W 53 St 40.763441 -73.982681 17039 Subscriber 1986 1 876 6.845942
1 00:17:31 2014-05-05 00:00:19 2014-05-05 00:17:50 281 Grand Army Plaza & Central Park S 40.764397 -73.973715 236 St Marks Pl & 2 Ave 40.728419 -73.987140 17875 Subscriber 1985 1 4153 8.924546
3 00:08:54 2014-05-05 00:01:12 2014-05-05 00:10:06 151 Cleveland Pl & Spring St 40.721816 -73.997203 345 W 13 St & 6 Ave 40.736494 -73.997044 18237 Subscriber 1983 1 1630 6.894032
5 00:05:16 2014-05-05 00:01:22 2014-05-05 00:06:38 294 Washington Square E 40.730494 -73.995721 237 E 11 St & 2 Ave 40.730473 -73.986724 15545 Subscriber 1983 2 760 5.431921
6 00:06:15 2014-05-05 00:01:48 2014-05-05 00:08:03 346 Bank St & Hudson St 40.736529 -74.006180 375 Mercer St & Bleecker St 40.726795 -73.996951 18794 Subscriber 1973 1 1332 8.022318

Let's take a first look at the distribution of the cleaned target variable which we want to estimate:


In [63]:
h = data_in_box.avg_velocity.values
plt.figure(figsize=(20,10))
plt.hist(h, normed=False,  bins=150)
    #, histtype='stepfilled')
#plt.yscale('log')
#plt.ylabel('log(freq x)', fontsize=40)
#plt.xlabel('x = avg_amount_per_minute', fontsize=40)
#print('Min:' +  str(min(h)) + '\nMax:' +  str(max(h)))
#plt.locator_params(axis = 'x', nbins = 20)
plt.show()



In [64]:
data_in_box.head()


Out[64]:
trip_time pickup_datetime dropoff_datetime start_station_id start_station_name pickup_latitude pickup_longitude end_station_id end_station_name dropoff_latitude dropoff_longitude bikeid usertype birth year gender trip_dist avg_velocity
0 00:04:49 2014-05-05 00:00:14 2014-05-05 00:05:03 495 W 47 St & 10 Ave 40.762699 -73.993012 469 Broadway & W 53 St 40.763441 -73.982681 17039 Subscriber 1986 1 876 6.845942
1 00:17:31 2014-05-05 00:00:19 2014-05-05 00:17:50 281 Grand Army Plaza & Central Park S 40.764397 -73.973715 236 St Marks Pl & 2 Ave 40.728419 -73.987140 17875 Subscriber 1985 1 4153 8.924546
3 00:08:54 2014-05-05 00:01:12 2014-05-05 00:10:06 151 Cleveland Pl & Spring St 40.721816 -73.997203 345 W 13 St & 6 Ave 40.736494 -73.997044 18237 Subscriber 1983 1 1630 6.894032
5 00:05:16 2014-05-05 00:01:22 2014-05-05 00:06:38 294 Washington Square E 40.730494 -73.995721 237 E 11 St & 2 Ave 40.730473 -73.986724 15545 Subscriber 1983 2 760 5.431921
6 00:06:15 2014-05-05 00:01:48 2014-05-05 00:08:03 346 Bank St & Hudson St 40.736529 -74.006180 375 Mercer St & Bleecker St 40.726795 -73.996951 18794 Subscriber 1973 1 1332 8.022318

Make a new dataframe with features and targets


In [65]:
time_regression_df = pd.DataFrame([ #data_in_box['pickup_datetime'].dt.day, # leave this out
                          data_in_box['pickup_datetime'].dt.dayofweek,
                          data_in_box['pickup_datetime'].dt.hour,
                          data_in_box['pickup_latitude'],
                          data_in_box['pickup_longitude'],
                          data_in_box['dropoff_latitude'],
                          data_in_box['dropoff_longitude'],
                          np.ceil(data_in_box['avg_velocity'])
                         ]).T

In [66]:
time_regression_df.columns = [#'pickup_datetime_day', 
                              'pickup_datetime_dayofweek', 'pickup_datetime_hour',
                                 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
                                 'avg_velocity_mph']

Use minutes for prediction instead of seconds (ceil the time). Definitley more robust than seconds!


In [67]:
time_regression_df.head()


Out[67]:
pickup_datetime_dayofweek pickup_datetime_hour pickup_latitude pickup_longitude dropoff_latitude dropoff_longitude avg_velocity_mph
0 0.0 0.0 40.762699 -73.993012 40.763441 -73.982681 7.0
1 0.0 0.0 40.764397 -73.973715 40.728419 -73.987140 9.0
3 0.0 0.0 40.721816 -73.997203 40.736494 -73.997044 7.0
5 0.0 0.0 40.730494 -73.995721 40.730473 -73.986724 6.0
6 0.0 0.0 40.736529 -74.006180 40.726795 -73.996951 9.0

In [68]:
time_regression_df.tail()


Out[68]:
pickup_datetime_dayofweek pickup_datetime_hour pickup_latitude pickup_longitude dropoff_latitude dropoff_longitude avg_velocity_mph
185735 6.0 23.0 40.692395 -73.993379 40.688070 -73.984106 7.0
185736 6.0 23.0 40.707065 -74.007319 40.717488 -74.010455 9.0
185740 6.0 23.0 40.729170 -73.998102 40.721655 -74.002347 11.0
185741 6.0 23.0 40.734232 -73.986923 40.742909 -73.977061 9.0
185742 6.0 23.0 40.726281 -73.989780 40.722992 -73.979955 7.0

In [69]:
time_regression_df.ix[:,0:7].describe()


Out[69]:
pickup_datetime_dayofweek pickup_datetime_hour pickup_latitude pickup_longitude dropoff_latitude dropoff_longitude avg_velocity_mph
count 106044.000000 106044.000000 106044.000000 106044.000000 106044.000000 106044.000000 106044.000000
mean 2.603636 13.969918 40.735199 -73.990941 40.734848 -73.991118 7.036636
std 1.972009 4.992480 0.019369 0.012164 0.019295 0.012357 1.125662
min 0.000000 0.000000 40.680342 -74.017134 40.680342 -74.017134 6.000000
25% 1.000000 9.000000 40.722174 -73.999733 40.722174 -74.000264 6.000000
50% 2.000000 15.000000 40.737050 -73.990765 40.737050 -73.990931 7.000000
75% 4.000000 18.000000 40.750200 -73.981948 40.749718 -73.982050 8.000000
max 6.000000 23.000000 40.771522 -73.953809 40.771522 -73.953809 15.000000

In [70]:
print(time_regression_df.avg_velocity_mph.value_counts())
print(len(time_regression_df.avg_velocity_mph.value_counts()))


6.0     42576
7.0     33360
8.0     18609
9.0      7849
10.0     2705
11.0      740
12.0      162
13.0       34
14.0        7
15.0        2
Name: avg_velocity_mph, dtype: int64
10

So we hace 10 different velocities to predict.

Split the data into a training dataset and a test dataset. Evaluate the performance of the decision tree on the test data


In [71]:
from sklearn import cross_validation as cv
y = time_regression_df['avg_velocity_mph']
X = time_regression_df.ix[:,0:6]
X_train, X_test, y_train, y_test = cv.train_test_split(X, y,test_size=0.05,random_state=0)

In [72]:
from sklearn import cross_validation as cv
time_regression_df_train, time_regression_df_test = cv.train_test_split(time_regression_df, test_size=0.05, random_state=99)
y_train = time_regression_df_train['avg_velocity_mph']
x_train = time_regression_df_train.ix[:, 0:6]
y_test = time_regression_df_test['avg_velocity_mph']
x_test = time_regression_df_test.ix[:, 0:6]

Changed test size to 5%


In [73]:
x_test.head()


Out[73]:
pickup_datetime_dayofweek pickup_datetime_hour pickup_latitude pickup_longitude dropoff_latitude dropoff_longitude
38284 1.0 9.0 40.760203 -73.964785 40.756014 -73.967416
129690 4.0 17.0 40.757952 -73.977876 40.754557 -73.965930
109201 3.0 19.0 40.741444 -73.975361 40.725029 -73.990697
120605 4.0 11.0 40.722293 -73.991475 40.747659 -73.984907
89672 2.0 19.0 40.686919 -73.976682 40.683178 -73.965964

In [74]:
y_test.head()


Out[74]:
38284     7.0
129690    8.0
109201    8.0
120605    7.0
89672     6.0
Name: avg_velocity_mph, dtype: float64

In [75]:
xy_test = pd.concat([x_test, y_test], axis=1)

In [76]:
xy_test.head()


Out[76]:
pickup_datetime_dayofweek pickup_datetime_hour pickup_latitude pickup_longitude dropoff_latitude dropoff_longitude avg_velocity_mph
38284 1.0 9.0 40.760203 -73.964785 40.756014 -73.967416 7.0
129690 4.0 17.0 40.757952 -73.977876 40.754557 -73.965930 8.0
109201 3.0 19.0 40.741444 -73.975361 40.725029 -73.990697 8.0
120605 4.0 11.0 40.722293 -73.991475 40.747659 -73.984907 7.0
89672 2.0 19.0 40.686919 -73.976682 40.683178 -73.965964 6.0

If you want to export something...


In [77]:
#Xy_test.to_csv('taxi_tree_test_Xy_20130506-12.csv')
#X_test.to_csv('taxi_tree_test_X_20130506-12.csv')
#y_test.to_csv('taxi_tree_test_y_20130506-12.csv')
# Xy_test.to_csv('bike_tree_test_Xy_20140505-11.csv')
# X_test.to_csv('bike_tree_test_X_20140505-11.csv')
# y_test.to_csv('bike_tree_test_y_20140506-11.csv')

In [78]:
# Xy_test_sample = Xy_test.sample(10000, random_state=99)

In [79]:
# Xy_test_sample.to_csv('taxi_tree_test_Xy_sample.csv')

In [80]:
# Xy_test_sample.head()

Just to be sure


In [81]:
print(x_train.shape)
print(x_train.size)
print(x_test.shape)
print(X.shape)
print(x_train.shape[0]+x_test.shape[0])


(100741, 6)
604446
(5303, 6)
(106044, 6)
106044

In [82]:
import time
# Import the necessary modules and libraries
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import matplotlib.pyplot as plt

In [83]:
regtree = DecisionTreeRegressor(min_samples_split=100, random_state=99, max_depth=20)# formerly 15. 15 is reasonable
                                                                                     # random states: 99
regtree.fit(x_train, y_train)

regtree.score(x_test, y_test)


Out[83]:
0.11912472409623709

In [84]:
y_pred = regtree.predict(x_test)
diff_regtree = (y_pred-y_test)
# plt.figure(figsize=(12,10)) # not needed. set values globally
plt.hist(diff_regtree.values, bins=100, range=[-6,6])
print('Perzentile(%): ', [1,5,10,15,25,50,75,90,95,99], '\n', np.percentile(diff_regtree.values, [1,5,10,15,25,50,75,90,95,99]))
print('Absolute time deviation (in 1k): ', sum(abs(diff_regtree))/1000)
plt.title('Simple Decision Tree Regressor')
plt.xlabel('deviation in mph')
plt.ylabel('frequency')
plt.savefig('figures/bike_tree.png', format='png', dpi=300)


Perzentile(%):  [1, 5, 10, 15, 25, 50, 75, 90, 95, 99] 
 [-3.10410581 -1.99446494 -1.4        -1.13986047 -0.56306818  0.19925743
  0.74845592  1.16598639  1.41176471  1.9564538 ]
Absolute time deviation (in 1k):  4.488221403

In [85]:
diff_regtree.describe()


Out[85]:
count    5303.000000
mean        0.003596
std         1.065819
min        -7.033333
25%        -0.563068
50%         0.199257
75%         0.748456
max         4.125000
Name: avg_velocity_mph, dtype: float64

So this is a bad result. Let's try random forest now.

Train several random forests and look at their scores


In [86]:
from sklearn.ensemble import RandomForestRegressor
n_est_list = list(range(2,32,8))
min_sam_leaf_list = [2,50,100,150]
max_depth_list = list(range(2,32,8))
results = np.empty([0,4])
for n_est in n_est_list:
    for min_sam_leaf in min_sam_leaf_list:
        for max_depth in max_depth_list:
            rd_regtree = RandomForestRegressor(n_estimators=n_est,n_jobs=6,min_samples_leaf=min_sam_leaf, random_state=99, max_depth=max_depth)
            rd_regtree.fit(x_train, y_train)
            score = rd_regtree.score(x_test, y_test)
            results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))

In [87]:
best = np.where(results == max(results[:,3]))[0]
results[best,:]


Out[87]:
array([[ 26.        ,   2.        ,  26.        ,   0.19979136]])

Watch at the best results.


In [88]:
results[np.argsort(results[:, 3])][-10:,:]


Out[88]:
array([[ 26.        ,  50.        ,  18.        ,   0.16563993],
       [ 10.        ,  50.        ,  26.        ,   0.16709339],
       [ 18.        ,  50.        ,  26.        ,   0.17044882],
       [ 26.        ,  50.        ,  26.        ,   0.17190602],
       [ 10.        ,   2.        ,  26.        ,   0.17376701],
       [ 10.        ,   2.        ,  18.        ,   0.18050883],
       [ 18.        ,   2.        ,  18.        ,   0.18941467],
       [ 18.        ,   2.        ,  26.        ,   0.19121346],
       [ 26.        ,   2.        ,  18.        ,   0.196679  ],
       [ 26.        ,   2.        ,  26.        ,   0.19979136]])

In [89]:
results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))
#results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))
#results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))
results[:,3]


Out[89]:
array([ 0.03216565,  0.10243604,  0.05117017, -0.04134117,  0.03216565,
        0.10892762,  0.1338695 ,  0.13916637,  0.03216565,  0.10622744,
        0.12186341,  0.12444769,  0.03216565,  0.10673246,  0.12119634,
        0.12550215,  0.03273003,  0.14140097,  0.18050883,  0.17376701,
        0.03273003,  0.13053573,  0.16216698,  0.16709339,  0.03273003,
        0.12165977,  0.14658679,  0.14955706,  0.03273003,  0.11580282,
        0.13614928,  0.13957835,  0.03388217,  0.14354701,  0.18941467,
        0.19121346,  0.03388217,  0.13127777,  0.164049  ,  0.17044882,
        0.03388217,  0.1225744 ,  0.14795706,  0.15214733,  0.03388217,
        0.11604073,  0.13708992,  0.14088194,  0.03405952,  0.14412542,
        0.196679  ,  0.19979136,  0.03405952,  0.13139713,  0.16563993,
        0.17190602,  0.03405952,  0.12201824,  0.14815349,  0.15185501,
        0.03405952,  0.11595707,  0.13762175,  0.14100002,  0.14100002])

Train the best random forest again seperately


In [90]:
from sklearn.ensemble import RandomForestRegressor

#rd_regtree = RandomForestRegressor(n_estimators=10,n_jobs=6,min_samples_leaf=4, random_state=99, max_depth=20)
rd_regtree = RandomForestRegressor(n_estimators=26,n_jobs=6,min_samples_leaf=2, random_state=99, max_depth=26)
#total sum of diff: 1132
#rd_regtree = RandomForestRegressor(n_estimators=40,n_jobs=-1,min_samples_split=3, random_state=99, max_depth=11)
#total sum of diff: 1129
rd_regtree.fit(x_train, y_train)
print('R²: ', rd_regtree.score(x_test, y_test))


R²:  0.199791363355

In [91]:
rd_regtree.feature_importances_


Out[91]:
array([ 0.12104463,  0.17781704,  0.18620672,  0.16556891,  0.1811653 ,
        0.16819739])

In [92]:
tree_list = rd_regtree.estimators_
for i in range(0,len(tree_list)):
    print((tree_list[i].feature_importances_))


[ 0.12002358  0.17554296  0.19442978  0.17023113  0.17408755  0.165685  ]
[ 0.11671855  0.17003695  0.18261556  0.18056734  0.1770114   0.1730502 ]
[ 0.1212774   0.18594576  0.17974536  0.17083764  0.1721192   0.17007464]
[ 0.11802844  0.15425361  0.19237926  0.16592169  0.19164436  0.17777264]
[ 0.12216613  0.17483906  0.18325801  0.16611135  0.18617242  0.16745304]
[ 0.12681396  0.17850674  0.18685254  0.16952322  0.16825109  0.17005246]
[ 0.11894376  0.18112627  0.18000023  0.15378966  0.18938084  0.17675924]
[ 0.11689716  0.17969124  0.18616261  0.16124365  0.18873165  0.16727368]
[ 0.12177354  0.17866025  0.18311643  0.17045256  0.17980709  0.16619014]
[ 0.12564728  0.17346771  0.18699963  0.16579276  0.17909566  0.16899696]
[ 0.12313096  0.18269516  0.18784668  0.15912749  0.1811488   0.16605092]
[ 0.11699844  0.18513238  0.18102747  0.16180719  0.17907128  0.17596324]
[ 0.11389793  0.17649736  0.18380501  0.16473432  0.1891829   0.17188249]
[ 0.12245613  0.18804531  0.17190083  0.15312974  0.1972377   0.16723029]
[ 0.12325887  0.17636399  0.19488524  0.1681122   0.17508856  0.16229114]
[ 0.12444665  0.16925641  0.19029348  0.16701825  0.18408435  0.16490086]
[ 0.12772183  0.17755013  0.18233794  0.15931925  0.17945075  0.17362011]
[ 0.11240302  0.1714663   0.18531471  0.1675688   0.19294621  0.17030095]
[ 0.12207764  0.18165057  0.19147116  0.15734331  0.18742885  0.16002846]
[ 0.1191304   0.17872016  0.19996281  0.16541011  0.17825426  0.15852225]
[ 0.11899421  0.17741194  0.19219335  0.16779293  0.17935797  0.1642496 ]
[ 0.13382566  0.18857068  0.18455538  0.16171965  0.16911262  0.16221601]
[ 0.12470386  0.17830761  0.18863256  0.15838186  0.17815828  0.17181583]
[ 0.11868714  0.1857741   0.19271346  0.17088068  0.16985072  0.16209391]
[ 0.11488257  0.1810187   0.17688767  0.18183626  0.17562905  0.16974575]
[ 0.12225527  0.1727118   0.18198743  0.16613871  0.18799431  0.16891248]

In [93]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(results[:,0],results[:,2],results[:,3], marker='o')
plt.show()


So this plot is not particularly useful in this case. But it's a hinto to how to visualize the results.


In [94]:
y_pred = rd_regtree.predict(x_test)
np.linalg.norm(np.ceil(y_pred)-y_test)
diff_rd = (y_pred-y_test)
# plt.figure(figsize=(12,10)) # not needed. set values globally
plt.hist(diff_rd.values, bins=100, range=[-6,6])
print('Perzentile(%): ', [1,5,10,15,25,50,75,90,95,99], '\n', np.percentile(diff_rd.values, [1,5,10,15,25,50,75,90,95,99]))
print('Absolute time deviation (in 1k): ', sum(abs(diff_rd))/1000)
plt.title('Random Forest Regressor')
plt.xlabel('deviation in mph')
plt.ylabel('frequency')
plt.savefig('figures/bike_randomforest.png', format='png', dpi=150)


Perzentile(%):  [1, 5, 10, 15, 25, 50, 75, 90, 95, 99] 
 [-2.92611081 -1.84347799 -1.34595238 -1.06165912 -0.51531681  0.22179487
  0.71258853  1.08463999  1.38434066  1.94501121]
Absolute time deviation (in 1k):  4.22660235146

In [95]:
diff_rd.describe()


Out[95]:
count    5303.000000
mean        0.021278
std         1.015629
min        -7.083869
25%        -0.515317
50%         0.221795
75%         0.712589
max         4.156410
Name: avg_velocity_mph, dtype: float64

Dump the best random forest...


In [99]:
from sklearn.externals import joblib
joblib.dump(rd_regtree, 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl', protocol=2)


Out[99]:
['randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_01.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_02.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_03.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_04.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_05.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_06.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_07.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_08.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_09.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_10.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_11.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_12.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_13.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_14.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_15.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_16.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_17.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_18.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_19.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_20.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_21.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_22.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_23.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_24.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_25.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_26.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_27.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_28.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_29.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_30.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_31.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_32.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_33.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_34.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_35.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_36.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_37.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_38.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_39.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_40.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_41.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_42.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_43.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_44.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_45.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_46.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_47.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_48.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_49.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_50.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_51.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_52.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_53.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_54.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_55.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_56.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_57.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_58.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_59.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_60.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_61.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_62.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_63.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_64.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_65.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_66.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_67.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_68.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_69.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_70.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_71.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_72.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_73.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_74.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_75.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_76.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_77.npy',
 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl_78.npy']

In [100]:
! cd randforlib && zip bike_regtree_26x_depth_26_mss_2_PY27.zip bike_regtree_26x_depth_26_mss_2_PY27.pkl*


  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl (deflated 76%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_01.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_02.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_03.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_04.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_05.npy (deflated 81%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_06.npy (deflated 73%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_07.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_08.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_09.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_10.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_11.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_12.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_13.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_14.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_15.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_16.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_17.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_18.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_19.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_20.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_21.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_22.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_23.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_24.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_25.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_26.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_27.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_28.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_29.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_30.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_31.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_32.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_33.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_34.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_35.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_36.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_37.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_38.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_39.npy (deflated 73%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_40.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_41.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_42.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_43.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_44.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_45.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_46.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_47.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_48.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_49.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_50.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_51.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_52.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_53.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_54.npy (deflated 73%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_55.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_56.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_57.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_58.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_59.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_60.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_61.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_62.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_63.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_64.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_65.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_66.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_67.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_68.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_69.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_70.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_71.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_72.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_73.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_74.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_75.npy (deflated 74%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_76.npy (deflated 19%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_77.npy (deflated 82%)
  adding: bike_regtree_26x_depth_26_mss_2_PY27.pkl_78.npy (deflated 74%)