In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from statsmodels.distributions.empirical_distribution import ECDF
import datetime as dt
plt.style.use('seaborn-whitegrid')
plt.rcParams['image.cmap'] = 'Blues'  # 'blue' is not a registered colormap
#sns.set_context('notebook',font_scale=2)
sns.set_style("whitegrid")
%matplotlib inline
labelsize = 22
mpl.rcParams.update({'font.size': labelsize})
mpl.rcParams.update({'figure.figsize': (20,10)})
mpl.rcParams.update({'axes.titlesize': 'large'})
mpl.rcParams.update({'axes.labelsize': 'large'})
mpl.rcParams.update({'xtick.labelsize': labelsize})
mpl.rcParams.update({'ytick.labelsize': labelsize})
# mpl.rcParams.keys()
In [2]:
!cd data && ls
Use bash to have a look at the data directory =)
In [3]:
data = pd.read_csv('data/Taxi_from_2013-05-06_to_2013-05-13.csv', index_col=0, parse_dates=True)
In [4]:
data.info()
So date parsing on load did not work; convert the datetime columns manually:
In [5]:
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
data['dropoff_datetime'] = pd.to_datetime(data['dropoff_datetime'], format='%Y-%m-%d %H:%M:%S')
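Alternatively, a minimal sketch (assuming the same CSV layout): name the two columns in parse_dates so they are parsed while reading, since parse_dates=True together with index_col=0 only parses the index.
# Sketch: parse the datetime columns at load time instead of converting them afterwards.
data = pd.read_csv('data/Taxi_from_2013-05-06_to_2013-05-13.csv', index_col=0,
                   parse_dates=['pickup_datetime', 'dropoff_datetime'])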
In [6]:
data.describe().transpose()
Out[6]:
In [7]:
data.head()
Out[7]:
In [8]:
payments = data.payment_type.value_counts()
Some statistics about the payment types:
In [9]:
payments/len(data)
Out[9]:
How many trips are affected by tolls?
In [10]:
data.tolls_amount.value_counts()/len(data)
Out[10]:
So about 95% of the trips involve no tolls. We will drop the column then.
We are not interested in the following features (they do not add any further information):
In [11]:
data = data.drop(['vendor_id', 'rate_code', 'store_and_fwd_flag','payment_type','mta_tax', 'tolls_amount',
'surcharge'], axis=1)
In [12]:
data.describe().transpose()
Out[12]:
In [13]:
data['trip_time']=data.dropoff_datetime-data.pickup_datetime
data.head()
Out[13]:
In [14]:
data.info()
In [15]:
data.isnull().sum()
Out[15]:
So there is not much data missing. That is surprisingly little; perhaps missing values are hiding behind a different encoding, such as zeros.
In [16]:
(data==0).sum()
Out[16]:
So there are many zeros in the data. What fraction of the rows does that affect?
In [17]:
(data==0).sum()/len(data)
Out[17]:
In [18]:
data = data.replace(np.float64(0), np.nan);
In [19]:
data.isnull().sum()
Out[19]:
A quick preview of the trip times:
In [20]:
trip_times_in_minutes = data['trip_time'] / np.timedelta64(1, 'm')
plt.hist(trip_times_in_minutes , bins=30, range=[0, 60],
weights=np.zeros_like(trip_times_in_minutes) + 1. / trip_times_in_minutes.size)
#plt.yscale('log')
print(trip_times_in_minutes.quantile(q=[0.025, 0.5, 0.75, 0.95, 0.975, 0.99]))
plt.xlabel('Trip Time in Minutes')
plt.ylabel('Relative Frequency')
plt.title('Distribution of Trip Time')
plt.savefig('figures/trip_time_distribution.eps', format='eps', dpi=1000)
In [21]:
len(data.trip_time.value_counts().values)
Out[21]:
That is how many unique values trip_time has.
Identify the cases without geo data and remove them from the data to be processed.
In [22]:
anomaly = data.loc[(data['dropoff_longitude'].isnull()) | (data['dropoff_latitude'].isnull()) |
(data['pickup_longitude'].isnull()) | (data['pickup_latitude'].isnull())]
data = data.drop(anomaly.index)
In [23]:
anomaly['flag'] = 'geo_NA'
In [24]:
data.isnull().sum()
Out[24]:
So what percentage of the data is left to be processed?
In [25]:
len(data)/(len(data)+len(anomaly))
Out[25]:
In [26]:
anomaly.tail()
Out[26]:
In [27]:
anomaly = anomaly.append(data.loc[(data['trip_distance'].isnull())])
anomaly.loc[data.loc[(data['trip_distance'].isnull())].index,'flag'] = 'trip_dist_NA'
anomaly.tail()
Out[27]:
In [28]:
data = data.drop(anomaly.index, errors='ignore') # ignore uncontained labels
In [29]:
data.isnull().sum()
Out[29]:
In [30]:
1-len(data)/(len(data)+len(anomaly))
Out[30]:
In [31]:
anomaly = anomaly.append(data.loc[(data['trip_time'].isnull())])
anomaly.loc[data.loc[(data['trip_time'].isnull())].index,'flag'] = 'trip_time_NA'
anomaly.tail()
Out[31]:
In [32]:
data = data.drop(anomaly.index, errors='ignore') # ignore uncontained labels
In [33]:
data.describe().transpose()
Out[33]:
In [34]:
plt.hist(data.trip_time.values / np.timedelta64(1, 'm'), bins=50, range=[0,100])
Out[34]:
In [35]:
print(data.trip_time.describe())
np.percentile(data.trip_time, [1,5,10,15,25,50,75,85,95,99]) / np.timedelta64(1,'m')
Out[35]:
In [36]:
anomaly.tail()
Out[36]:
In [37]:
1-len(data)/(len(data)+len(anomaly))
Out[37]:
In [38]:
data.isnull().sum()
Out[38]:
Compute the average fare amount per minute, correcting the fare for the $2.50 initial charge.
In [39]:
data['avg_amount_per_minute'] = (data.fare_amount-2.5) / (data.trip_time / np.timedelta64(1,'m'))
In [40]:
data.avg_amount_per_minute.describe()
Out[40]:
Distribution of the avg_amount_per_minute
In [41]:
h = data.avg_amount_per_minute
plt.figure(figsize=(20,10))
plt.hist(h, normed=False, stacked=True, bins=40, range=[0, 100])
#, histtype='stepfilled')
plt.yscale('log')
plt.ylabel('log(freq x)', fontsize=40)
plt.xlabel('x = avg_amount_per_minute', fontsize=40)
print('Min:' + str(min(h)) + '\nMax:' + str(max(h)))
plt.yticks(fontsize=40)
plt.xticks(fontsize=40)
plt.locator_params(axis = 'x', nbins = 20)
plt.show()
In [42]:
data.head()
Out[42]:
In [43]:
data.avg_amount_per_minute.quantile([.0001,.01, .5, .75, .95, .975, .99, .995])
Out[43]:
In [44]:
lb = 0.5
ub = 2.5
anomaly = anomaly.append(data.loc[(data['avg_amount_per_minute'] > ub) |
(data['avg_amount_per_minute'] < lb)])
anomaly.loc[data.loc[(data['avg_amount_per_minute'] > ub)].index,'flag'] = 'too fast'
anomaly.loc[data.loc[(data['avg_amount_per_minute'] < lb)].index,'flag'] = 'too slow'
data = data.drop(anomaly.index, errors='ignore') # ignore uncontained labels / indices
print(1-len(data)/(len(data)+len(anomaly)))
So we dropped around 6% of the data.
In [45]:
data.avg_amount_per_minute.describe()
Out[45]:
In [46]:
anomaly.tail()
Out[46]:
In [47]:
jfk_geodata = (40.641547, -73.778118)
ridgefield_geodata = (40.856406, -74.020642)
data_in_box = data.loc[(data['dropoff_latitude'] > jfk_geodata[0]) &
(data['dropoff_longitude'] < jfk_geodata[1]) &
(data['dropoff_latitude'] < ridgefield_geodata[0]) &
(data['dropoff_longitude'] > ridgefield_geodata[1]) &
(data['pickup_latitude'] > jfk_geodata[0]) &
(data['pickup_longitude'] < jfk_geodata[1]) &
(data['pickup_latitude'] < ridgefield_geodata[0]) &
(data['pickup_longitude'] > ridgefield_geodata[1])
]
# taxidata = taxidata.drop(anomaly.index)
In [48]:
data_in_box.head()
Out[48]:
In [49]:
print(jfk_geodata < ridgefield_geodata,
len(data_in_box)/len(data))
So we have omitted about 2% of the data because those trips do not both start and end inside the box.
In [50]:
x = data_in_box.pickup_longitude
y = data_in_box.pickup_latitude
plt.jet()
H, xedges, yedges = np.histogram2d(x, y, bins=300)#, normed=False, weights=None)
fig = plt.figure(figsize=(20, 10))
plt.hist2d(x, y, bins=300, range=[[min(x.values),-73.95],[40.675,40.8]])
plt.colorbar()
plt.title('Pickup density (first full week in May 2013)')
plt.ylabel('Latitude')
plt.xlabel('Longitude')
ax = fig.gca()
ax.grid(False)
# plt.savefig('figures/pickup_density_manhattan_13.png', format='png', dpi=150)
In [51]:
h = data_in_box.trip_time.values / np.timedelta64(1, 'm')
plt.hist(h, normed=False, bins=150)
plt.yticks(fontsize=40)
plt.xticks(fontsize=40)
plt.show()
In [52]:
data_in_box.head()
Out[52]:
In [53]:
time_regression_df = pd.DataFrame([#data_in_box['pickup_datetime'].dt.day,
data_in_box['pickup_datetime'].dt.dayofweek,
data_in_box['pickup_datetime'].dt.hour,
data_in_box['pickup_latitude'],
data_in_box['pickup_longitude'],
data_in_box['dropoff_latitude'],
data_in_box['dropoff_longitude'],
np.ceil(data_in_box['trip_time']/np.timedelta64(1, 'm')),
]).T
In [54]:
time_regression_df.columns = ['pickup_datetime_dayofweek', 'pickup_datetime_hour',
'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
'trip_time']
Use minutes as the prediction target instead of seconds (ceil the time). Definitely more robust than seconds!
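For illustration, a tiny sketch of that conversion (the timestamp is made up):
# Sketch: a 7:31 trip becomes 8 minutes after dividing by one minute and ceiling.
example_trip = pd.Timedelta('00:07:31')
np.ceil(example_trip / np.timedelta64(1, 'm'))  # -> 8.0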
In [55]:
time_regression_df.tail()
Out[55]:
In [56]:
time_regression_df.head()
Out[56]:
In [57]:
time_regression_df.ix[:,0:6].describe()
Out[57]:
In [58]:
print(time_regression_df.trip_time.value_counts())
print(len(time_regression_df.trip_time.value_counts()))
In [59]:
time_regression_df.trip_time.quantile([0.05, 0.95])
Out[59]:
So 90% of the trip_times are between 3 and 30 minutes.
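A quick cross-check of that claim (a sketch; the fraction should come out around 0.9):
# Sketch: fraction of trips whose ceiled trip time lies between 3 and 30 minutes.
((time_regression_df.trip_time >= 3) & (time_regression_df.trip_time <= 30)).mean()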
In [60]:
hour_stats = time_regression_df.groupby(time_regression_df.pickup_datetime_hour)
In [61]:
plt.bar(left = hour_stats.pickup_datetime_hour.count().keys(), height=hour_stats.pickup_datetime_hour.count().values/7,
tick_label=hour_stats.pickup_datetime_hour.count().keys(), align='center')
plt.title('Avg. pickups per hour')
plt.xlabel('datetime_hour')
plt.ylabel('frequency')
plt.savefig('avg_pickups_per_hour.png')
In [62]:
print('Avg. pickups per half-hour (summarized over 1 week)')
hour_stats.pickup_datetime_hour.count()/14
Out[62]:
In [63]:
(hour_stats.count()/14).quantile([.5])
Out[63]:
In [64]:
time_regression_df.columns
Out[64]:
In [65]:
from sklearn import cross_validation as cv
time_regression_df_train, time_regression_df_test = cv.train_test_split(time_regression_df, test_size=0.1, random_state=99)
y_train = time_regression_df_train['trip_time']
x_train = time_regression_df_train.ix[:, 0:6]
y_test = time_regression_df_test['trip_time']
x_test = time_regression_df_test.ix[:, 0:6]
In [66]:
time_regression_df_train.tail()
Out[66]:
In [67]:
len(x_train)
Out[67]:
In [68]:
xy_test = pd.concat([x_test, y_test], axis=1)
In [69]:
xy_test.head()
Out[69]:
In [70]:
# xy_test.to_csv('taxi_tree_test_Xy_20130506-12.csv')
# x_test.to_csv('taxi_tree_test_X_20130506-12.csv')
# y_test.to_csv('taxi_tree_test_y_20130506-12.csv')
In [71]:
# xy_test_sample = Xy_test.sample(10000, random_state=99)
In [72]:
# xy_test_sample.to_csv('taxi_tree_test_Xy_sample.csv')
In [73]:
# xy_test_sample.head()
In [74]:
print(x_train.shape)
print(x_train.size)
print(x_test.shape)
print(time_regression_df.shape)
print(x_train.shape[0]+x_test.shape[0])
In [75]:
import time
# Import the necessary modules and libraries
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import matplotlib.pyplot as plt
In [76]:
#features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude','pickup_datetime']
#print("* features:", features, sep="\n")
max_depth_list = (10,15,20,25,30)
scores = [-1, -1, -1, -1, -1]
sum_abs_devs = [-1, -1, -1, -1, -1]
times = [-1, -1, -1, -1, -1]
for i in range(0,len(max_depth_list)):
start = time.time()
regtree = DecisionTreeRegressor(min_samples_split=1000, random_state=10, max_depth=max_depth_list[i])# formerly 15. 15 is reasonable,
# 30 brings best results # random states: 99
regtree.fit(x_train, y_train)
scores[i]= regtree.score(x_test, y_test)
y_pred = regtree.predict(x_test)
sum_abs_devs[i] = sum(abs(y_pred-y_test))
times[i] = time.time() - start
print(max_depth_list)
print(scores)
print(sum_abs_devs)
print(times)
Sum of absolute deviations on the test set (in thousands of minutes), by min_samples_split (rows) and max_depth (columns):

| min_samples_split \ max_depth | 10 | 15 | 20 | 25 | 30 |
|---|---|---|---|---|---|
| 3 | 1543 | 1267 | 1127 | 1088 | 1139 |
| 10 | 1544 | 1266 | 1117 | 1062 | 1086 |
| 20 | 1544 | 1265 | 1108 | 1037 | 1034 |
| 50 | 1544 | 1263 | 1097 | 1011 | 994 |
| 250 | 1544 | 1266 | 1103 | 1019 | 1001 |
| 1000 | 1548 | 1284 | 1144 | 1085 | 1077 |
| 2500 | 1555 | 1307 | 1189 | 1150 | 1146 |
min_samples_split=3
(10, 15, 20, 25, 30)
[0.51550436937183575, 0.64824394212610637, 0.68105673170887715, 0.66935222696811203, 0.62953726391785103]
[1543779.4758261547, 1267630.6429649692, 1126951.2647852183, 1088342.055931434, 1139060.7870262777]
[14.802491903305054, 21.25719118118286, 27.497225046157837, 32.381808280944824, 35.0844943523407]
min_samples_split=10
(10, 15, 20, 25, 30)
[0.51546967657630205, 0.65055440252664309, 0.69398351369676525, 0.69678113708751077, 0.67518497976746361]
[1543829.4000325042, 1266104.6486240581, 1117165.9640872395, 1061893.3390857978, 1086045.4846943137]
[14.141993999481201, 20.831212759017944, 25.626588821411133, 29.81039047241211, 32.23483180999756]
min_samples_split=20
(10, 15, 20, 25, 30)
[0.51537943698967736, 0.65215078696481421, 0.70216115764491505, 0.71547757670696144, 0.70494598277965781]
[1543841.1100632891, 1264595.0251062319, 1108064.4596608584, 1036593.8033015681, 1039378.3133869285]
[14.048030376434326, 20.481205463409424, 25.652794361114502, 29.03341507911682, 31.56394076347351]
min_samples_split=50
(10, 15, 20, 25, 30)
[0.51540742268899331, 0.65383862050244068, 0.71125658610588971, 0.73440457163892259, 0.73435595461521908]
[1543721.3435906437, 1262877.4227863667, 1097080.889761846, 1010511.305738725, 994244.46643680066]
[14.682952404022217, 21.243955373764038, 25.80405569076538, 28.731933116912842, 32.00149917602539]
min_samples_split=250
(10, 15, 20, 25, 30)
[0.51532618474195502, 0.65304694576643452, 0.712453138233199, 0.73862283625684677, 0.74248829470934752]
[1544004.1103626473, 1266358.9437320188, 1102793.6462709717, 1018555.9754967012, 1000675.2014443219]
[14.215412378311157, 20.32301664352417, 25.39385199546814, 27.81620717048645, 28.74960231781006]
min_samples_split=1000
(10, 15, 20, 25, 30)
[0.51337097515902541, 0.64409382777503155, 0.6957163207523871, 0.71429589738370614, 0.7159815227278703]
[1547595.3414912082, 1284490.8114952976, 1143568.0997977962, 1084873.9820350427, 1077427.5321143884]
[14.676559448242188, 20.211236476898193, 23.846965551376343, 26.270352125167847, 26.993313789367676]
min_samples_split=2500
(10, 15, 20, 25, 30)
[0.50872112253965895, 0.63184888428446373, 0.67528344919996985, 0.68767132817144228, 0.68837707513473978]
[1554528.9746030923, 1306995.3609336747, 1188981.9585730932, 1149615.9326777055, 1146209.3017767756]
[14.31177806854248, 20.02240490913391, 23.825161457061768, 24.616609811782837, 25.06274127960205]
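The cell above only varies max_depth for a single min_samples_split; the table and the outputs above were collected by re-running it with different values. A minimal sketch of the full grid (assuming x_train, y_train, x_test and y_test as defined above; the variable names are only for illustration, and refitting one tree per combination takes a while):
# Sketch: sweep both hyperparameters and collect R^2, sum of absolute deviations and runtime.
results = []
for mss in (3, 10, 20, 50, 250, 1000, 2500):
    for depth in (10, 15, 20, 25, 30):
        start = time.time()
        tree_model = DecisionTreeRegressor(min_samples_split=mss, max_depth=depth, random_state=10)
        tree_model.fit(x_train, y_train)
        y_pred = tree_model.predict(x_test)
        results.append({'min_samples_split': mss, 'max_depth': depth,
                        'r2': tree_model.score(x_test, y_test),
                        'sum_abs_dev': np.abs(y_pred - y_test).sum(),
                        'seconds': time.time() - start})
results_df = pd.DataFrame(results)
# Reproduce the table above (sum of absolute deviations in thousands of minutes).
(results_df.pivot(index='min_samples_split', columns='max_depth', values='sum_abs_dev') / 1000).round()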
Train the most promising decision tree again
In [77]:
start = time.time()
regtree = DecisionTreeRegressor(min_samples_split=50, random_state=10, max_depth=25, splitter='best' )
regtree.fit(x_train, y_train)
regtree.score(x_test, y_test)
y_pred = regtree.predict(x_test)
sum_abs_devs = sum(abs(y_pred-y_test))
elapsed = time.time() - start
print(elapsed)
A tree with this depth is too big to dump. Graphviz works fine until around depth 12.
In [78]:
# from sklearn import tree
# tree.export_graphviz(regtree, out_file='figures/tree_d10.dot', feature_names=time_regression_df.ix[:,0:6].columns, class_names=time_regression_df.columns[6])
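A hedged sketch of what such a dump could look like for a shallower tree (assuming the figures/ directory exists and x_train, y_train as above; small_tree is just an illustrative name):
# Sketch: fit a tree that Graphviz can still render and export it as a .dot file.
from sklearn import tree
small_tree = DecisionTreeRegressor(min_samples_split=50, max_depth=10, random_state=10)
small_tree.fit(x_train, y_train)
tree.export_graphviz(small_tree, out_file='figures/tree_d10.dot',
                     feature_names=list(x_train.columns))
# Render outside the notebook, e.g.: dot -Tpng figures/tree_d10.dot -o figures/tree_d10.png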
In [79]:
regtree.tree_.impurity
Out[79]:
In [80]:
y_train.describe()
Out[80]:
In [81]:
print('R²: ', regtree.score(x_test, y_test))
In [82]:
from sklearn.externals import joblib
joblib.dump(regtree, 'treelib/regtree_depth_25_mss_50_rs_10.pkl', protocol=2)
Out[82]:
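To reuse the persisted model later, a short sketch (joblib pickles are only portable across matching scikit-learn versions):
# Sketch: load the dumped tree back from disk and re-evaluate it.
regtree_loaded = joblib.load('treelib/regtree_depth_25_mss_50_rs_10.pkl')
regtree_loaded.score(x_test, y_test)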
A few stats about the trained tree:
In [83]:
print(regtree.feature_importances_ ,'\n',
regtree.class_weight,'\n',
regtree.min_samples_leaf,'\n',
regtree.tree_.n_node_samples,'\n'
)
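The raw feature_importances_ array is aligned with the training columns; a small sketch to make it readable:
# Sketch: pair the importances with the column names and sort them.
pd.Series(regtree.feature_importances_, index=x_train.columns).sort_values(ascending=False)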
In [84]:
y_pred = regtree.predict(x_test)
np.linalg.norm(np.ceil(y_pred)-y_test)
diff = (y_pred-y_test)
# plt.figure(figsize=(12,10)) # not needed. set values globally
plt.hist(diff.values, bins=100, range=[-50, 50])
print('Percentiles (%): ', [1,5,10,15,25,50,75,90,95,99], '\n', np.percentile(diff.values, [1,5,10,15,25,50,75,90,95,99]))
print('Absolute time deviation (in 1k): ', sum(abs(diff))/1000)
plt.title('Error Distribution on the 2013 Test Set')
plt.xlabel('Error in Minutes')
plt.ylabel('Frequency')
plt.savefig('figures/simple_tree_error_d25_msp_50.eps', format='eps', dpi=1000)
In [85]:
diff.describe()
Out[85]:
In [86]:
# A node is a leaf iff both child pointers are -1 (TREE_LEAF), so their product is 1;
# mark every other node with 0.
leaves = regtree.tree_.children_left * regtree.tree_.children_right
leaves[leaves != 1] = 0
print(leaves)
In [699]:
print(leaves[leaves==1].sum())
len(leaves[leaves==1])
Out[699]:
So we have 67260 leaves.
In [88]:
len(leaves[leaves==1])/regtree.tree_.node_count
Out[88]:
So about 50% of the nodes are leaves. A bit of cross-checking:
In [89]:
print((leaves==1).sum()+(leaves==0).sum())
print(len(leaves))
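Equivalently, leaves can be identified directly via the sentinel value -1 (scikit-learn's TREE_LEAF) in children_left; a short sketch as an extra cross-check:
# Sketch: a node is a leaf iff it has no left child.
leaf_mask = regtree.tree_.children_left == -1
print(leaf_mask.sum(), leaf_mask.sum() / regtree.tree_.node_count)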
In [90]:
node_samples = regtree.tree_.n_node_samples
node_samples
Out[90]:
In [91]:
leaf_samples = np.multiply(leaves, node_samples)
stats = np.unique(leaf_samples, return_counts=True)
stats
Out[91]:
In [92]:
plt.scatter(stats[0][1:], stats[1][1:])
plt.yscale('log')
plt.xscale('log')
In [93]:
node_perc = np.cumsum(stats[1][1:]) # Cumulative count of leaves
samples_perc = np.cumsum(np.multiply(stats[0][1:],stats[1][1:]))
node_perc = node_perc / node_perc[-1]
samples_perc = samples_perc / samples_perc[-1]
plt.plot(node_perc, samples_perc)
plt.plot((np.array(range(0,100,1))/100), (np.array(range(0,100,1))/100), color='black')
plt.ylim(0,1)
plt.xlim(0,1)
plt.title('Lorenz Curve Between Nodes And Samples')
plt.xlabel('Leaves %')
plt.ylabel('Samples %')
plt.fill_between(node_perc, samples_perc, color='blue', alpha=1)
plt.savefig('figures/lorenzcurve_d25_msp_50.eps', format='eps', dpi=1000)
plt.savefig('figures/lorenzcurve_d25_msp_50.png', format='png', dpi=300)
In [651]:
len(leaf_samples)==regtree.tree_.node_count
Out[651]:
This confirms that the leaf_samples vector covers every node of the tree.
In [697]:
max_leaf = [np.argmax(leaf_samples), max(leaf_samples)]
print('So node no.', max_leaf[0] ,'is a leaf and has', max_leaf[1] ,'samples in it.')
print(max_leaf)
In [680]:
# Inspired by: http://stackoverflow.com/questions/20224526/
# how-to-extract-the-decision-rules-from-scikit-learn-decision-tree
def get_rule(tree, feature_names, leaf):
    left = tree.tree_.children_left
    right = tree.tree_.children_right
    threshold = tree.tree_.threshold
    features = [feature_names[i] for i in tree.tree_.feature]
    value = tree.tree_.value
    samples = tree.tree_.n_node_samples
    global count
    count = 0
    global result
    result = {}

    def recurse_up(left, right, threshold, features, node):
        global count
        global result
        count = count + 1
        if node != 0:
            for i, j in enumerate(right):
                if j == node:
                    print('Node:', node, 'is right of:', i, ' with ', features[i], '>', threshold[i])
                    result[count] = [features[i], False, threshold[i]]
                    return recurse_up(left, right, threshold, features, i)
            for i, j in enumerate(left):
                if j == node:
                    print('Node:', node, 'is left of', i, ' with ', features[i], '<= ', threshold[i])
                    result[count] = [features[i], True, threshold[i]]
                    return recurse_up(left, right, threshold, features, i)
        else:
            return result

    print('Leaf:', leaf, ', value: ', value[leaf][0][0], ', samples: ', samples[leaf])
    recurse_up(left, right, threshold, features, leaf)
    return result
In [681]:
branch_to_leaf=get_rule(regtree, time_regression_df.ix[:,0:6].columns,max_leaf[0])
In [682]:
branch_to_leaf
Out[682]:
Processing is nicer if the path is in a data frame.
In [683]:
splitsdf = pd.DataFrame(branch_to_leaf).transpose()
splitsdf.columns = ['features', 'leq', 'value']
splitsdf
Out[683]:
Via grouping, we can extract the relevant splits, which are always the ones towards the end of the branch (i.e. closest to the leaf). Earlier splits become obsolete if the same feature is split in the same direction again further down the tree.
In [685]:
splitstats = splitsdf.groupby(['features','leq'])
splitstats.groups
Out[685]:
Groupby is very helpful here. Always choose the split with the lowest index, i.e. the one closest to the leaf; min() is used here for demonstration purposes only.
In [698]:
splitstats.min()
Out[698]:
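A sketch of picking the effective (tightest) bound per (feature, direction) group instead, i.e. the split encountered first when walking up from the leaf:
# Sketch: after sorting by the count index, the first row of each group is the split
# closest to the leaf and therefore the binding one.
effective_bounds = splitsdf.sort_index().groupby(['features', 'leq']).first()
effective_bounds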
One might write one's own get_group helper, which throws fewer exceptions when a key is not present (e.g. when there is no lower bound on day_of_week). This happens especially in trees of low depth.
In [688]:
def get_group(g, key):
    if key in g.groups:
        return g.get_group(key)
    # Key not present on this branch: return a single row with a NaN threshold instead of raising.
    return pd.DataFrame([list(key) + [np.nan]])
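A usage sketch, assuming (as the filter further below suggests) that this leaf has no lower bound on the day of the week:
# Sketch: a missing key returns a placeholder row with a NaN threshold instead of raising a KeyError.
get_group(splitstats, ('pickup_datetime_dayofweek', False))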
In [689]:
area_coords = dict()
area_coords['dropoff_upper_left'] = [splitstats.get_group(('dropoff_latitude', True)).iloc[0].value,
splitstats.get_group(('dropoff_longitude', False)).iloc[0].value]
area_coords['dropoff_lower_right'] = [splitstats.get_group(('dropoff_latitude',False)).iloc[0].value,
splitstats.get_group(('dropoff_longitude',True)).iloc[0].value]
area_coords['pickup_upper_left'] = [splitstats.get_group(('pickup_latitude',True)).iloc[0].value,
splitstats.get_group(('pickup_longitude',False)).iloc[0].value]
area_coords['pickup_lower_right'] = [splitstats.get_group(('pickup_latitude',False)).iloc[0].value,
splitstats.get_group(('pickup_longitude',True)).iloc[0].value]
area_coords
Out[689]:
In [690]:
import operator
dropoff_rect_len = list(map(operator.sub,area_coords['dropoff_upper_left'],
area_coords['dropoff_lower_right']))
pickup_rect_len = list(map(operator.sub,area_coords['pickup_upper_left'],
area_coords['pickup_lower_right']))
dropoff_rect_len, pickup_rect_len
Out[690]:
In order to draw the rectangle, we need the side lengths of the areas.
In [691]:
import matplotlib.patches as patches
x = data_in_box.pickup_longitude
y = data_in_box.pickup_latitude
fig = plt.figure(figsize=(20, 10))
# Reduce the plot to Manhattan
plt.hist2d(x, y, bins=300, range=[[min(x.values),-73.95],[40.675,40.8]])
plt.colorbar()
plt.title('Pickup density (first full week in May 2013)')
plt.ylabel('Latitude')
plt.xlabel('Longitude')
plt.hold(True)
ax = fig.gca()
ax.add_patch(patches.Rectangle((area_coords['dropoff_upper_left'][1], area_coords['dropoff_lower_right'][0]),
abs(dropoff_rect_len[1]), dropoff_rect_len[0], fill=False, edgecolor='red', linewidth=5))
ax.add_patch(patches.Rectangle((area_coords['pickup_upper_left'][1], area_coords['pickup_lower_right'][0]),
abs(pickup_rect_len[1]), pickup_rect_len[0], fill=False, edgecolor='white', linewidth=5))
ax.grid(False)
plt.hold(False)
In [692]:
trips_of_leaf = x_train.loc[(x_train['dropoff_latitude'] > area_coords['dropoff_lower_right'][0]) &
(x_train['dropoff_longitude'] < area_coords['dropoff_lower_right'][1]) &
(x_train['dropoff_latitude'] < area_coords['dropoff_upper_left'][0]) &
(x_train['dropoff_longitude'] > area_coords['dropoff_upper_left'][1]) &
(x_train['pickup_latitude'] > area_coords['pickup_lower_right'][0]) &
(x_train['pickup_longitude'] < area_coords['pickup_lower_right'][1]) &
(x_train['pickup_latitude'] < area_coords['pickup_upper_left'][0]) &
(x_train['pickup_longitude'] > area_coords['pickup_upper_left'][1]) &
(x_train['pickup_datetime_dayofweek'] < 4.5) &
(x_train['pickup_datetime_hour'] < 18.5) &
(x_train['pickup_datetime_hour'] > 7.5)
]
In [693]:
trips_of_leaf.head()
Out[693]:
In [694]:
print('Filtered trips: ', len(trips_of_leaf))
print('Trips in leaf: ', max_leaf[1])
len(trips_of_leaf) == max_leaf[1]
Out[694]:
In [162]:
import gmaps
import gmaps.datasets
gmaps.configure(api_key='AI****')  # Fill in your API key here
In [650]:
trips_of_leaf_pickup_list = trips_of_leaf.iloc[:,[2,3]].as_matrix().tolist()
trips_of_leaf_dropoff_list = trips_of_leaf.iloc[:,[4,5]].as_matrix().tolist()
In [645]:
# data = gmaps.datasets.load_dataset('taxi_rides')  # leftover from the gmaps example; would overwrite our DataFrame and is not used below
pickups_gmap = gmaps.Map()
dropoffs_gmap = gmaps.Map()
pickups_gmap.add_layer(gmaps.Heatmap(data=trips_of_leaf_pickup_list[0:1000]))
dropoffs_gmap.add_layer(gmaps.Heatmap(data=trips_of_leaf_dropoff_list[0:1000]))
In [695]:
pickups_gmap
In [696]:
dropoffs_gmap