In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats, integrate
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from statsmodels.distributions.empirical_distribution import ECDF
import datetime as dt
plt.style.use('seaborn-whitegrid')
plt.rcParams['image.cmap'] = 'Blues'
#sns.set_context('notebook',font_scale=2)
sns.set_style("whitegrid")
%matplotlib inline
labelsize = 22
mpl.rcParams.update({'font.size': labelsize})
mpl.rcParams.update({'figure.figsize': (20,10)})
mpl.rcParams.update({'axes.titlesize': 'large'})
mpl.rcParams.update({'axes.labelsize': 'large'})
mpl.rcParams.update({'xtick.labelsize': labelsize})
mpl.rcParams.update({'ytick.labelsize': labelsize})
In [2]:
!cd data && ls
Use bash to check the available data files. =)
In [3]:
data = pd.read_csv('data/bike_oneweekfrom20140505.csv', index_col=0, parse_dates=True)
In [4]:
data.info()
In [5]:
data.columns
Out[5]:
Rename the columns so that they are consistent with the taxi dataset.
In [6]:
new_column_names = ['trip_time', 'pickup_datetime', 'dropoff_datetime', 'start_station_id',
'start_station_name', 'pickup_latitude',
'pickup_longitude', 'end_station_id', 'end_station_name',
'dropoff_latitude', 'dropoff_longitude', 'bikeid', 'usertype',
'birth year', 'gender']
In [7]:
data.columns = new_column_names
In [8]:
data.describe()
Out[8]:
Parse the dates.
In [9]:
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
data['dropoff_datetime'] = pd.to_datetime(data['dropoff_datetime'], format='%Y-%m-%d %H:%M:%S')
data['trip_time'] = pd.to_timedelta(data['trip_time'], unit='s')
In [10]:
data.describe().transpose()
Out[10]:
In [11]:
data.head()
Out[11]:
In [12]:
data.info()
Have a look at every station. How many pickups are there per station?
In [13]:
group_by_start = data.groupby(data.start_station_id)
usage_freq = group_by_start.trip_time.count()
print('Number of stations: ' + str(len(usage_freq)))
plt.hist(usage_freq, bins=50)
plt.title('Frequency of starts from start stations')
Out[13]:
Looking at the map, it is not surprising that the station at Grand Central Terminal is the most frequently used one.
In [14]:
group_by_start_name = data.groupby(data.start_station_name, )
usage_freq_name = group_by_start_name.trip_time.count()
usage_freq_name.sort_values().keys()[-10:]
Out[14]:
In [15]:
usage_freq_name.sort_values().keys()
Out[15]:
So we have 326 different start stations.
In [16]:
group_by_end_name = data.groupby(data.end_station_name, )
usage_freq_name = group_by_end_name.trip_time.count()
usage_freq_name.sort_values().keys()
Out[16]:
We have 326 different end stations, too. But are all end stations contained in the start stations and vice versa?
In [17]:
group_by_start_id = data.groupby(data.start_station_id, )
start_ids = group_by_start_id.trip_time.count().sort_values().keys()
group_by_end_id = data.groupby(data.end_station_id, )
end_ids = group_by_end_id.trip_time.count().sort_values().keys()
len(set(start_ids).intersection(end_ids))
Out[17]:
This confirms that every station is used both as a start and as an end station.
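A stricter check than counting the intersection is to compare the two ID sets directly (a minimal sketch reusing the start_ids and end_ids computed above):
In [ ]:
print(set(start_ids) == set(end_ids))  # True only if every station appears on both sides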
In [18]:
data.isnull().sum()
Out[18]:
So there is not much missing data. That is somewhat surprising, so it may be worth double-checking.
In [19]:
(data==0).sum()
Out[19]:
So only the gender feature contains many zeros. Gender can take three values (0 = unknown, 1 = male, 2 = female).
In [20]:
from collections import Counter
print(Counter(data.gender))
Counter(data['birth year'])
Out[20]:
So we also have missing values in "gender" and "birth year". But since we are not interested in these features, we can ignore the missing values at this point.
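For readability, the gender codes could also be counted with labels instead of raw numbers (a small sketch, assuming the coding 0 = unknown, 1 = male, 2 = female stated above; `gender_labels` is just an illustrative name):
In [ ]:
gender_labels = {0: 'unknown', 1: 'male', 2: 'female'}
print(data.gender.map(gender_labels).value_counts())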
A quick overview of the trip times.
In [21]:
plt.hist((data['trip_time'] / np.timedelta64(1, 'm')), bins=30, range=[0, 100])
plt.title('Distribution of trip_time in minutes')
plt.xlabel('trip_time')
plt.ylabel('frequency')
plt.savefig('figures/bike_trip_time.png', format='png', dpi=300)
We are interested in the trip time in minutes.
In [22]:
sns.boxplot((data['trip_time'] / np.timedelta64(1, 'm')))
Out[22]:
As we can see, we have many outliers.
In [23]:
data.trip_time.quantile([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
Out[23]:
In [24]:
len(data.trip_time.value_counts().values)
Out[24]:
Identify the cases without geo data and remove them from the data to be processed.
In [25]:
# .copy() so that the flag column can be added below without a SettingWithCopyWarning
anomaly = data.loc[(data['dropoff_longitude'].isnull()) | (data['dropoff_latitude'].isnull()) |
                   (data['pickup_longitude'].isnull()) | (data['pickup_latitude'].isnull())].copy()
data = data.drop(anomaly.index)
In [26]:
anomaly['flag'] = 'geo_NA'
In [27]:
data.isnull().sum()
Out[27]:
So what percentage of the data is left to be processed?
In [28]:
len(data)/(len(data)+len(anomaly))
Out[28]:
In [29]:
anomaly.tail()
Out[29]:
In [30]:
plt.hist(data.trip_time.values / np.timedelta64(1, 'm'), bins=50, range=[0,100])
Out[30]:
Calculate the distance of each trip from the pickup and dropoff longitude / latitude coordinates, converted to meters.
In [31]:
data['trip_dist'] = -1.0  # init trip_dist (as float, since the distances in meters are floats)
Be aware that this operation is quite slow.
In [32]:
# input for vincenty: (location.latitude, location.longitude)
from geopy.distance import vincenty
for i in range(len(data)):  # iterate over all rows, not len(data)-1
    pickup = (data.iloc[i]['pickup_latitude'], data.iloc[i]['pickup_longitude'])
    dropoff = (data.iloc[i]['dropoff_latitude'], data.iloc[i]['dropoff_longitude'])
    data.set_value(data.index[i], 'trip_dist', vincenty(pickup, dropoff).meters)  # use the index label, not the position
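The row-wise iloc lookups are what make the loop above slow. A vectorized great-circle (haversine) formula is a much faster approximation, at the cost of ignoring the ellipsoid correction that vincenty applies; `haversine_meters` is an illustrative helper, not the function used in the rest of this notebook:
In [ ]:
def haversine_meters(lat1, lon1, lat2, lon2, radius=6371000.0):
    # vectorized great-circle distance in meters on a spherical Earth
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = np.sin((lat2 - lat1) / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2)**2
    return 2 * radius * np.arcsin(np.sqrt(a))

# data['trip_dist'] = haversine_meters(data.pickup_latitude, data.pickup_longitude,
#                                      data.dropoff_latitude, data.dropoff_longitude)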
In [33]:
data.trip_dist
Out[33]:
We cannot use miles because vincenty ceils / floors the results
In [34]:
data.columns
Out[34]:
Check that the new trip_dist column is present.
In [35]:
# data.to_csv('data/bike_20140505_with_dist.csv')
In [36]:
data['avg_velocity'] = data.trip_dist.values / (data.trip_time / np.timedelta64(1, 'h'))  # meters per hour
In [37]:
data.avg_velocity
Out[37]:
Convert to km/h.
In [38]:
data.avg_velocity = data.avg_velocity / 1000  # m/h -> km/h
In [39]:
data.avg_velocity
Out[39]:
Convert avg_velocity from km/h to miles/h. (1 km = 0.621371 miles)
In [40]:
data['avg_velocity'] = data.avg_velocity * 0.621371  # km/h -> mph
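The three steps above (meters per hour, then km/h, then mph) can also be collapsed into one expression; a compact equivalent sketch (1 mile = 1609.344 meters):
In [ ]:
METERS_PER_MILE = 1609.344
# average velocity in mph, computed directly from meters and hours
data['avg_velocity'] = (data.trip_dist / METERS_PER_MILE) / (data.trip_time / np.timedelta64(1, 'h'))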
In [41]:
# data.to_csv('data/bike_20140505_with_dist_and_avg_velo.csv')
In [42]:
plt.hist(data.avg_velocity, bins=60, range=[0,15])
plt.title('Distribution of avg_velocity in mph')
plt.xlabel('avg_velocity')
plt.ylabel('frequency')
plt.savefig('figures/bike_avg_vel.png', format='png', dpi=300)
In [43]:
data.avg_velocity.describe() # in mph
Out[43]:
In [44]:
data.head()
Out[44]:
In [45]:
data.avg_velocity.quantile([.0001,.01, .5, .75, .95, .975, .99, .995])
Out[45]:
In [46]:
lb = 5
ub = 15
# note: this overwrites the geo_NA anomalies from above; append (concat) instead if all flags should be kept together
anomaly = data.loc[(data['avg_velocity'] < lb) | (data['avg_velocity'] > ub)].copy()
anomaly.loc[data.loc[data['avg_velocity'] < lb].index, 'flag'] = 'too_slow'
anomaly.loc[data.loc[data['avg_velocity'] > ub].index, 'flag'] = 'too_fast'
data = data.drop(anomaly.index, errors='ignore')  # ignore labels that are no longer contained
print(1 - len(data) / (len(data) + len(anomaly)))
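As the comment above notes, the velocity outliers overwrite the earlier geo_NA anomalies. If all flagged trips should be kept together, the frames could be concatenated instead (a sketch; `geo_anomaly` is a hypothetical copy that would have to be saved before `anomaly` is reassigned above):
In [ ]:
# geo_anomaly = anomaly.copy()                        # taken right after the geo_NA flagging
# all_anomalies = pd.concat([geo_anomaly, anomaly])   # geo_NA, too_slow and too_fast together
# all_anomalies.flag.value_counts()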
In [47]:
anomaly.head()
Out[47]:
In [48]:
data.avg_velocity.describe()
Out[48]:
In [49]:
anomaly.tail()
Out[49]:
In [50]:
x = data.pickup_longitude
y = data.pickup_latitude
bins = 100
H, xedges, yedges = np.histogram2d(x, y, bins=bins)
plt.jet()
fig = plt.figure(figsize=(20, 10))
#ax.set_title('pcolormesh: exact bin edges')
#mesh = plt.pcolormesh(X, Y, H)
plt.hist2d(x,y,bins=bins)
plt.colorbar()
plt.scatter(xedges[79], yedges[61], marker='x')
ax = fig.gca()
ax.grid(False)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Pickup Frequency')
#ax.set_aspect('equal')
#plt.savefig('figure.pdf', format='pdf')
#plt.savefig('figure.png', format='png')
plt.savefig('figures/bike_stations.png', format='png', dpi=300)
Where is the pickup maximum?
In [51]:
np.amax(H)
Out[51]:
In [52]:
np.where(H == H.max())  # indices of the bin with the maximum pickups
Out[52]:
The area with the maximum pickups is around the Grand Central Terminal. Not very surprising.
In [53]:
yedges[61], xedges[79]  # (latitude, longitude) of the bin with the most pickups
Out[53]:
In [54]:
type(H)
Out[54]:
In [55]:
print(H.shape)
print(H.size)
print(H.max())
In [56]:
print('Approximate bin size (diagonal of the first bin) in meters:')
print(vincenty((yedges[0], xedges[0]), (yedges[1], xedges[1])).meters)  # vincenty expects (latitude, longitude)
In [57]:
distances = (-1) * np.ones(len(xedges) - 1)
for x in range(len(xedges) - 1):
    distances[x] = vincenty((yedges[x], xedges[x]), (yedges[x + 1], xedges[x + 1])).meters  # (lat, lon) order
In [58]:
print('Sizes (diagonals) of all bins in meters:')
distances
Out[58]:
In [59]:
(H==0).sum()/H.size
Out[59]:
So about 97% of the bins contain zero pickups. This was expected, because pickups happen only at the fixed stations.
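A quick sanity check: with 326 fixed stations spread over 100 x 100 = 10,000 bins, at most about 3% of the bins can be non-empty, which matches the roughly 97% empty bins observed above:
In [ ]:
n_stations = data.start_station_id.nunique()
print(n_stations / H.size)           # upper bound on the share of non-empty bins (~0.033)
print(1 - (H == 0).sum() / H.size)   # actual share of non-empty bins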
In [60]:
jfk_geodata = (40.641547, -73.778118)
ridgefield_geodata = (40.856406, -74.020642)
data_in_box = data.loc[(data['dropoff_latitude'] > jfk_geodata[0]) &
(data['dropoff_longitude'] < jfk_geodata[1]) &
(data['dropoff_latitude'] < ridgefield_geodata[0]) &
(data['dropoff_longitude'] > ridgefield_geodata[1]) &
(data['pickup_latitude'] > jfk_geodata[0]) &
(data['pickup_longitude'] < jfk_geodata[1]) &
(data['pickup_latitude'] < ridgefield_geodata[0]) &
(data['pickup_longitude'] > ridgefield_geodata[1])
]
# taxidata = taxidata.drop(anomaly.index)
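The same bounding box can be expressed as a small helper, which keeps the filter readable if it is reused; `in_box` is an illustrative name, equivalent to the condition above (jfk_geodata is the south-east corner, ridgefield_geodata the north-west corner):
In [ ]:
def in_box(lat, lon, south_east=jfk_geodata, north_west=ridgefield_geodata):
    # True where the point lies strictly inside the box spanned by the two corners
    return (lat > south_east[0]) & (lat < north_west[0]) & (lon < south_east[1]) & (lon > north_west[1])

# data_in_box = data[in_box(data.pickup_latitude, data.pickup_longitude) &
#                    in_box(data.dropoff_latitude, data.dropoff_longitude)]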
In [61]:
len(data_in_box)/len(data)
Out[61]:
In [62]:
data_in_box.head()
Out[62]:
Let's take a first look at the distribution of the cleaned target variable which we want to estimate:
In [63]:
h = data_in_box.avg_velocity.values
plt.figure(figsize=(20,10))
plt.hist(h, normed=False, bins=150)
#, histtype='stepfilled')
#plt.yscale('log')
#plt.ylabel('log(freq x)', fontsize=40)
#plt.xlabel('x = avg_amount_per_minute', fontsize=40)
#print('Min:' + str(min(h)) + '\nMax:' + str(max(h)))
#plt.locator_params(axis = 'x', nbins = 20)
plt.show()
In [64]:
data_in_box.head()
Out[64]:
In [65]:
time_regression_df = pd.DataFrame([ #data_in_box['pickup_datetime'].dt.day, # leave this out
data_in_box['pickup_datetime'].dt.dayofweek,
data_in_box['pickup_datetime'].dt.hour,
data_in_box['pickup_latitude'],
data_in_box['pickup_longitude'],
data_in_box['dropoff_latitude'],
data_in_box['dropoff_longitude'],
np.ceil(data_in_box['avg_velocity'])
]).T
In [66]:
time_regression_df.columns = [#'pickup_datetime_day',
'pickup_datetime_dayofweek', 'pickup_datetime_hour',
'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
'avg_velocity_mph']
Predict the average velocity rounded up to whole mph (the np.ceil above). These coarse classes are definitely more robust to predict than the raw continuous values.
In [67]:
time_regression_df.head()
Out[67]:
In [68]:
time_regression_df.tail()
Out[68]:
In [69]:
time_regression_df.iloc[:, 0:7].describe()
Out[69]:
In [70]:
print(time_regression_df.avg_velocity_mph.value_counts())
print(len(time_regression_df.avg_velocity_mph.value_counts()))
So we have 10 different velocity classes to predict.
In [71]:
from sklearn import cross_validation as cv
y = time_regression_df['avg_velocity_mph']
X = time_regression_df.iloc[:, 0:6]
X_train, X_test, y_train, y_test = cv.train_test_split(X, y, test_size=0.05, random_state=0)
In [72]:
from sklearn import cross_validation as cv
time_regression_df_train, time_regression_df_test = cv.train_test_split(time_regression_df, test_size=0.05, random_state=99)
y_train = time_regression_df_train['avg_velocity_mph']
x_train = time_regression_df_train.iloc[:, 0:6]
y_test = time_regression_df_test['avg_velocity_mph']
x_test = time_regression_df_test.iloc[:, 0:6]
In [73]:
x_test.head()
Out[73]:
In [74]:
y_test.head()
Out[74]:
In [75]:
xy_test = pd.concat([x_test, y_test], axis=1)
In [76]:
xy_test.head()
Out[76]:
If you want to export something...
In [77]:
#Xy_test.to_csv('taxi_tree_test_Xy_20130506-12.csv')
#X_test.to_csv('taxi_tree_test_X_20130506-12.csv')
#y_test.to_csv('taxi_tree_test_y_20130506-12.csv')
# Xy_test.to_csv('bike_tree_test_Xy_20140505-11.csv')
# X_test.to_csv('bike_tree_test_X_20140505-11.csv')
# y_test.to_csv('bike_tree_test_y_20140506-11.csv')
In [78]:
# Xy_test_sample = Xy_test.sample(10000, random_state=99)
In [79]:
# Xy_test_sample.to_csv('taxi_tree_test_Xy_sample.csv')
In [80]:
# Xy_test_sample.head()
Just to be sure, check the shapes.
In [81]:
print(x_train.shape)
print(x_train.size)
print(x_test.shape)
print(X.shape)
print(x_train.shape[0]+x_test.shape[0])
In [82]:
import time
# Import the necessary modules and libraries
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import matplotlib.pyplot as plt
In [83]:
regtree = DecisionTreeRegressor(min_samples_split=100, random_state=99, max_depth=20)  # formerly max_depth=15; 15 is also reasonable
# random states: 99
regtree.fit(x_train, y_train)
regtree.score(x_test, y_test)
Out[83]:
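The score above comes from a single 5% hold-out split; a quick cross-validated check shows how stable that R² is (a sketch with the same tree settings, using the X and y defined earlier):
In [ ]:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(DecisionTreeRegressor(min_samples_split=100, random_state=99, max_depth=20),
                         X, y, cv=5)
print(scores.mean(), scores.std())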
In [84]:
y_pred = regtree.predict(x_test)
diff_regtree = (y_pred-y_test)
# plt.figure(figsize=(12,10)) # not needed. set values globally
plt.hist(diff_regtree.values, bins=100, range=[-6,6])
print('Percentiles (%): ', [1,5,10,15,25,50,75,90,95,99], '\n', np.percentile(diff_regtree.values, [1,5,10,15,25,50,75,90,95,99]))
print('Sum of absolute deviations (in thousands of mph): ', sum(abs(diff_regtree))/1000)
plt.title('Simple Decision Tree Regressor')
plt.xlabel('deviation in mph')
plt.ylabel('frequency')
plt.savefig('figures/bike_tree.png', format='png', dpi=300)
In [85]:
diff_regtree.describe()
Out[85]:
In [86]:
from sklearn.ensemble import RandomForestRegressor
n_est_list = list(range(2, 32, 8))
min_sam_leaf_list = [2, 50, 100, 150]
max_depth_list = list(range(2, 32, 8))
results = np.empty([0, 4])
for n_est in n_est_list:
    for min_sam_leaf in min_sam_leaf_list:
        for max_depth in max_depth_list:
            rd_regtree = RandomForestRegressor(n_estimators=n_est, n_jobs=6, min_samples_leaf=min_sam_leaf,
                                               random_state=99, max_depth=max_depth)
            rd_regtree.fit(x_train, y_train)
            score = rd_regtree.score(x_test, y_test)
            results = np.vstack((results, [n_est, min_sam_leaf, max_depth, score]))
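The nested loops above are a manual grid search; the same sweep could be written with sklearn's grid search utility, which does the bookkeeping and uses cross-validation on the training data instead of the fixed test split (a sketch, not the run whose results are analyzed below):
In [ ]:
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer versions
param_grid = {'n_estimators': n_est_list,
              'min_samples_leaf': min_sam_leaf_list,
              'max_depth': max_depth_list}
search = GridSearchCV(RandomForestRegressor(random_state=99, n_jobs=6), param_grid, cv=3)
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)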
In [87]:
best = np.where(results[:, 3] == results[:, 3].max())[0]  # row(s) with the best score
results[best, :]
Out[87]:
Look at the ten best results.
In [88]:
results[np.argsort(results[:, 3])][-10:,:]
Out[88]:
In [89]:
results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))
#results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))
#results = np.vstack((results, [n_est, min_sam_leaf, max_depth,score]))
results[:,3]
Out[89]:
In [90]:
from sklearn.ensemble import RandomForestRegressor
#rd_regtree = RandomForestRegressor(n_estimators=10,n_jobs=6,min_samples_leaf=4, random_state=99, max_depth=20)
rd_regtree = RandomForestRegressor(n_estimators=26,n_jobs=6,min_samples_leaf=2, random_state=99, max_depth=26)
#total sum of diff: 1132
#rd_regtree = RandomForestRegressor(n_estimators=40,n_jobs=-1,min_samples_split=3, random_state=99, max_depth=11)
#total sum of diff: 1129
rd_regtree.fit(x_train, y_train)
print('R²: ', rd_regtree.score(x_test, y_test))
In [91]:
rd_regtree.feature_importances_
Out[91]:
In [92]:
tree_list = rd_regtree.estimators_
for i in range(len(tree_list)):
    print(tree_list[i].feature_importances_)
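The raw arrays are hard to read without the column names; pairing the importances of the full forest with the feature names makes the comparison easier (a small sketch):
In [ ]:
pd.Series(rd_regtree.feature_importances_, index=x_train.columns).sort_values(ascending=False)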
In [93]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(results[:,0],results[:,2],results[:,3], marker='o')
plt.show()
So this plot is not particularly useful in this case, but it hints at how the results could be visualized.
In [94]:
y_pred = rd_regtree.predict(x_test)
np.linalg.norm(np.ceil(y_pred)-y_test)
diff_rd = (y_pred-y_test)
# plt.figure(figsize=(12,10)) # not needed. set values globally
plt.hist(diff_rd.values, bins=100, range=[-6,6])
print('Percentiles (%): ', [1,5,10,15,25,50,75,90,95,99], '\n', np.percentile(diff_rd.values, [1,5,10,15,25,50,75,90,95,99]))
print('Sum of absolute deviations (in thousands of mph): ', sum(abs(diff_rd))/1000)
plt.title('Random Forest Regressor')
plt.xlabel('deviation in mph')
plt.ylabel('frequency')
plt.savefig('figures/bike_randomforest.png', format='png', dpi=150)
In [95]:
diff_rd.describe()
Out[95]:
In [99]:
from sklearn.externals import joblib
joblib.dump(rd_regtree, 'randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl', protocol=2)
Out[99]:
In [100]:
! cd randforlib && zip bike_regtree_26x_depth_26_mss_2_PY27.zip bike_regtree_26x_depth_26_mss_2_PY27.pkl*
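A quick round-trip check that the persisted model loads and predicts as before (a sketch; the path is the one written above):
In [ ]:
restored = joblib.load('randforlib/bike_regtree_26x_depth_26_mss_2_PY27.pkl')
print(restored.score(x_test, y_test))  # should match the R² reported above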