In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.rcParams["figure.figsize"] = (10,10)
In [2]:
import random
import matplotlib.patches as mpatches
In [3]:
from IPython.display import Image, HTML
In [2]:
fares_data_file = '../../trip_fare/trip_fare_2.csv'
trips_data_file = '../../trip_data/trip_data_2.csv'
In [3]:
# Load the fare records, keeping only the columns at the listed positions.
fares_pd = pd.read_csv(fares_data_file, usecols=[0, 1, 3, 4, 5, 8, 10])
# Strip stray whitespace from the header names before they are used as keys.
fares_old_columns = fares_pd.columns
fares_pd_columns = [name.strip() for name in fares_old_columns]
fares_pd.columns = fares_pd_columns
print(fares_pd_columns)
# fares_short_pd is a second name for the same frame (not a copy); the two
# identifier columns are shortened to their first 8 characters.
fares_short_pd = fares_pd
for id_column in ('medallion', 'hack_license'):
    fares_short_pd[id_column] = fares_pd[id_column].apply(lambda x: x[:8])
del fares_pd
fares_short_pd.head()
Out[3]:
In [4]:
fares_short_pd.to_csv('../data/trip_fare_short_2.csv')
del fares_short_pd
In [5]:
# Load the trip records, keeping column 1 and columns 5-13 (by position).
# list(range(...)) keeps the concatenation valid whether range() returns a
# list or a lazy range object.
trips_pd = pd.read_csv(trips_data_file, usecols=[1] + list(range(5, 14)))
# Strip stray whitespace from the header names before they are used as keys.
trips_old_columns = trips_pd.columns
trips_pd_columns = [header.strip() for header in trips_old_columns]
trips_pd.columns = trips_pd_columns
print(trips_pd_columns)
# trips_short_pd is a second name for the same frame (not a copy); the driver
# identifier is shortened to its first 8 characters.
trips_short_pd = trips_pd
trips_short_pd['hack_license'] = trips_pd['hack_license'].apply(lambda x: x[:8])
del trips_pd
trips_short_pd.head()
Out[5]:
In [6]:
trips_short_pd.to_csv('../data/trip_data_short_2.csv')
del trips_short_pd
In [ ]:
In [7]:
# Re-point the path variables at the trimmed CSVs written by the earlier cells.
fares_data_file = '../data/trip_fare_short_2.csv'
trips_data_file = '../data/trip_data_short_2.csv'
fares_pd = pd.read_csv(fares_data_file)
trips_pd = pd.read_csv(trips_data_file)
# No `on=` is given, so the join keys are whatever column names the two frames share.
# NOTE(review): both CSVs were written together with their index, so the shared
# keys presumably include the saved row-number column -- confirm this is intended.
taxi_pd = pd.merge(fares_pd, trips_pd, how='outer')
del fares_pd, trips_pd
taxi_pd.head()
Out[7]:
In [8]:
taxi_pd.to_csv('../data/taxi_short_2.csv')
del taxi_pd
In [ ]:
In [5]:
taxi_full_pd = pd.read_csv('../data/taxi_short_2.csv')
#taxi_pd.describe()
In [6]:
# Tip expressed as a percentage (note the factor of 100) of the pre-tip total,
# i.e. of total_amount minus tip_amount.
taxi_full_pd['tip_frac'] = (100.*taxi_full_pd.tip_amount \
/(taxi_full_pd.total_amount - taxi_full_pd.tip_amount))
taxi_full_pd['tip_frac'].describe()
Out[6]:
In [7]:
# Centre (latitude, longitude, in degrees) and half-widths of the square window analysed below.
# NOTE(review): the original comment read "NYW"; presumably New York City -- confirm.
center_lat = 40.76
center_lng = -73.925
dlat = 0.1
dlng = 0.1
# Window edges derived from the centre and half-widths.
min_lat = center_lat - dlat
max_lat = center_lat + dlat
min_lng = center_lng - dlng
max_lng = center_lng + dlng
In [9]:
# Row filters, grouped by what they check; a row must pass all of them.
plausible_trip = ((taxi_full_pd.trip_distance > 0.1) &
                  (taxi_full_pd.trip_distance <= 50) &
                  (taxi_full_pd.trip_time_in_secs > 1))
# Fares must be below 50 and a whole multiple of 0.5.
plausible_fare = ((taxi_full_pd.fare_amount < 50) &
                  (taxi_full_pd.fare_amount*2 % 1 == 0))
# tip_frac is a percentage: keep tips in [0.1, 50).
plausible_tip = ((taxi_full_pd.tip_frac >= 0.1) &
                 (taxi_full_pd.tip_frac < 50))
# Only the pickup location is required to fall strictly inside the window.
pickup_in_window = ((taxi_full_pd.pickup_latitude > center_lat - dlat) &
                    (taxi_full_pd.pickup_latitude < center_lat + dlat) &
                    (taxi_full_pd.pickup_longitude > center_lng - dlng) &
                    (taxi_full_pd.pickup_longitude < center_lng + dlng))
taxi_pd = pd.DataFrame(taxi_full_pd[plausible_trip & plausible_fare &
                                    plausible_tip & pickup_in_window])
del taxi_full_pd
In [10]:
taxi_pd.columns
Out[10]:
In [11]:
# Drop the first two columns by position (in place).
# NOTE(review): presumably these are the index columns picked up from the CSV
# round-trips -- check the taxi_pd.columns output above before relying on this.
taxi_pd.drop(taxi_pd.columns[:2],axis=1,inplace=True)
taxi_pd.head()
Out[11]:
In [12]:
# Headline statistics for the filtered trips.
# Series.min()/max() replace the Python builtins: they run vectorised and skip
# missing values instead of comparing them element by element.
min_date = taxi_pd['pickup_datetime'].min()
max_date = taxi_pd['dropoff_datetime'].max()
num_trips = len(taxi_pd)
num_drivers = len(taxi_pd['hack_license'].unique())
total_miles = taxi_pd['trip_distance'].sum()
total_time = taxi_pd['trip_time_in_secs'].sum()
# The thousands separators come from the ',' in the format specs; the former
# grouping=True keyword was never referenced by the templates and is dropped.
print('Using date range {0} to {1}.'.format(min_date, max_date))
print('Total of {0:,} trips and {1:,} drivers.'.format(num_trips, num_drivers))
# trip_time_in_secs is summed in seconds; divide by 3600 to report hours.
print('Total of {0:,.2f} miles and {1:,.2f} hours.'.format(total_miles, total_time/3600.))
In [13]:
#scatter plot of pickups (red) and dropoffs(blue)
# Never request more points than there are trips: random.sample raises
# ValueError when the sample is larger than the population.
num_scatter_pts = min(200000, len(taxi_pd))
# Row positions (0 .. len(taxi_pd)-1) of the sampled trips.
scatter_ix = random.sample(range(len(taxi_pd)), num_scatter_pts)
# taxi_pd still carries the row labels of the unfiltered frame. Resetting the
# index on the location frames makes label i identical to position i, so the
# sampled positions select the intended rows however they are looked up.
pick_loc_pd = taxi_pd[['pickup_longitude', 'pickup_latitude']].reset_index(drop=True)
drop_loc_pd = taxi_pd[['dropoff_longitude', 'dropoff_latitude']].reset_index(drop=True)
In [14]:
# Side-by-side scatter of the sampled drop-offs (left, blue) and pick-ups (right, red).
plt.figure(figsize=(12, 6))
plt.rcParams.update({'font.size': 14})
# Same transparency rule for both panels.
point_alpha = 0.05*num_scatter_pts/100000

plt.subplot(1,2,1)
# scatter_ix holds row *positions*, so select with .iloc: a label-based lookup
# would look the sampled numbers up in the index instead.
plt.scatter(drop_loc_pd.iloc[scatter_ix, 0], drop_loc_pd.iloc[scatter_ix, 1],
            s=1, alpha=point_alpha, c='b', lw=0)
plt.xlim(min_lng, max_lng)
plt.ylim(min_lat, max_lat)
plt.xlabel('longitude (degrees)', size=14)
plt.ylabel('latitude (degrees)', size=14)
plt.title('Drop Offs')

plt.subplot(1,2,2)
plt.scatter(pick_loc_pd.iloc[scatter_ix, 0], pick_loc_pd.iloc[scatter_ix, 1],
            s=1, alpha=point_alpha, c='r', lw=0)
plt.xlim(min_lng, max_lng)
plt.ylim(min_lat, max_lat)
# The y label is only drawn on the left panel.
plt.xlabel('longitude (degrees)', size=14)
plt.title('Pick Ups')
plt.show()
In [15]:
# Uniform grid over the window: N cells per axis need N + 1 equally spaced edges.
num_lat_bins = 40
num_lng_bins = 40
lat_bins = np.linspace(min_lat, max_lat, num_lat_bins+1)
lng_bins = np.linspace(min_lng, max_lng, num_lng_bins+1)
In [16]:
# Sampled drop-offs with the bin grid overlaid as minor grid lines.
plt.rcParams.update({'font.size': 14})
# Size the figure when it is created: calling plt.figure(1, figsize=...) on a
# figure that already exists does not apply the requested size.
fig, ax = plt.subplots(1, figsize=(8, 8))
# scatter_ix holds row positions, hence .iloc rather than a label lookup.
ax.scatter(drop_loc_pd.iloc[scatter_ix, 0], drop_loc_pd.iloc[scatter_ix, 1],
           s=1, alpha=0.05*num_scatter_pts/100000, c='b', lw=0, label='dropoffs')
ax.set_xlim(min_lng, max_lng)
ax.set_ylim(min_lat, max_lat)
# Minor ticks sit on the bin edges so that the minor grid draws the bins.
ax.set_xticks(lng_bins, minor=True)
ax.set_yticks(lat_bins, minor=True)
ax.xaxis.grid(True, which='minor')
ax.yaxis.grid(True, which='minor')
ax.set_xlabel('longitude (degrees)', size=14)
ax.set_ylabel('latitude (degrees)', size=14)
ax.set_title('Binning Data')
plt.show()
In [17]:
#assign every pickup and dropoff coordinate to a uniform lng/lat bin
# Full (lat bin, lng bin) grid, used later to reindex the per-bin counts.
lat_lng_mi = pd.MultiIndex.from_product(
    [range(num_lat_bins), range(num_lng_bins)], names=['lat', 'lng'])
pick_clat = pd.cut(taxi_pd.pickup_latitude.values, lat_bins)
pick_clng = pd.cut(taxi_pd.pickup_longitude.values, lng_bins)
drop_clat = pd.cut(taxi_pd.dropoff_latitude.values, lat_bins)
drop_clng = pd.cut(taxi_pd.dropoff_longitude.values, lng_bins)
# Quick look at the binned pickup latitudes, the bin edges and the first few bin codes.
print(pick_clat)
print(lat_bins)
print(pick_clat.codes[:5])
print(pick_clat[:5])
In [18]:
pick_s = pd.Series(taxi_pd.pickup_longitude)
drop_s = pd.Series(taxi_pd.dropoff_longitude)
#total number of stops (pickup or dropoff) in each lng/lat bin
pick_binned_cnts = pick_s.groupby([pick_clat.codes, pick_clng.codes]).count()
drop_binned_cnts = drop_s.groupby([drop_clat.codes, drop_clng.codes]).count()
# The two count Series are indexed by the bins that actually occur on each
# side. A plain "+" yields NaN for a bin present on only one side, and the
# fillna(0) below would then wipe that bin's count; fill_value=0 treats the
# missing side as zero so the stops are kept.
stop_binned_cnts = pick_binned_cnts.add(drop_binned_cnts, fill_value=0)
# Expand to the full grid; grid bins with no stops at all become 0.
stop_binned_cnts_ri = stop_binned_cnts.reindex(lat_lng_mi).fillna(0)
In [19]:
#plot stops in all bins
plt.figure(figsize=(10, 10))
plt.rcParams.update({'font.size': 14})
# Per-bin stop counts laid out as a (lat bin, lng bin) matrix.
weighted_coord_bins_cnts = np.reshape(stop_binned_cnts_ri.values, (num_lat_bins, num_lng_bins))
extent = [lng_bins[0], lng_bins[-1], lat_bins[0], lat_bins[-1]]
# Colour is log10(count + 0.1); the 0.1 offset keeps empty bins finite.
plt.imshow(np.log10(weighted_coord_bins_cnts + 0.1), interpolation='none', origin='lower',
extent=extent, vmin=0, vmax=int(0.8*np.log10(2*num_trips))+1, cmap="RdYlBu")
plt.xlabel('longitude (degrees)', size=14)
plt.ylabel('latitude (degrees)', size=14)
# NOTE(review): the colour limit above is derived from 2*num_trips, the tick range below from
# num_trips, and the seven tick labels are hard-coded -- confirm they agree for this data set.
cbar = plt.colorbar(shrink=0.7, ticks=range(int(0.8*np.log10(num_trips))+2), pad=0.025)
cbar.set_label('number of fares', rotation=270, labelpad=13, size=14)
cbar.set_ticklabels([r'$\leq1$', r'$10^1$', r'$10^2$', r'$10^3$', r'$10^4$', r'$10^5$', r'$\geq10^6$'])
cbar.ax.tick_params(labelsize=16)
plt.savefig('binned_fares.png')
plt.show()
In [20]:
#define zones = the bins with the most stops (at most num_zones - 1 of them, see below)
#lump all other stops in other bins (that were previously nonzero) into zone_other
num_zones = 250
#zone_cutoff_cnt is the num_zones-th largest per-bin count
zone_cutoff_cnt = np.sort(stop_binned_cnts_ri)[-num_zones]
#only bins strictly above the cutoff keep their own count
zone_cnts = stop_binned_cnts_ri[stop_binned_cnts_ri > zone_cutoff_cnt]
#bins with fewer than 10 stops are treated as empty
zone_zero_cnts = stop_binned_cnts_ri[stop_binned_cnts_ri < 10.]
#total stops in all bins at or below the cutoff
zone_other_cnts = sum(stop_binned_cnts_ri[stop_binned_cnts_ri <= zone_cutoff_cnt])
#non-zone bins are filled with the pooled "other" count spread evenly over the bins that are neither zones nor near-empty
zone_cnts_ri = zone_cnts.reindex(lat_lng_mi).fillna(float(zone_other_cnts) / (num_lat_bins*num_lng_bins - num_zones - len(zone_zero_cnts) + 1))
zone_cnts_ri[stop_binned_cnts_ri < 10.] = 0.
In [21]:
#plot stops in all zones
fig = plt.figure(figsize=(10, 10))
weighted_coord_bins_cnts = np.reshape(zone_cnts_ri.values, (num_lat_bins, num_lng_bins))
extent = [lng_bins[0], lng_bins[-1], lat_bins[0], lat_bins[-1]]
plt.imshow(np.log10(weighted_coord_bins_cnts + 0.1), interpolation='none', origin='lower',
extent=extent, vmin=int(np.log10(zone_cutoff_cnt)), vmax=int(0.8*np.log10(2*num_trips))+1, cmap="RdYlBu")
plt.xlabel('longitude (degrees)', size=14)
plt.ylabel('latitude (degrees)', size=14)
cbar = plt.colorbar(shrink=0.7, ticks=range(int(np.log10(zone_cutoff_cnt)), int(0.8*np.log10(num_trips))+2), pad=0.025)
cbar.set_label('number of fares', rotation=270, labelpad=13, size=14)
cbar.set_ticklabels([r'$\leq10^3$', r'$10^4$', r'$10^5$', r'$\geq10^6$'])
cbar.ax.tick_params(labelsize=16)
#plt.savefig('figures/zoned_fares.png')
ax = fig.gca()
ax.grid(True, which='minor')
#plt.grid(True, which='minor')
plt.show()
In [22]:
#build the numpy matrix of zone ids (a later cell writes it to ../data/zone.txt)
#ids run from 1 to num_lat_bins*num_lng_bins, increasing along the lng axis first, then the lat axis
#bins with a count below zone_cutoff_cnt end up with id 0 (the "other" zone)
#empty bins are marked -1 on the way, but the final "> 0" mask maps them to 0 as well
zone_matrix = np.reshape(stop_binned_cnts_ri.values, (num_lat_bins, num_lng_bins))
#NOTE(review): np.reshape may return a view of stop_binned_cnts_ri's data, in which case the
#in-place writes below also modify that Series -- confirm nothing later relies on it
zone_matrix[zone_matrix == 0] = -1.
zone_matrix[(zone_matrix < zone_cutoff_cnt) * (zone_matrix > 0)] = 0.
zone_matrix = (zone_matrix > 0) * np.reshape(range(1, num_lat_bins*num_lng_bins+1), (num_lat_bins, num_lng_bins))
In [23]:
plt.figure(figsize=(10, 10))
plt.imshow(pd.DataFrame(zone_matrix[::-1]))
plt.show()
In [24]:
np.savetxt('../data/zone.txt', zone_matrix, delimiter=',')
In [25]:
lat_lng_mi = pd.MultiIndex.from_product([range(0, num_lat_bins), range(0, num_lng_bins)], names=['lat', 'lng'])
zone_dict = np.append([0],zone_matrix.flatten())
def zone_from_lng_lat(lng, lat, lng_bins, lat_bins, zone_lookup=None):
    """Map longitude/latitude coordinates to zone ids.

    Parameters
    ----------
    lng, lat : scalar or array-like of float
        Coordinates to look up (same shape).
    lng_bins, lat_bins : sequence of float
        Monotonically increasing bin edges along each axis.
    zone_lookup : array-like of int, optional
        Table mapping a 1-based, row-major bin number
        (``lng_bin + (lat_bin - 1) * num_lng_bins``) to a zone id; entry 0 is
        the id returned for points outside the grid. Defaults to the
        module-level ``zone_dict``.

    Returns
    -------
    numpy.ndarray
        Zone id for every input coordinate.
    """
    if zone_lookup is None:
        zone_lookup = zone_dict
    lng = np.asarray(lng)
    lat = np.asarray(lat)
    num_lng_bins = len(lng_bins) - 1
    # Bins are half-open [edge_i, edge_i+1), matching np.digitize. A point
    # lying exactly on the last edge is therefore outside the grid; accepting
    # it would produce a bin number one past the row (wrong zone) or past the
    # end of the lookup table (IndexError).
    in_range = ((lng_bins[0] <= lng) & (lng < lng_bins[-1]) &
                (lat_bins[0] <= lat) & (lat < lat_bins[-1]))
    bin_number = (np.digitize(lng, lng_bins) +
                  (np.digitize(lat, lat_bins) - 1) * num_lng_bins)
    # Out-of-range points are redirected to entry 0 before indexing.
    return np.asarray(zone_lookup)[np.where(in_range, bin_number, 0)]
In [26]:
#convert lng/lat to zone
taxi_pd['pickup_zone'] = zone_from_lng_lat(taxi_pd.pickup_longitude, taxi_pd.pickup_latitude, lng_bins, lat_bins)
taxi_pd['dropoff_zone'] = zone_from_lng_lat(taxi_pd.dropoff_longitude, taxi_pd.dropoff_latitude, lng_bins, lat_bins)
In [27]:
taxi_pd.head()
Out[27]:
In [62]:
#taxi_pd.to_csv('../data/taxi_short_zoned_2.csv')
In [ ]:
In [28]:
# Size of one grid cell, first in degrees and then in miles.
delta_lat = lat_bins[1] - lat_bins[0]
delta_lng = lng_bins[1] - lng_bins[0]
radius_earth_miles = 3963.17
# Arc length = angle (radians) * radius.
delta_lat_miles = np.deg2rad(delta_lat) * radius_earth_miles
# East-west arcs are shorter away from the equator: a step in longitude spans
# radius * cos(latitude) * angle. The window is small, so its centre latitude
# is used for every cell.
delta_lng_miles = np.deg2rad(delta_lng) * radius_earth_miles * np.cos(np.deg2rad(center_lat))
In [29]:
def zone_dist_sq(zone1, zone2):
    """Squared mean recorded trip distance from zone1 to zone2.

    Uses the trips in the module-level ``taxi_pd`` that were picked up in
    ``zone1`` and dropped off in ``zone2``.
    """
    picked_up_here = taxi_pd.pickup_zone == zone1
    dropped_off_there = taxi_pd.dropoff_zone == zone2
    mean_distance = taxi_pd[picked_up_here & dropped_off_there].trip_distance.mean()
    return mean_distance**2
def zone_euclid_dist_sq(zone1, zone2, n_lng_bins=None, lat_step_miles=None, lng_step_miles=None):
    """Squared distance, in square miles, between two zones.

    Zone ids are 1-based and row-major over the (lat bin, lng bin) grid;
    id 0 is the catch-all "other" zone.

    Parameters
    ----------
    zone1, zone2 : int
        Zone ids.
    n_lng_bins : int, optional
        Number of longitude bins per grid row (default: module-level ``num_lng_bins``).
    lat_step_miles, lng_step_miles : float, optional
        Height and width of one grid cell in miles (defaults: module-level
        ``delta_lat_miles`` and ``delta_lng_miles``).

    Returns
    -------
    float
        For two real zones, the squared straight-line distance between their
        grid cells, rounded to 2 decimals. If either zone is 0, the
        data-derived value from ``zone_dist_sq``.
    """
    if (zone1 == 0) or (zone2 == 0):
        #use data for trips to/from "other" zone
        return zone_dist_sq(zone1, zone2)
    if n_lng_bins is None:
        n_lng_bins = num_lng_bins
    if lat_step_miles is None:
        lat_step_miles = delta_lat_miles
    if lng_step_miles is None:
        lng_step_miles = delta_lng_miles
    # Ids start at 1, so shift to 0-based before splitting into (row, column).
    # Without the shift the last cell of every row (id a multiple of
    # n_lng_bins) is decoded as column 0 of a later row.
    zone1_lat_bin, zone1_lng_bin = divmod(zone1 - 1, n_lng_bins)
    zone2_lat_bin, zone2_lng_bin = divmod(zone2 - 1, n_lng_bins)
    lat_miles = (zone2_lat_bin - zone1_lat_bin) * lat_step_miles
    # The east-west leg is scaled by the longitude step, not the latitude step.
    lng_miles = (zone2_lng_bin - zone1_lng_bin) * lng_step_miles
    return np.around(lat_miles**2 + lng_miles**2, decimals=2)
In [30]:
# Distinct zone ids present in the zone matrix (note: this rebinds num_zones).
zones = np.unique(zone_matrix)
num_zones = len(zones)
print('Length of zones %d' % num_zones)
# All ordered (pickup zone, dropoff zone) pairs, pickup zone varying slowest.
pick_zone_i = [zone1 for zone1 in zones for zone2 in zones]
drop_zone_i = [zone2 for zone1 in zones for zone2 in zones]
# Compute every pairwise distance exactly once (pairs involving the "other"
# zone each scan the trip table, so a second full pass is expensive).
euclid_dist_array = np.around(np.sqrt(np.array(
    [zone_euclid_dist_sq(zone1, zone2) for zone1, zone2 in zip(pick_zone_i, drop_zone_i)])), 2)
# Same values as a square matrix with rows indexed by dropoff zone and columns
# by pickup zone, i.e. the transpose of the flat pickup-major layout.
euclid_dist_matrix = euclid_dist_array.reshape(num_zones, num_zones).T.copy()
In [31]:
euclid_dist_matrix_pd = pd.DataFrame(np.array([pick_zone_i, drop_zone_i, euclid_dist_array]).T, columns=['pickup_zone',
'dropoff_zone',
'euclid_distance'])
In [32]:
euclid_dist_matrix_pd.head()
Out[32]:
In [33]:
euclid_dist_matrix_pd.to_pickle('../data/euclid_distance_251x251_pd_new.pkl')
In [ ]:
In [34]:
# drop_duplicates returns a new frame; keep the result, otherwise the call has no effect.
taxi_pd = taxi_pd.drop_duplicates(keep='first')
# Order each driver's trips chronologically so that "previous row" means "previous fare".
taxi_pd = taxi_pd.sort_values(['hack_license','pickup_datetime'])
taxi_pd.head()
Out[34]:
In [35]:
taxi_pd.pickup_datetime = pd.to_datetime(taxi_pd.pickup_datetime)
taxi_pd.dropoff_datetime = pd.to_datetime(taxi_pd.dropoff_datetime)
#find previous dropoff zone and time elapsed
# shift() inside each driver's group pairs a fare with that driver's previous
# fare; the first fare of every driver gets a missing value.
trips_by_driver = taxi_pd.groupby('hack_license')
taxi_pd['prev_dropoff_zone'] = trips_by_driver.dropoff_zone.shift().values
prev_dropoff_time = trips_by_driver.dropoff_datetime.shift().values
# total_seconds() yields plain float seconds, which the later numeric
# comparison against 6*3600 needs; casting with astype('timedelta64[s]') does
# not give a numeric column on every pandas version.
taxi_pd['time_since_prev_fare'] = (taxi_pd.pickup_datetime - prev_dropoff_time).dt.total_seconds()
In [36]:
#if time since previous fare is greater than 6 hours, assume new shift and drop
#NOTE(review): rows with a missing time_since_prev_fare (presumably each driver's first fare,
#which has no previous dropoff) fail the comparison too and are dropped -- confirm intended
taxi_pd = taxi_pd[taxi_pd.time_since_prev_fare <= 6*3600]
In [37]:
# Calendar features of the pickup time, taken through the vectorised .dt
# accessor instead of a per-row Python lambda.
taxi_pd['day_of_week'] = taxi_pd.pickup_datetime.dt.dayofweek
taxi_pd['pickup_hour'] = taxi_pd.pickup_datetime.dt.hour
In [39]:
taxi_pd = taxi_pd.groupby('hack_license').filter(lambda x: len(x) > 300)
In [40]:
taxi_pd.columns
Out[40]:
In [41]:
taxi_pd.drop(['medallion', 'payment_type', ], axis=1, inplace=True)
In [42]:
taxi_pd[['dropoff_zone', 'prev_dropoff_zone']].iloc[:5]
Out[42]:
In [43]:
taxi_pd.prev_dropoff_zone = taxi_pd.prev_dropoff_zone.astype(int)
In [46]:
taxi_pd.to_csv('../data/taxi_beforeMerge_2.csv')
In [ ]:
In [4]:
taxi_pd = pd.read_csv('../data/taxi_beforeMerge_2.csv')
taxi_pd.head()
Out[4]:
In [5]:
#del euclid_dist_matrix_pd
euclid_dist_matrix_pd = pd.read_pickle('../data/euclid_distance_251x251_pd_new.pkl')
euclid_dist_matrix_pd.head()
Out[5]:
In [6]:
print euclid_dist_matrix_pd.columns
In [7]:
euclid_dist_matrix_pd.pickup_zone = euclid_dist_matrix_pd.pickup_zone.astype(int)
euclid_dist_matrix_pd.dropoff_zone = euclid_dist_matrix_pd.dropoff_zone.astype(int)
euclid_dist_matrix_pd.head()
Out[7]:
In [8]:
# Key for looking up the distance driven between fares: the empty leg starts
# at the previous drop-off zone and ends at this fare's pickup zone. The lookup
# table is keyed (start zone, end zone), so the tuple uses that order.
# list(...) materialises the pairs so the column assignment does not depend on
# zip() returning a list.
taxi_pd['pickDrop'] = list(zip(taxi_pd.prev_dropoff_zone, taxi_pd.pickup_zone))
In [9]:
# Tuple key (pickup_zone, dropoff_zone) for every row of the distance table.
zone_pairs = list(zip(euclid_dist_matrix_pd.pickup_zone,
                      euclid_dist_matrix_pd.dropoff_zone))
euclid_dist_matrix_pd['pickDrop'] = zone_pairs
euclid_dist_matrix_pd.head()
Out[9]:
In [15]:
print taxi_pd.columns
print euclid_dist_matrix_pd.columns
print euclid_dist_matrix_pd.euclid_distance.values[:5]
print euclid_dist_matrix_pd.pickDrop.values[:5]
In [21]:
euclid_dist_matrix_pd.loc[:5,['euclid_distance', 'pickDrop']]
Out[21]:
In [22]:
taxi_pd = pd.merge(taxi_pd, euclid_dist_matrix_pd.loc[:,['euclid_distance', 'pickDrop']], how='left',
left_on = 'pickDrop', right_on = 'pickDrop')
taxi_pd.head()
Out[22]:
In [23]:
taxi_pd.columns
Out[23]:
In [ ]:
In [ ]:
taxi_pd.drop(['Unnamed: 0','pickup_datetime','dropoff_datetime',
'pickup_longitude', 'pickup_latitude',
'dropoff_longitude', 'dropoff_latitude', 'prev_dropoff_zone'],axis=1,inplace=True)
In [27]:
taxi_pd.head()
Out[27]:
In [26]:
taxi_pd.to_csv('../data/taxi_short_zoned_minuted_2.csv')
In [ ]:
In [28]:
#calculate some performance stats for each driver
# Aggregate once per driver and take the licence from the aggregate's own
# index, so every total is paired with the right driver by construction
# (groupby orders by key; unique() orders by first appearance).
summed_columns = ['trip_time_in_secs', 'time_since_prev_fare',
                  'trip_distance', 'euclid_distance',
                  'fare_amount', 'tip_amount', 'total_amount']
driver_totals = taxi_pd.groupby('hack_license')[summed_columns].sum()

perf_pd = pd.DataFrame([])
perf_pd['hack_license'] = driver_totals.index.values
# Time and distance on shift = with a passenger + between fares.
perf_pd['total_time_on_shift'] = (driver_totals.trip_time_in_secs +
                                  driver_totals.time_since_prev_fare).values
perf_pd['total_dist_on_shift'] = (driver_totals.trip_distance +
                                  driver_totals.euclid_distance).values
# Times are in seconds; divide by 3600 for per-hour rates.
perf_pd['avg_speed_mph'] = perf_pd.total_dist_on_shift / (perf_pd.total_time_on_shift / 3600.)
perf_pd['total_fare'] = driver_totals.fare_amount.values
perf_pd['total_tip'] = driver_totals.tip_amount.values
perf_pd['total_total'] = driver_totals.total_amount.values
perf_pd['fare_per_hour'] = perf_pd.total_fare / (perf_pd.total_time_on_shift / 3600.)
perf_pd['tip_per_hour'] = perf_pd.total_tip / (perf_pd.total_time_on_shift / 3600.)
perf_pd['dollars_per_hour'] = perf_pd.fare_per_hour + perf_pd.tip_per_hour
perf_pd['dollars_per_mile'] = (perf_pd.total_fare + perf_pd.total_tip) / perf_pd.total_dist_on_shift
# Fuel cost model: dollars per mile driven, charged at the average speed.
miles_per_gallon = 15.
dollars_per_gallon = 2.5
dollars_per_mile = dollars_per_gallon / miles_per_gallon
perf_pd['net_dollars_per_hour'] = perf_pd.dollars_per_hour - dollars_per_mile*perf_pd.avg_speed_mph
In [29]:
# 5th and 95th percentile of net hourly earnings across drivers.
net_dollars_per_hour_bot = np.percentile(perf_pd.net_dollars_per_hour, 5)
net_dollars_per_hour_top = np.percentile(perf_pd.net_dollars_per_hour, 95)


def performance(net_dollars_per_hour):
    """Label net hourly earnings: 1 at or above the top cutoff, -1 at or below the bottom cutoff, else 0."""
    in_top = net_dollars_per_hour >= net_dollars_per_hour_top
    in_bottom = net_dollars_per_hour <= net_dollars_per_hour_bot
    return 1 * in_top - 1 * in_bottom


perf_pd['performance'] = perf_pd.net_dollars_per_hour.apply(performance)
In [33]:
# Histogram of net hourly earnings in three bands (half-dollar bins):
# bottom 5% ("bad hacks"), middle, and top 5% ("good hacks").
plt.figure(figsize=(1.5*8, 1.5*4.95))
plt.rcParams.update({'font.size': 24})
# Band edges rounded to whole dollars. np.around returns floats, while the
# `bins` argument must be an integer count, hence the int(...) casts below.
bot_edge = np.around(net_dollars_per_hour_bot)
top_edge = np.around(net_dollars_per_hour_top)
# NOTE(review): the outer limits 5 and 50 are hard-coded; if a cutoff falls
# outside them the corresponding range/bin count becomes invalid.
plt.hist(perf_pd.net_dollars_per_hour, range=(bot_edge, top_edge),
         bins=int(2*(top_edge - bot_edge)), alpha=0.4, color='g')
plt.hist(perf_pd.net_dollars_per_hour, range=(top_edge, 50),
         bins=int(2*(50 - top_edge)), alpha=0.7, label='good hacks', color='r')
plt.hist(perf_pd.net_dollars_per_hour, range=(5, bot_edge),
         bins=int(2*(bot_edge - 5)), alpha=0.7, label='bad hacks', color='b')
plt.legend(frameon=False)
plt.xlabel('net dollars per hour')
plt.ylabel('number of hacks')
plt.show()
In [34]:
perf_pd.head()
Out[34]:
In [35]:
#two-column lookup table: driver licence -> performance label
hack_perf_pd = perf_pd[['hack_license', 'performance']].copy()
hack_perf_pd.head()
Out[35]:
In [37]:
# Day bin: 1 for day_of_week 0-3, 2 for day_of_week 4-6.
late_week = taxi_pd.day_of_week.values >= 4
taxi_pd['day_of_week_binned'] = 1 + late_week
# Hour bin: two-hour blocks, 0 .. 11.
taxi_pd['pickup_hour_binned'] = (taxi_pd.pickup_hour.values // 2).astype(int)
# Composite keys: (day bin, hour bin), then (pickup zone, (day bin, hour bin)).
day_hour_pairs = list(zip(taxi_pd.day_of_week_binned, taxi_pd.pickup_hour_binned))
taxi_pd['pickup_day_hour_binned'] = day_hour_pairs
taxi_pd['pickup_zone_time'] = list(zip(taxi_pd.pickup_zone, taxi_pd.pickup_day_hour_binned))
In [39]:
taxi_pd.head()
Out[39]:
In [38]:
pickup_zonetimes_pd = pd.DataFrame([])
pickup_zonetimes_pd['pickup_zone_time'] = np.sort(taxi_pd.pickup_zone_time.unique())
pickup_zonetimes_pd['pickup_zone_time_renamed'] = range(len(pickup_zonetimes_pd))
pickup_zonetimes_pd.head()
Out[38]:
In [ ]:
taxi_pd = pd.merge(taxi_pd, pickup_zonetimes_pd, how='left',
left_on='pickup_zone_time', right_on = 'pickup_zone_time')
#taxi_pd = taxi_pd.drop('pickup_zone_time', axis=1)
In [41]:
taxi_pd.head()
Out[41]:
In [ ]:
In [42]:
taxi_pd = pd.merge(taxi_pd, hack_perf_pd, how='left',
left_on='hack_license', right_on = 'hack_license')
taxi_pd.head()
Out[42]:
In [ ]:
In [43]:
taxi_pd = taxi_pd[taxi_pd.performance != 0]
In [47]:
num_hacks = len(taxi_pd.hack_license.unique())
num_zonetimes = len(pickup_zonetimes_pd)
print 'Number of drivers', num_hacks
print 'Number of location and time zones', num_zonetimes
print num_hacks*num_zonetimes
In [48]:
hack_zonetime_cnt_s = taxi_pd.groupby('hack_license')\
['pickup_zone_time_renamed'].apply(lambda x: np.bincount( x.tolist(),minlength=num_zonetimes) )
hack_zonetime_cnt_pd = pd.DataFrame([])
hack_zonetime_cnt_pd['hack_license'] = hack_zonetime_cnt_s.index.values
hack_zonetime_cnt_pd['zonetime_cnt'] = hack_zonetime_cnt_s.values
In [49]:
hack_zonetime_cnt_pd.head()
Out[49]:
In [50]:
hack_zonetime_cnt_perf_pd = pd.merge(hack_zonetime_cnt_pd, hack_perf_pd, how='left',
left_on='hack_license', right_on = 'hack_license')
hack_zonetime_cnt_perf_pd.head()
Out[50]:
In [54]:
data = np.hstack((hack_zonetime_cnt_perf_pd.zonetime_cnt.values)).reshape(num_hacks, num_zonetimes)
In [55]:
target = hack_zonetime_cnt_perf_pd.performance.values
In [58]:
print 'First few predictors for 5 samples are :\n', data[:5,:3]
print 'Target values for the samples are:', target[:5]
In [64]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
In [65]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.5, random_state=0)
In [67]:
X_train.shape, y_train.shape
Out[67]:
In [68]:
X_test.shape, y_test.shape
Out[68]:
In [69]:
# svm1 = LinearSVC(C=0.06, penalty="l1", dual=False)
svm1 = LinearSVC()
svm_fit = svm1.fit(X_train, y_train)
In [70]:
print 'Mean accuracy of the given test data :', svm_fit.score(X_test, y_test)