In [1]:
%pylab inline
import requests, zipfile
import StringIO
import datetime
import pandas as pd
import numpy as np
from workflow.data import *
import pdb
import seaborn as sns
from matplotlib import pyplot


Populating the interactive namespace from numpy and matplotlib

Most of the routines tested here live in the workflow/data.py and workflow/features.py scripts. Refer to those files for more context on how they work.
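
For orientation, here is a minimal sketch of what trip_data(year, month) is assumed to do: download the month's zipped CSV from the public Citi Bike S3 bucket, read it, and rename the columns to the snake_case names used throughout this notebook. The URL pattern, the raw header names, and the column order below are assumptions based on the commented-out code and the outputs shown here, not the actual workflow/data.py implementation.

def trip_data_sketch(year, month):
    # assumption: monthly files follow the public tripdata naming scheme
    url = 'https://s3.amazonaws.com/tripdata/{}{:02}-citibike-tripdata.zip'.format(year, month)
    archive = zipfile.ZipFile(StringIO.StringIO(requests.get(url).content))
    csv_name = '{}{:02}-citibike-tripdata.csv'.format(year, month)
    raw = pd.read_csv(archive.open(csv_name),
                      parse_dates=['Start Time', 'Stop Time'])
    # assumption: normalize the raw headers to the names seen in the outputs below
    raw.columns = ['duration', 'start_time', 'stop_time',
                   'start_id', 'start_name', 'start_lat', 'start_long',
                   'stop_id', 'stop_name', 'stop_lat', 'stop_long',
                   'bike_id', 'user_type', 'birth_year', 'gender']
    return raw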


In [2]:
# find csv file for tripdata
year = 2015
month = 3
#csvPath = '{}{:02}-citibike-tripdata.csv'.format(year, month)
#df = pd.read_csv(basepath + csvPath, parse_dates = ['Start Time', 'Stop Time'])
df = trip_data(year, month)
#df['trip_id'] = df.index.values
df.head()


Out[2]:
duration start_time stop_time start_id start_name start_lat start_long stop_id stop_name stop_lat stop_long bike_id user_type birth_year gender
0 669 2015-03-01 00:00:00 2015-03-01 00:11:00 164 E 47 St & 2 Ave 40.753231 -73.970325 477 W 41 St & 8 Ave 40.756405 -73.990026 21409 Subscriber 1987.0 1
1 750 2015-03-01 00:01:00 2015-03-01 00:14:00 258 DeKalb Ave & Vanderbilt Ave 40.689407 -73.968855 436 Hancock St & Bedford Ave 40.682166 -73.953990 19397 Subscriber 1968.0 1
2 663 2015-03-01 00:01:00 2015-03-01 00:12:00 497 E 17 St & Broadway 40.737050 -73.990093 477 W 41 St & 8 Ave 40.756405 -73.990026 20998 Customer NaN 0
3 480 2015-03-01 00:02:00 2015-03-01 00:10:00 470 W 20 St & 8 Ave 40.743453 -74.000040 491 E 24 St & Park Ave S 40.740964 -73.986022 21565 Subscriber 1983.0 1
4 1258 2015-03-01 00:02:00 2015-03-01 00:23:00 345 W 13 St & 6 Ave 40.736494 -73.997044 473 Rivington St & Chrystie St 40.721101 -73.991925 14693 Subscriber 1970.0 1

In [3]:
rebals = rebal_data(year,month)
rebals.head()


Out[3]:
duration start_time stop_time start_id start_name start_lat start_long stop_id stop_name stop_lat stop_long bike_id user_type birth_year gender trip_id
0 345 2015-03-17 15:42:00 2015-03-16 22:35:00 402 Broadway & E 22 St 40.740343 -73.989551 487.0 E 20 St & FDR Drive 40.733143 -73.975739 14529 Subscriber 1979.0 1 161757
1 820 2015-03-19 21:31:00 2015-03-19 20:12:00 523 W 38 St & 8 Ave 40.754666 -73.991382 480.0 W 53 St & 10 Ave 40.766697 -73.990617 14529 Subscriber 1982.0 1 197300
2 520 2015-03-30 20:05:00 2015-03-30 18:20:00 491 E 24 St & Park Ave S 40.740964 -73.986022 229.0 Great Jones St 40.727434 -73.993790 14529 Subscriber 1957.0 1 325665
3 624 2015-03-31 07:57:00 2015-03-30 21:05:00 501 FDR Drive & E 35 St 40.744219 -73.971212 487.0 E 20 St & FDR Drive 40.733143 -73.975739 14529 Subscriber 1960.0 1 329600
4 1464 2015-03-07 18:02:00 2015-03-07 02:12:00 410 Suffolk St & Stanton St 40.720664 -73.985180 438.0 St Marks Pl & 1 Ave 40.727791 -73.985649 14530 Subscriber 1966.0 1 35360

Interesting stats


In [4]:
print "Total trips for {}-{:02}: {}".format(year, month, df.shape[0])


Total trips for 2015-03: 341826

In [5]:
sub_trips = df[df.user_type == 'Subscriber'].shape[0] / float(df.shape[0])
print "% subscribers: {:.3f}".format(100*sub_trips)


% subscribers: 96.240

In [6]:
male_trips = df[df.gender == 1].shape[0] / float(df.shape[0])
female_trips = df[df.gender == 2].shape[0] / float(df.shape[0])
print "% Male trips: {:.3f}".format(100*male_trips)
print "% Female trips: {:.3f}".format(100*female_trips)
print "% other trips: {:.3f}".format(100*(1 - male_trips - female_trips))


% Male trips: 77.773
% Female trips: 18.443
% other trips: 3.784

In [8]:
# top 10 starting locations
top_starts = df.groupby(['start_name', 'start_lat', 'start_long', 'start_id']).count().duration.sort_values(ascending = False)
top_starts = top_starts.reset_index()
top_starts['number_trips'] = top_starts.duration
top_starts

top_starts.sort_values('number_trips').tail(10).plot(kind = 'barh', 
                         x = 'start_name', y = 'number_trips',figsize=(7,7))
plt.tick_params(labelsize=18)



In [9]:
# top 10 stopping locations
top_stops = df.groupby(['stop_name', 'stop_lat', 'stop_long', 'stop_id']).count().duration.sort_values(ascending = False)
top_stops = top_stops.reset_index()
top_stops['number_trips'] = top_stops.duration
top_stops

top_stops.sort_values('number_trips').tail(10).plot(kind = 'barh', 
                         x = 'stop_name', y = 'number_trips', figsize=(7,7))
plt.tick_params(labelsize=18)



In [10]:
def plot_top_stations(df, x_values, y_values):
    pl = df.plot(kind = 'scatter', x = x_values, y = y_values, 
                alpha = 0.5, figsize = (5,8))
    df.head(10).reset_index().plot(kind = 'scatter', x = x_values,
                 color = 'r', y = y_values, ax = pl)

plot_top_stations(top_starts, 'start_long', 'start_lat')
plot_top_stations(top_stops, 'stop_long', 'stop_lat')


Rebalancing

Rebalancing trips are inferred from consecutive rides on the same bike: when a bike's next ride starts at a different station than where its previous ride ended, the bike must have been moved in between. For these inferred events the previous ride's stop_time marks the start of the rebalancing window and the next ride's start_time marks its end.


In [ ]:
oneday = df[(df.start_time.dt.day == 1) & (df.start_time.dt.month == 3)]

In [ ]:
bike_list = df.bike_id.unique()

# collect rebal events for the first 100 bikes
df = df.sort_values(['bike_id', 'start_time'])
#cols = ['start_id','stop_id', 'start_time', 'stop_time', 'bike_id']
df1 = df[df.bike_id.isin(bike_list[:100])]

# track rebalancing events
# (REMEMBER THAT REBAL EVENTS USE STOP_TIME AS THE STARTING TIME)
# shifting the stop_* columns down one row pairs each trip's start with the
# same bike's previous drop-off; mismatched station ids mean the bike was moved

shift_cols = ['stop_id','stop_time', 'stop_long', 'stop_lat', 'stop_name']

def shift_cond(bike_df):
    bike_df[shift_cols] = bike_df[shift_cols].shift(1)
    return bike_df[bike_df.start_id != bike_df.stop_id]
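
The cell above defines shift_cond but never applies it; the rebal_df used below presumably comes from applying it per bike. A minimal sketch of that step (the groupby-apply and the NaT handling are assumptions, not necessarily how workflow/data.py builds its rebalancing table):

rebal_df = df1.groupby('bike_id').apply(shift_cond)
# drop each bike's first row, whose shifted stop_* values are NaT,
# and flatten the index added by the groupby-apply
rebal_df = rebal_df.dropna(subset=['stop_time']).reset_index(drop=True)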

In [ ]:
# midpoint and half-width of each rebalancing window: rough estimates of when
# the bike was moved and of the uncertainty in that estimate
rebal_times = ((rebal_df.start_time - rebal_df.stop_time) / 2) + rebal_df.stop_time
rebal_spreads = (rebal_df.start_time - rebal_df.stop_time) / 2

Calculate fluxes

A flux here is the hourly count of bikes leaving a station (bikes_out) or arriving at it (bikes_in), computed separately for regular trips and for rebalancing moves.


In [ ]:
#oneday = df[(df.start_time.dt.day == 1) & (df.start_time.dt.month) == 1]

In [ ]:
df['start_hour'] = df.start_time.dt.hour
df['stop_hour'] = df.stop_time.dt.hour
#df[['start_hour', 'stop_hour']] = df[['start_time', 'stop_time']].apply(lambda x: x.dt.hour)
df[['start_date', 'stop_date']] = df[['start_time', 'stop_time']].apply(lambda x: x.dt.floor('d'))
df.tail()

In [ ]:
def split_off_times(df):
    df['start_hour'] = df.start_time.dt.hour
    df['stop_hour'] = df.stop_time.dt.hour
    df[['start_date', 'stop_date']] = df[['start_time', 'stop_time']].apply(lambda x: x.dt.floor('d'))
    return df

def create_fluxes(df, id_key, date_key, hour_key, fl_key):
    # id_key is start_id/stop_id, date_key is start_date/stop_date, and
    # hour_key is start_hour/stop_hour; the start columns are paired with an
    # outgoing flux (fl_key = 'bikes_out') and the stop columns with an
    # incoming flux (fl_key = 'bikes_in')
    use_cols = [id_key, date_key, hour_key, 'duration']

    flux = df.groupby([id_key, date_key, hour_key]).count()
    flux = flux.reset_index()[use_cols]
    col_dict = {'duration': fl_key,
                date_key: 'date', hour_key: 'hour',
                id_key: 'id'}
    return flux.rename(columns = col_dict)

def transform_times(df):
    # calculate approximate pickup and drop-off times for rebalancing trips
    t_start = df.start_time
    t_end = df.stop_time
    time_diff = t_start - t_end

    r_start = t_end + time_diff/3.
    r_end = t_end + time_diff*(2/3.)
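    # e.g. if a bike was dropped off at 22:00 and next checked out at 01:00,
    # the estimated rebalancing pickup is 23:00 and the drop-off is 00:00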

    df['start_time'] = r_start
    df['stop_time'] = r_end
    return df.rename(columns = {'start_id':'stop_id', 'stop_id':'start_id'})


def merge_fluxes(df1, df2):
    # outer-join fluxes (or any other dataset) on the keys 'id', 'date', and 'hour'
    return pd.merge(df1, df2, how='outer',
                              on = ['id', 'date', 'hour']).fillna(0)
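
To sanity-check the flux helpers before running them on the full month, here is a toy example with made-up trips (the station ids, hours, and durations below are purely illustrative):

toy = pd.DataFrame({'start_id': [1, 1, 2], 'stop_id': [2, 3, 1],
                    'start_date': pd.to_datetime(['2015-03-01'] * 3),
                    'stop_date': pd.to_datetime(['2015-03-01'] * 3),
                    'start_hour': [8, 8, 9], 'stop_hour': [8, 9, 9],
                    'duration': [600, 720, 540]})
toy_out = create_fluxes(toy, 'start_id', 'start_date', 'start_hour', 'bikes_out')
toy_in = create_fluxes(toy, 'stop_id', 'stop_date', 'stop_hour', 'bikes_in')
merge_fluxes(toy_out, toy_in)
# the result has one row per (id, date, hour) with bikes_out and bikes_in
# counts, with zeros filled where a station saw traffic in only one direction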

In [ ]:
# create fluxes from normal trips and merge
bikes_out = create_fluxes(df, 'start_id', 'start_date','start_hour', 'bikes_out')
bikes_in = create_fluxes(df, 'stop_id', 'stop_date','stop_hour', 'bikes_in')

merged = merge_fluxes(bikes_out, bikes_in)

In [ ]:
import holidays
us_holidays = holidays.UnitedStates()

merged['dayofweek'] = merged.date.dt.weekday
merged['month'] = merged.date.dt.month
merged['year'] = merged.date.dt.year
merged['is_weekday'] = merged.dayofweek.isin(range(5))
# use an explicit membership test so the lazily populated holiday calendar is
# actually filled for the dates checked (isin against the empty calendar
# would flag every date as False)
merged['is_holiday'] = merged.date.apply(lambda d: d in us_holidays)

In [ ]:
merged.head()

In [ ]:
# merge rebal fluxes
rebals = transform_times(rebals)
rebals = split_off_times(rebals)
#create fluxes from rebalanced trips
rflux_out = create_fluxes(rebals, 'start_id', 'start_date', 'start_hour', 'rbikes_out')
rflux_in = create_fluxes(rebals, 'stop_id', 'stop_date', 'stop_hour', 'rbikes_in')

rmerged = merge_fluxes(rflux_out, rflux_in)
rmerged['rebal_net_flux'] = rmerged.rbikes_in - rmerged.rbikes_out
rmerged = rmerged.drop(['rbikes_in', 'rbikes_out'], axis=1)

In [ ]:
rmerged.head()

In [ ]:
merged.head()

In [ ]:
feat = merge_fluxes(merged, rmerged)
feat.head()

In [ ]:
avail_db = station_data(year,month)

In [ ]:
avail_db.head()

In [ ]:
feat.head()

In [ ]:
'''
def create_fluxes(df, id_key, time_key, fl_key):
    # id_key and time_key are start_id/stop_id and start_time/end_time
    # for normal trips, start is associated with an fl_key = 'flux_out',
    # and stop is associated with fl_key = 'flux_in'

    flux = df[[id_key, time_key]].groupby([df[time_key].dt.to_period('H'), id_key]).count()
    flux = flux[time_key].fillna(0).unstack(level=0).stack()
    flux = flux.reset_index().set_index(time_key)
    flux.index.name = 'date'
    return flux.rename(columns = {0:fl_key, id_key:'id'}).reset_index()

def merge_fluxes(flux_out, flux_in):
    return pd.merge(flux_out, flux_in, how='outer',
                    on = ['date', 'id']).fillna(0)
'''

In [ ]:
'''
# create fluxes from normal trips
flux_out = create_fluxes(trips, 'start_id', 'start_time', 'flux_out')
flux_in = create_fluxes(trips, 'stop_id', 'stop_time', 'flux_in')

merged = merge_fluxes(flux_out, flux_in)

# create weekday column with values 1 if weekday and 0 if weekend
wkd_cond = lambda x: 1 if x else 0
merged['weekday'] = merged.date.dt.weekday.isin(range(5)).apply(wkd_cond)
merged['hour'] = merged.date.dt.hour
'''

In [ ]:
# mean trip statistics grouped by hour of day and rider age
ages = year - df.birth_year
df['ages'] = ages
df.groupby([df.start_time.dt.to_period('H'), ages]).mean()

In [ ]:
# apply the same transforms (defined above) to the manually detected rebalancing trips
rebal_df = transform_times(rebal_df)
rebal_df = split_off_times(rebal_df)