In [1]:
%pylab inline
import requests, zipfile
import StringIO
import datetime
import pandas as pd
import numpy as np
from workflow.data import *
import pdb
import seaborn as sns
from matplotlib import pyplot
Most of the routines exercised in this notebook are defined in the
workflow/data.py and workflow/features.py
modules. Please refer to those files for more context on how they work.
In [2]:
# Load one month of Citi Bike trip data via the workflow.data helper.
# (The commented-out lines show the older manual CSV-loading approach.)
year = 2015
month = 3
#csvPath = '{}{:02}-citibike-tripdata.csv'.format(year, month)
#df = pd.read_csv(basepath + csvPath, parse_dates = ['Start Time', 'Stop Time'])
# trip_data comes from workflow.data (star-imported above); presumably it
# fetches/parses the month's trip CSV -- see workflow/data.py to confirm
df = trip_data(year, month)
#df['trip_id'] = df.index.values
df.head()
Out[2]:
In [3]:
# Rebalancing events for the same month (workflow.data helper);
# see workflow/data.py for the source format
rebals = rebal_data(year,month)
rebals.head()
Out[3]:
In [4]:
print "Total trips for {}-{:02}: {}".format(year, month, df.shape[0])
In [5]:
# Fraction of trips taken by annual subscribers (vs. short-term customers);
# float() guards against Python 2 integer division.
sub_trips = df[df.user_type == 'Subscriber'].shape[0] / float(df.shape[0])
# Parenthesized print works under both Python 2 and Python 3.
print("% subscribers: {:.3f}".format(100 * sub_trips))
In [6]:
# Trip shares by gender code (1 and 2 per the comparisons below; the
# remainder is reported as "other"). float() guards against Python 2
# integer division. A stray unrelated comment ("add a column called
# trip_id") that sat on the female_trips line was removed.
male_trips = df[df.gender == 1].shape[0] / float(df.shape[0])
female_trips = df[df.gender == 2].shape[0] / float(df.shape[0])
# Parenthesized print works under both Python 2 and Python 3.
print("% Male trips: {:.3f}".format(100 * male_trips))
print("% Female trips: {:.3f}".format(100 * female_trips))
print("% other trips: {:.3f}".format(100 * (1 - male_trips - female_trips)))
In [8]:
# top 10 starting locations
# Count trips per start station: group on (name, lat, long, id); after
# .count(), the 'duration' column carries the per-group row count.
top_starts = df.groupby(['start_name', 'start_lat', 'start_long', 'start_id']).count().duration.sort_values(ascending = False)
top_starts = top_starts.reset_index()
top_starts['number_trips'] = top_starts.duration
# NOTE: a bare `top_starts` expression used to sit here; mid-cell
# expressions are never displayed, so it was dead code and is removed.
top_starts.sort_values('number_trips').tail(10).plot(kind = 'barh',
    x = 'start_name', y = 'number_trips', figsize=(7,7))
plt.tick_params(labelsize=18)
In [9]:
# top 10 stopping locations
# Count trips per stop station: group on (name, lat, long, id); after
# .count(), the 'duration' column carries the per-group row count.
top_stops = df.groupby(['stop_name', 'stop_lat', 'stop_long', 'stop_id']).count().duration.sort_values(ascending = False)
top_stops = top_stops.reset_index()
top_stops['number_trips'] = top_stops.duration
# NOTE: a bare `top_stops` expression used to sit here; mid-cell
# expressions are never displayed, so it was dead code and is removed.
top_stops.sort_values('number_trips').tail(10).plot(kind = 'barh',
    x = 'stop_name', y = 'number_trips', figsize=(7,7))
plt.tick_params(labelsize=18)
In [10]:
def plot_top_stations(df, x_values, y_values):
    """Scatter every station and overlay the ten busiest in red.

    Assumes ``df`` is sorted busiest-first (as built above), so its first
    ten rows are the top stations. x_values/y_values name the longitude
    and latitude columns to plot.
    """
    base_ax = df.plot(kind='scatter', x=x_values, y=y_values,
                      alpha=0.5, figsize=(5, 8))
    top_ten = df.head(10).reset_index()
    top_ten.plot(kind='scatter', x=x_values, y=y_values,
                 color='r', ax=base_ax)

plot_top_stations(top_starts, 'start_long', 'start_lat')
plot_top_stations(top_stops, 'stop_long', 'stop_lat')
In [ ]:
oneday = df[(df.start_time.dt.day == 1) & (df.start_time.dt.month == 3)]
In [ ]:
bike_list = df.bike_id.unique()
# collect rebal events for the first 100 bikes
# Order each bike's trips chronologically so consecutive rows describe
# consecutive trips of the same bike. NOTE: rebinds `df` to the sorted
# frame, so later cells see the new ordering.
df = df.sort_values(['bike_id', 'start_time'])
#cols = ['start_id','stop_id', 'start_time', 'stop_time', 'bike_id']
df1 = df[df.bike_id.isin(bike_list[:100])]
# track rebalancing events
#(REMEMBER THAT REBAL EVENTS USE STOP_TIME AS THE STARTING TIME)
# Columns describing where/when the previous trip ended.
shift_cols = ['stop_id','stop_time', 'stop_long', 'stop_lat', 'stop_name']

def shift_cond(bike_df):
    """Pair each trip's start with the previous trip's end for one bike.

    Shifts the stop_* columns down one row (mutating ``bike_df`` in
    place) so every row holds a trip's start next to where that bike was
    last dropped off, then keeps only the rows where the two station ids
    differ -- those gaps mark rebalancing moves. The first row compares
    against NaN and is therefore always kept.
    """
    previous_stops = bike_df[shift_cols].shift(1)
    bike_df[shift_cols] = previous_stops
    moved_elsewhere = bike_df.start_id != bike_df.stop_id
    return bike_df[moved_elsewhere]
In [ ]:
# NOTE(review): rebal_df is only created in a cell near the bottom of the
# notebook -- this cell relies on out-of-order execution; TODO reorder.
# Midpoint of the gap between a drop-off (stop_time) and the next pickup
# (start_time): best guess at when the rebalancing move happened.
rebal_times = ((rebal_df.start_time - rebal_df.stop_time) / 2) + rebal_df.stop_time
# Half-width of that gap: the uncertainty around the midpoint estimate.
rebal_spreads = (rebal_df.start_time - rebal_df.stop_time) / 2
In [ ]:
#oneday = df[(df.start_time.dt.day == 1) & (df.start_time.dt.month) == 1]
In [ ]:
# Add hour-of-day and midnight-floored date columns used later as
# grouping keys (same logic as the split_off_times helper defined below,
# applied inline to the trip frame).
df['start_hour'] = df.start_time.dt.hour
df['stop_hour'] = df.stop_time.dt.hour
#df[['start_hour', 'stop_hour']] = df[['start_time', 'stop_time']].apply(lambda x: x.dt.hour)
df[['start_date', 'stop_date']] = df[['start_time', 'stop_time']].apply(lambda x: x.dt.floor('d'))
df.tail()
In [ ]:
def split_off_times(df):
    """Add hour-of-day and calendar-date columns derived from the trip times.

    Mutates ``df`` in place, adding 'start_hour'/'stop_hour' (integer hour
    of day) and 'start_date'/'stop_date' (timestamps floored to midnight),
    and returns the same frame for chaining.
    """
    for key in ('start', 'stop'):
        df[key + '_hour'] = df[key + '_time'].dt.hour
    for key in ('start', 'stop'):
        df[key + '_date'] = df[key + '_time'].dt.floor('d')
    return df
def create_fluxes(df, id_key, date_key, hour_key, fl_key):
    """Count trips per (station, date, hour) and return a tidy flux table.

    id_key/date_key/hour_key name the start_* or stop_* columns to group
    on; fl_key names the resulting count column ('bikes_out' for starts,
    'bikes_in' for stops). The per-group row count is read off the
    'duration' column produced by .count(). Returns a frame with columns
    ['id', 'date', 'hour', fl_key].
    """
    group_keys = [id_key, date_key, hour_key]
    counted = df.groupby(group_keys).count().reset_index()
    flux = counted[group_keys + ['duration']]
    renames = {id_key: 'id', date_key: 'date',
               hour_key: 'hour', 'duration': fl_key}
    return flux.rename(columns=renames)
def transform_times(df):
    """Approximate pickup/drop-off times for rebalancing moves.

    A rebalancing move happens somewhere in the gap between a bike's
    drop-off (stop_time) and its next checkout (start_time); this places
    the synthetic move in the middle third of that gap. The start/stop
    station id labels are swapped via rename because the move originates
    where the previous trip ended. Mutates the time columns in place and
    returns the relabeled frame.
    """
    gap = df.start_time - df.stop_time
    pickup = df.stop_time + gap / 3.
    dropoff = df.stop_time + gap * (2 / 3.)
    df['start_time'] = pickup
    df['stop_time'] = dropoff
    return df.rename(columns={'start_id': 'stop_id', 'stop_id': 'start_id'})
def merge_fluxes(df1, df2):
    """Outer-join two flux tables (or any frames keyed by id/date/hour).

    Rows present in only one input get 0 filled in for the other input's
    value columns.
    """
    join_keys = ['id', 'date', 'hour']
    combined = pd.merge(df1, df2, how='outer', on=join_keys)
    return combined.fillna(0)
In [ ]:
# create fluxes from normal trips and merge
# bikes_out = departures per (station, date, hour); bikes_in = arrivals
bikes_out = create_fluxes(df, 'start_id', 'start_date','start_hour', 'bikes_out')
bikes_in = create_fluxes(df, 'stop_id', 'stop_date','stop_hour', 'bikes_in')
merged = merge_fluxes(bikes_out, bikes_in)
In [ ]:
import holidays
us_holidays = holidays.UnitedStates()
# Calendar features for each flux row.
merged['dayofweek'] = merged.date.dt.weekday  # 0 = Monday ... 6 = Sunday
merged['month'] = merged.date.dt.month
merged['year'] = merged.date.dt.year
merged['is_weekday'] = merged.dayofweek.isin(range(5))
# BUG FIX: holidays containers populate lazily on `in`/getitem lookups, so
# Series.isin() against a freshly created (still empty) holidays object
# matched nothing and flagged every row as non-holiday. Membership tests
# via `in` trigger the lazy per-year load. Also reuse the us_holidays
# object created above instead of instantiating a second one.
merged['is_holiday'] = merged.date.apply(lambda d: d in us_holidays)
In [ ]:
merged.head()
In [ ]:
# merge rebal fluxes
# transform_times swaps start/stop station labels and approximates the
# pickup/drop-off times; split_off_times adds the date/hour merge keys.
rebals = transform_times(rebals)
rebals = split_off_times(rebals)
#create fluxes from rebalanced trips
rflux_out = create_fluxes(rebals, 'start_id', 'start_date', 'start_hour', 'rbikes_out')
rflux_in = create_fluxes(rebals, 'stop_id', 'stop_date', 'stop_hour', 'rbikes_in')
rmerged = merge_fluxes(rflux_out, rflux_in)
# Collapse the two directed counts into a single signed net flux
# (positive = bikes delivered to the station by rebalancing).
rmerged['rebal_net_flux'] = rmerged.rbikes_in - rmerged.rbikes_out
rmerged = rmerged.drop(['rbikes_in', 'rbikes_out'], axis=1)
In [ ]:
rmerged.head()
In [ ]:
merged.head()
In [ ]:
# Combine per-hour trip fluxes with the rebalancing net flux into a
# single feature table.
feat = merge_fluxes(merged, rmerged)
feat.head()
In [ ]:
# Station availability data for the same month (workflow.data helper).
avail_db = station_data(year,month)
In [ ]:
avail_db.head()
In [ ]:
feat.head()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# NOTE(review): dead code preserved in a string literal -- an older
# hour-period implementation of create_fluxes/merge_fluxes, superseded by
# the versions defined above. Consider deleting this cell.
'''
def create_fluxes(df, id_key, time_key, fl_key):
# id_key and time_key are start_id/stop_id and start_time/end_time
# for normal trips, start is associated with an fl_key = 'flux_out',
# and stop is associated with fl_key = 'flux_in'
flux = df[[id_key, time_key]].groupby([df[time_key].dt.to_period('H'), id_key]).count()
flux = flux[time_key].fillna(0).unstack(level=0).stack()
flux = flux.reset_index().set_index(time_key)
flux.index.name = 'date'
return flux.rename(columns = {0:fl_key, id_key:'id'}).reset_index()
def merge_fluxes(flux_out, flux_in):
return pd.merge(flux_out, flux_in, how='outer',
on = ['date', 'id']).fillna(0)
'''
In [ ]:
# NOTE(review): dead code preserved in a string literal -- old weekday/hour
# feature construction superseded by the calendar-feature cell above.
# Consider deleting this cell.
'''
# create fluxes from normal trips
flux_out = create_fluxes(trips, 'start_id', 'start_time', 'flux_out')
flux_in = create_fluxes(trips, 'stop_id', 'stop_time', 'flux_in')
merged = merge_fluxes(flux_out, flux_in)
# create weekday column with values 1 if weekday and 0 if weekend
wkd_cond = lambda x: 1 if True else 0
merged['weekday'] = merged.date.dt.weekday.isin(range(5)).apply(wkd_cond)
merged['hour'] = merged.date.dt.hour
'''
In [ ]:
# NOTE(review): `trips` is not defined anywhere in this notebook (the trip
# frame is named `df`), so this cell will raise NameError on a fresh
# Restart-and-Run-All -- TODO confirm the intended variable name.
# Rider age approximated as the analysis year minus birth year.
ages = year - trips.birth_year
trips['ages'] = ages
# Mean of numeric columns per (hour-period, age) group.
trips.groupby([trips.start_time.dt.to_period('H'), ages]).mean()
In [ ]:
In [ ]:
# NOTE(review): transform_times and split_off_times below duplicate the
# definitions earlier in the notebook -- redefinition is harmless but
# redundant; consider deleting these copies and keeping only the
# rebal_df lines.
def transform_times(df):
    # calculate approximate pickup and drop-off times for rebalancing trips
    # (duplicate of the earlier definition: place the synthetic move in the
    # middle third of the drop-off -> next-checkout gap, swap id labels)
    t_start = df.start_time
    t_end = df.stop_time
    time_diff = t_start - t_end
    r_start = t_end + time_diff/3.
    r_end = t_end + time_diff*(2/3.)
    df['start_time'] = r_start
    df['stop_time'] = r_end
    return df.rename(columns = {'start_id':'stop_id', 'stop_id':'start_id'})

def split_off_times(df):
    # duplicate of the earlier definition: add hour and floored-date columns
    df['start_hour'] = df.start_time.dt.hour
    df['stop_hour'] = df.stop_time.dt.hour
    df[['start_date', 'stop_date']] = df[['start_time', 'stop_time']].apply(lambda x: x.dt.floor('d'))
    return df

# NOTE(review): rebal_df is not created anywhere visible in this notebook,
# yet it is consumed by an earlier cell (rebal_times/rebal_spreads) --
# cells are out of execution order; TODO confirm where rebal_df comes from
# and reorder.
rebal_df = transform_times(rebal_df)
rebal_df = split_off_times(rebal_df)