In [1]:
import zipfile
import requests
import numpy as np
print('numpy '+ np.__version__)
import pandas as pd
print ('pandas '+ pd.__version__)
import seaborn as sns
print('seaborn '+ sns.__version__)
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Station information: download the live station feed as JSON.
station_status_url = 'http://www.citibikenyc.com/stations/json'
# A timeout stops the notebook from hanging forever on an unresponsive server;
# raise_for_status() reports HTTP errors here instead of as a confusing JSON
# decoding failure further down.
resp = requests.get(station_status_url, timeout=30)
resp.raise_for_status()
# Show the top-level keys of the feed.
resp.json().keys()
Out[2]:
In [3]:
# Display the first entry of the feed's 'stationBeanList' to see which fields a station record has.
resp.json()['stationBeanList'][0]
Out[3]:
In [4]:
# Turn the list of station records into a table, one row per station.
station_records = resp.json()['stationBeanList']
station_info = pd.DataFrame(station_records)
# Preview the first two stations.
station_info.head(2)
Out[4]:
In [9]:
def _read_zipped_csv(path):
    """Return the first member of the zip archive at `path`, parsed as a CSV DataFrame."""
    with zipfile.ZipFile(path) as myzip:
        with myzip.open(myzip.namelist()[0]) as myfile:
            return pd.read_csv(myfile)


# Default: work with a single month of trips.
filename = 'data/201501-citibike-tripdata.zip'

# Set runit to 1 to load every month from 2014-02 through 2015-12 instead.
runit = 0
if runit == 1:
    months = [(2014, m) for m in range(2, 13)] + [(2015, m) for m in range(1, 13)]
    filenames = ['data/%d%02d-citibike-tripdata.zip' % (year, month) for year, month in months]
else:
    filenames = [filename]

# Read each file exactly once, in chronological order, and concatenate a single
# time. The earlier version loaded 2015-01 twice in the full run (once up front
# and once from the list) and grew the frame with DataFrame.append inside the
# loop, which re-copies all accumulated rows on every iteration.
frames = []
for filen in filenames:
    if runit == 1:
        print(filen)  # progress indicator for the long multi-file load
    frames.append(_read_zipped_csv(filen))
cb_trip_df = pd.concat(frames, ignore_index=True)
cb_trip_df.head(3)
Out[9]:
In [10]:
# Print the column names, dtypes and non-null counts of the trip table.
cb_trip_df.info()
In [14]:
# Show the last two rows of the trip table.
cb_trip_df.tail(2)
Out[14]:
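I am going to convert the start and stop times to datetimes and add columns with more specific and useful info. These will be: `startyear`, `starthour`, `startminute`, `startday`, `stophour`, `stopminute`, `stopday`, `weekday`, and `userage`.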
In [12]:
# Parse the trip start and stop columns from text into pandas datetime values.
# 'birth year' is deliberately left as a plain number.
for time_col in ('starttime', 'stoptime'):
    cb_trip_df[time_col] = pd.to_datetime(cb_trip_df[time_col], infer_datetime_format=True)
In [13]:
# Break the start/stop timestamps into their calendar components using the
# vectorized .dt accessor instead of calling a Python lambda on every row.
start_parts = cb_trip_df['starttime'].dt
stop_parts = cb_trip_df['stoptime'].dt

cb_trip_df['startyear'] = start_parts.year
cb_trip_df['starthour'] = start_parts.hour
cb_trip_df['startminute'] = start_parts.minute
cb_trip_df['startday'] = start_parts.day
cb_trip_df['stophour'] = stop_parts.hour
cb_trip_df['stopminute'] = stop_parts.minute
cb_trip_df['stopday'] = stop_parts.day
# Day of week of the trip start, Monday=0 ... Sunday=6.
# (This column was previously filled with the year by a copy-paste mistake.)
cb_trip_df['weekday'] = start_parts.weekday
# Rider age in years at the time of the trip.
cb_trip_df['userage'] = cb_trip_df['startyear'] - cb_trip_df['birth year']
cb_trip_df.info()
In [15]:
# Summary statistics for the numeric columns of the trip table.
cb_trip_df.describe()
Out[15]:
In [18]:
# Count the distinct bikes and distinct start stations seen in the trips.
n_bikes = len(cb_trip_df['bikeid'].unique())
n_stations = len(cb_trip_df['start station id'].unique())
print("Number of bikes:  %s" % n_bikes)
print("Number of stations:  %s" % n_stations)
# Bike id of the very first trip in the table.
first_bike_id = cb_trip_df['bikeid'].iloc[0]
print(first_bike_id)
In [19]:
# Number of distinct start stations appearing in the trip table.
print(len(cb_trip_df['start station id'].unique()))
In [21]:
# Number of trips recorded for each bike id, most-used bikes first.
cb_trip_df['bikeid'].value_counts()
Out[21]:
In [59]:
# Pull out every trip made by bike 21099 to look at one bike's history.
is_bike_21099 = cb_trip_df['bikeid'] == 21099
bid_hist = cb_trip_df[is_bike_21099]
bid_hist.head()
Out[59]:
Take one bike and the first datetime in our dataframe. 1. Find the first occurrence of a trip for that bike. 2. Take that trip's 'start station id' (sid); the bike (bid) was parked at sid from the first datetime until the trip's 'starttime'. 3. For each datetime (dt) in that range, record the bike at the station: station_dt_dict[sid][dt].append(bid). 4. Take the trip's 'stoptime' as the new first datetime, go to the next row, and repeat.
In [22]:
import datetime as dt
from collections import defaultdict
import timeit
In [23]:
# Build station_dt_dict[station_id][iso_timestamp] -> list of bike ids that are
# taken to be parked at that station during that ten-minute slot.
station_dt_dict = defaultdict(lambda: defaultdict(list))
tic = timeit.default_timer()

# First instant of the period being reconstructed.
# NOTE(review): the load cell reads the 2015-01 file by default, while this
# start is 2015-05-01 -- confirm that it matches the data actually loaded.
period_start = dt.datetime(2015, 5, 1, 0, 0)

# groupby(sort=False) visits the bikes in order of first appearance (the same
# order as .unique()) but splits the frame once instead of re-filtering the
# whole table with a boolean mask for every bike.
for bid, bid_hist in cb_trip_df.groupby('bikeid', sort=False):
    # The bike is assumed to sit at its first start station from the period start.
    strttime1 = period_start
    # Rows are assumed to be in chronological order within each bike.
    # Columns are read by name; the previous positional `row._4` silently
    # depended on 'start station id' being the fourth column.
    trips = zip(bid_hist['start station id'], bid_hist['starttime'], bid_hist['stoptime'])
    for sid, trip_start, trip_stop in trips:
        # Round the departure time down to its ten-minute slot.
        stoptime = trip_start - dt.timedelta(minutes=trip_start.minute % 10,
                                             seconds=trip_start.second)
        times_at_sid = pd.date_range(start=strttime1, end=stoptime, freq='10min')
        # An empty range means the bike left before the next slot boundary;
        # fall back to the slots starting at strttime1.
        # NOTE(review): date_range includes both endpoints, so this fallback
        # records two slots (strttime1 and strttime1 + 10min) -- confirm that
        # is intended rather than a single slot.
        if len(times_at_sid) == 0:
            times_at_sid = pd.date_range(start=strttime1,
                                         end=strttime1 + dt.timedelta(minutes=10),
                                         freq='10min')
        # Record the bike at this station for every slot in the range.
        for t in times_at_sid:
            station_dt_dict[sid][t.isoformat()].append(bid)
        # The bike becomes available at the next station from the first slot
        # boundary after it was docked.
        strttime1 = (trip_stop
                     + dt.timedelta(minutes=10 - trip_stop.minute % 10)
                     - dt.timedelta(seconds=trip_stop.second))

toc = timeit.default_timer()
print(toc - tic)
In [119]:
# Debug run: rebuild the occupancy map from scratch for one bike (15003) so the
# result can be checked by hand against that bike's trip history.
station_dt_dict = defaultdict(lambda: defaultdict(list))
tic = timeit.default_timer()
ten_minutes = dt.timedelta(minutes=10)
for bid in [15003]:
    bid_hist = cb_trip_df[cb_trip_df['bikeid'] == bid]
    # Start of the reconstructed period.
    strttime1 = dt.datetime(2015, 5, 1, 0, 0)
    for row in bid_hist.itertuples():
        # itertuples() exposes 'start station id' positionally as `_4`.
        sid = row._4
        departed = row.starttime
        # Departure time rounded down to its ten-minute slot.
        stoptime = departed - dt.timedelta(minutes=departed.minute % 10,
                                           seconds=departed.second)
        times_at_sid = pd.date_range(start=strttime1, end=stoptime, freq='10min')
        # Empty range: fall back to the slots starting at strttime1.
        if len(times_at_sid) == 0:
            times_at_sid = pd.date_range(start=strttime1,
                                         end=strttime1 + ten_minutes,
                                         freq='10min')
        # Record the bike at this station for each slot.
        for t in times_at_sid:
            station_dt_dict[sid][t.isoformat()].append(bid)
        # Next availability: first slot boundary after the bike was docked.
        docked = row.stoptime
        strttime1 = (docked
                     + dt.timedelta(minutes=10 - docked.minute % 10)
                     - dt.timedelta(seconds=docked.second))
toc = timeit.default_timer()
print(toc - tic)
In [120]:
# Allow up to 25 columns in rendered tables so the widened trip frame is fully visible.
pd.set_option('display.max_columns', 25)
# Trips of the inspected bike that start at station 507.
starts_at_507 = bid_hist['start station id'] == 507
bid_hist[starts_at_507]
Out[120]:
In [121]:
# Trips of the inspected bike that end at station 507.
ends_at_507 = bid_hist['end station id'] == 507
bid_hist[ends_at_507]
Out[121]:
In [122]:
# List every recorded slot for station 507 in timestamp order, with the bikes in it.
slots_507 = station_dt_dict[507]
for key in sorted(slots_507):
    print("%s: %s" % (key, slots_507[key]))
In [91]:
# How many stations have at least one recorded slot, and which ones they are.
station_ids = list(station_dt_dict.keys())
print(len(station_ids))
print(station_ids)
In [126]:
# Print the bikes recorded at station 507 for every ten-minute slot of 2015-05-01
# (both midnights included).
times_at_sid = pd.date_range(start=dt.datetime(2015, 5, 1, 0, 0),
                             end=dt.datetime(2015, 5, 2, 0, 0),
                             freq='10min')
# Read with .get() rather than indexing: indexing the nested defaultdicts would
# insert an empty list for every slot that has no bikes, silently modifying
# station_dt_dict just by looking at it.
slots_507 = station_dt_dict.get(507, {})
for t in times_at_sid:
    key = t.isoformat()
    print("%s: %s" % (key, slots_507.get(key, [])))
In [145]:
# Reduce each station/slot entry from a list of bike ids to a bike count.
station_dt_count = {}
for k1, slots in station_dt_dict.items():
    station_dt_count[k1] = {k2: len(bikes) for k2, bikes in slots.items()}

# Wide table: one column per station id, one row per ISO timestamp.
# A station/slot pair with no recorded bikes is left as NaN.
test_df = pd.DataFrame.from_dict(station_dt_count)
# info() prints its report itself and returns None, so it is not wrapped in print.
test_df.info()

# Reshape to long form with one row per (datetime, stationid) pair.
# The timestamp index is first turned into a regular column so melt can keep it
# as the identifier; the earlier id_vars=[test_df.columns] was not a valid list
# of column names and discarded the timestamps.
id_vars = ['datetime']
var_name = 'stationid'
value_name = 'num_bikes'
test_long = test_df.reset_index().rename(columns={'index': 'datetime'})
test_it_df = pd.melt(test_long, id_vars=id_vars, var_name=var_name, value_name=value_name)
test_it_df.head()
In [58]:
# Scratch check: the ten-minute slots between the period start and the first
# trip of the bike currently held in bid_hist.
# The original cell began with a stray fragment (a syntax error) and ended by
# writing to station_dt_dict through an undefined name. Both are removed, so
# this cell now only inspects values and leaves station_dt_dict untouched.
period_start = dt.datetime(2015, 5, 1, 0, 0)
first_start = bid_hist.iloc[0]['starttime']
print(first_start)
print(first_start - period_start)
times_at_st = pd.date_range(start=period_start, end=first_start, freq='10min')
times_at_st
In [13]:
# Map each start station to the bikes whose first recorded trip begins there.
all_bike_ids = cb_trip_df['bikeid'].unique()

# drop_duplicates keeps the first row per bike, in order of first appearance,
# in one pass; the earlier loop re-filtered the whole table for every bike.
first_trips = cb_trip_df.drop_duplicates(subset='bikeid')

station_dict = {}
for station_id, bi in zip(first_trips['start station id'], first_trips['bikeid']):
    station_dict.setdefault(station_id, []).append(bi)
In [16]:
# Bikes whose first recorded trip starts at station 518.
print(station_dict[518])
In [10]:
# NOTE(review): `cb_dat` is not defined anywhere in the visible notebook; it is
# presumably an earlier name for the trip table -- confirm or remove this cell.
cb_dat.shape
In [11]:
# NOTE(review): `cb_dat` is not defined anywhere in the visible notebook; it is
# presumably an earlier name for the trip table -- confirm or remove this cell.
cb_dat.describe()
Out[11]:
In [17]:
# Create a square 9x9-inch figure with a single axes.
fig, ax = plt.subplots(figsize=(9, 9))
# TODO: the cell previously ended in a dangling `ax.` (a syntax error);
# add the intended plotting call on `ax` here.
In [ ]: