In [ ]:
#Restriction criteria 1
#We would like to work only with stations which opened the 2014-10-13 and had trips until 2016-8-31
# A station counts as active on a day when it is the origin OR the destination
# of at least one trip that day, so both id columns feed each set.
temp1 = trip.loc[trip.date == date(2014,10,13), ['from_station_id','to_station_id']]
set_temp1 = set(temp1.from_station_id.values) | set(temp1.to_station_id.values)
# NOTE(review): the header comment says 2016-8-31 but this filter uses 2015-8-31 -- confirm which one is intended.
temp2 = trip.loc[trip.date == date(2015,8,31), ['from_station_id','to_station_id']]
set_temp2 = set(temp2.from_station_id.values) | set(temp2.to_station_id.values)
# Keep only the stations seen on both reference days.
station_ids = set_temp1 & set_temp2
# PS-04 and ID-04 are excluded (data missing); set difference does not fail
# when one of them is already absent from the intersection.
station_ids -= {'PS-04', 'ID-04'}
# Sorted so that the station order (and therefore the row order of everything
# built station by station afterwards) is the same on every run.
station_ids = sorted(station_ids)
In [ ]:
#Restriction criteria 2
# Study window, both bounds inclusive.
date_start = date(2014, 10, 13)
date_end = date(2016, 3, 18)  # window shortened to keep the computation time reasonable
# One entry per calendar day of the window, in chronological order.
dates = [
    date.fromordinal(ordinal)
    for ordinal in range(date_start.toordinal(), date_end.toordinal() + 1)
]
In [ ]:
#Station by station (station_ids already provided)
# For every station: build the stream of bike movements (+1 for an arrival,
# -1 for a departure), bin it on a regular 15-minute grid and accumulate it
# within each calendar day. The per-station results are stacked into
# `sampled_variation` (index: bin timestamp; columns: 'daily_variation',
# 'station_id').
in_period = (date_start <= trip.date) & (trip.date <= date_end)  # same for every station
station_frames = []  # one frame per station, concatenated once after the loop
for station_id in station_ids:
    ### Instantaneous Total Variation
    touches_station = (trip.from_station_id == station_id) | (trip.to_station_id == station_id)
    # .copy() so that adding a column below does not write into a view of `trip`.
    extract = trip.loc[touches_station & in_period, :].copy()
    # +1 when a bike arrives, -1 when it leaves, 0 for a round trip
    # (same station at both ends). Computed on `extract` only, not on all trips.
    arrives = extract.to_station_id == station_id
    departs = extract.from_station_id == station_id
    extract['incrementation'] = arrives.astype(int) - departs.astype(int)
    extract = extract.set_index('trip_id')

    #Start and Stop: a round trip is one departure at starttime and one arrival at stoptime
    round_trips = extract.loc[extract.incrementation == 0]
    round_trip_out = (
        round_trips[['starttime', 'bikeid', 'to_station_id']]
        .rename(columns={'starttime': 'time', 'to_station_id': 'destination_id'})
        .assign(incrementation=-1)
    )
    round_trip_back = (
        round_trips[['stoptime', 'bikeid', 'to_station_id']]
        .rename(columns={'stoptime': 'time', 'to_station_id': 'destination_id'})
        .assign(incrementation=1)
    )
    #Stop: arrivals are timestamped at stoptime; 'destination_id' holds the other end of the trip
    arrivals = (
        extract.loc[extract.incrementation == 1, ['stoptime', 'bikeid', 'from_station_id', 'incrementation']]
        .rename(columns={'stoptime': 'time', 'from_station_id': 'destination_id'})
    )
    #Start: departures are timestamped at starttime
    departures = (
        extract.loc[extract.incrementation == -1, ['starttime', 'bikeid', 'to_station_id', 'incrementation']]
        .rename(columns={'starttime': 'time', 'to_station_id': 'destination_id'})
    )
    # Single concat instead of row-by-row DataFrame.append (quadratic, and
    # removed from recent pandas). Index is trip_id, columns are
    # time / bikeid / destination_id / incrementation.
    instanteanous_variation = pd.concat([round_trip_out, round_trip_back, arrivals, departures])
    #Sort by time before doing cumulative
    instanteanous_variation = instanteanous_variation.assign(
        time=pd.to_datetime(instanteanous_variation['time'])
    ).sort_values('time')

    ### Resample of instanteanous_variation towards regular time step
    sample_variation_station = (
        instanteanous_variation.set_index('time')['incrementation']
        .groupby(pd.Grouper(freq='15min')).sum()  #every 15 minutes (pd.Grouper replaces pd.TimeGrouper)
        .to_frame('incrementation')
        .fillna(value=0)  #transform NaN to 0
    )
    sample_variation_station['date'] = sample_variation_station.index.date  #faster access to date in the following

    #From incrementation to daily_variation: running total restarted at the beginning of each day
    sample_variation_station['incrementation'] = (
        sample_variation_station.groupby('date')['incrementation'].cumsum().astype(float)
    )
    sample_variation_station.columns = ['daily_variation', 'date']

    #Removal of first and last day to have full periods of 24h (96 by day)
    # Strict bounds also drop bins after date_end, which appear when a trip
    # started on date_end ends after midnight.
    bin_day = sample_variation_station['date']
    interior_day = (bin_day > date_start) & (bin_day < date_end)
    #Adding StationId label and removing the helper 'date' column
    sample_variation_station = (
        sample_variation_station.loc[interior_day, ['daily_variation']]
        .assign(station_id=station_id)
    )
    station_frames.append(sample_variation_station)

#Stack every station into the final dataframe
if station_frames:
    sampled_variation = pd.concat(station_frames)
else:
    # No station selected: keep an empty frame with the expected columns and a
    # datetime index so the cleaning step below still works.
    sampled_variation = pd.DataFrame(columns=['daily_variation', 'station_id'],
                                     index=pd.DatetimeIndex([]))
#Final Cleaning
# Drops every bin of 2016-03-17 as well.
# NOTE(review): presumably that day is incomplete for some stations -- confirm.
sampled_variation = sampled_variation.loc[sampled_variation.index.date != date(2016,3,17),:]
In [ ]:
#Update of dates
# Same start as before; the end of the window moves to 2016-03-17 (inclusive).
date_start = date(2014, 10, 13)
date_end = date(2016, 3, 17)  # window shortened to keep the computation time reasonable
# One entry per calendar day of the window, in chronological order.
dates = [
    date.fromordinal(ordinal)
    for ordinal in range(date_start.toordinal(), date_end.toordinal() + 1)
]