In [ ]:
#Restriction criteria 1

#We only keep stations that were already in use (as origin OR destination of a trip)
#on the opening day 2014-10-13 and that were still in use on a later reference day.
#NOTE(review): the previous comment gave the reference day as 2016-8-31 whereas the
#code uses 2015-8-31 -- the code's date is kept here, confirm which one is intended.
temp1 = trip.loc[(trip.date == date(2014,10,13)),['from_station_id','to_station_id']]
#Union of origins and destinations (the destination column used to be ignored)
set_temp1 = set(temp1.from_station_id.values) | set(temp1.to_station_id.values)

temp2 = trip.loc[(trip.date == date(2015,8,31)),['from_station_id','to_station_id']]
set_temp2 = set(temp2.from_station_id.values) | set(temp2.to_station_id.values)

#Stations seen on both days
station_ids = set_temp1 & set_temp2

#Stations excluded by hand (data missing). discard() is used so that a station which
#is already absent from the intersection does not raise a KeyError.
for excluded_station_id in ('PS-04', 'ID-04'):
    station_ids.discard(excluded_station_id)

#Sorted so that the station order (and everything built from it) is the same on every run
station_ids = sorted(station_ids, key=str)

In [ ]:
#Restriction criteria 2

#Study window; the end is brought forward to reduce the amount of data and so the computation time
date_start = date(2014,10,13)
date_end = date(2016,3,18)

#Every calendar day of the window, both bounds included
dates = [date.fromordinal(day_number)
         for day_number in range(date_start.toordinal(), date_end.toordinal() + 1)]

In [ ]:
#Station by station (station_ids already provided)
#
#For every station we build the signed stream of events (-1 each time a bike leaves,
#+1 each time a bike arrives), sum it on a regular 15-minute grid and accumulate it
#within each calendar day. The per-station results are stacked in `sampled_variation`
#(index: 15-minute timestamps, columns: daily_variation, station_id).
#Reads `trip`, `station_ids`, `date_start` and `date_end` from the previous cells.

#Trips of the study window: this filter does not depend on the station, so it is done once
trips_in_window = trip.loc[(trip.date<=date_end)&(date_start<=trip.date),:]

station_frames = [] #one dataframe per station, concatenated once after the loop

for station_id in station_ids:
    ### Instanteanous Total Variation
    involved = (trips_in_window.from_station_id==station_id) | (trips_in_window.to_station_id==station_id)
    extract = trips_in_window.loc[involved,:].set_index('trip_id')
    if extract.empty:
        continue #no trip for this station in the window, nothing to resample

    leaves = extract.from_station_id==station_id
    arrives = extract.to_station_id==station_id
    #-1 for a departure, +1 for an arrival, 0 for a round trip (the bike leaves and comes back)
    extract = extract.assign(incrementation=arrives.astype(int) - leaves.astype(int))

    #Start: every trip leaving the station removes one bike at starttime
    starts = (extract.loc[leaves,['starttime','bikeid','to_station_id']]
              .rename(columns={'starttime':'time','to_station_id':'destination_id'})
              .assign(incrementation=-1))

    #Stop: every trip reaching the station adds one bike at stoptime.
    #A round trip matches both masks, so it yields one -1 event and one +1 event.
    stops = (extract.loc[arrives,['stoptime','bikeid','from_station_id']]
             .rename(columns={'stoptime':'time','from_station_id':'destination_id'})
             .assign(incrementation=1))

    #All events of the station, indexed by trip_id
    instanteanous_variation = pd.concat([starts, stops])

    #Sort by time before doing cumulative
    instanteanous_variation['time'] = pd.to_datetime(instanteanous_variation['time'])
    instanteanous_variation = instanteanous_variation.sort_values('time')

    ### Resample of instanteanous_variation towards regular time step

    #Net number of bikes gained in every 15-minute slot (slots without event count 0)
    per_slot = instanteanous_variation.set_index('time')['incrementation'].resample('15min').sum()
    sample_variation_station = per_slot.to_frame('incrementation').fillna(value=0)
    sample_variation_station['date'] = sample_variation_station.index.date #faster access to date in the following

    #From incrementation to daily_variation: cumulative sum restarted at the beginning of each day
    sample_variation_station['daily_variation'] = (
        sample_variation_station.groupby('date')['incrementation'].cumsum().astype(float))

    #Removal of first and last day to have full periods of 24h (96 by day).
    #The strict bounds also drop slots created by trips that end after date_end.
    full_days = (sample_variation_station['date']>date_start)&(sample_variation_station['date']<date_end)

    #Keeping the useful column only and adding the StationId label
    sample_variation_station = (sample_variation_station.loc[full_days,['daily_variation']]
                                .assign(station_id=station_id))

    station_frames.append(sample_variation_station)

if station_frames:
    #Single concatenation instead of growing the final dataframe station by station
    sampled_variation = pd.concat(station_frames)

    #Final Cleaning: 2016-03-17 is dropped for every station
    #NOTE(review): presumably because that day can be incomplete for stations whose
    #last trip happens before date_end -- confirm
    sampled_variation = sampled_variation.loc[sampled_variation.index.date != date(2016,3,17),:]
else:
    #No station produced any data: keep an empty dataframe with the expected columns
    sampled_variation = pd.DataFrame(columns=['daily_variation','station_id'])

In [ ]:
#Update of dates

#The end of the window is moved back by one day with respect to the value used during the
#computation (reduced number of data in order to improve computation time)
date_start = date(2014,10,13)
date_end = date(2016,3,17)

#Every calendar day of the updated window, both bounds included
dates = [date.fromordinal(day_number)
         for day_number in range(date_start.toordinal(), date_end.toordinal() + 1)]