In [ ]:
# Report how many docks the studied station has.
print('The Station {}'.format(station_id)
      + ' has {} docks in total.'.format(station.loc[station_id, 'install_dockcount']))

#Station by station
# Keep every trip that starts and/or ends at the studied station.
# An explicit copy is taken because a column is added to `extract` afterwards;
# writing into a bare .loc slice of `trip` triggers SettingWithCopyWarning.
touches_station = (trip.from_station_id == station_id) | (trip.to_station_id == station_id)
extract = trip.loc[touches_station, :].copy()
def incrementation(row, station=None):
    """Return the change in bike count that one trip causes at a station.

    Parameters
    ----------
    row : mapping
        Any object indexable by 'from_station_id' and 'to_station_id'
        (for example a DataFrame row handed over by ``apply(..., axis=1)``).
    station : optional
        Station to evaluate. When omitted, the notebook-level ``station_id``
        is used, so ``frame.apply(incrementation, axis=1)`` keeps working.

    Returns
    -------
    int or None
        0 when the trip both starts and ends at the station,
        -1 when it only starts there, 1 when it only ends there,
        and None when the trip does not involve the station.
    """
    if station is None:
        station = station_id
    leaves = row['from_station_id'] == station
    arrives = row['to_station_id'] == station
    if leaves and arrives:
        return 0
    if leaves:
        return -1
    if arrives:
        return 1
    # Trip unrelated to the station: made explicit instead of falling through.
    return None
# Net effect of each trip on the station: +1 for an arrival, -1 for a departure,
# 0 when the trip starts and ends at the station (the two effects cancel out).
# Computed column-wise on `extract` only, rather than with a row-by-row Python
# call over the whole `trip` table whose other rows were discarded anyway.
arrives_here = (extract['to_station_id'] == station_id).astype(int)
leaves_here = (extract['from_station_id'] == station_id).astype(int)
# assign() returns a new frame, so nothing is written into a slice of `trip`.
extract = extract.assign(incrementation=arrives_here - leaves_here).set_index('trip_id')

#Start and Stop
# Trips that start AND end at the station (incrementation == 0) produce two
# events: a bike leaving at starttime (-1) and a bike returning at stoptime (+1).
temp1 = extract.loc[(extract.incrementation==0),['starttime','stoptime','bikeid','to_station_id','incrementation']]

# Collect the events as plain records and build the frame once; appending to a
# DataFrame inside the loop copies the whole frame at every iteration and
# DataFrame.append no longer exists in recent pandas releases.
round_trip_records = []
for trip_key, start, stop, bike, destination in zip(temp1.index,
                                                    temp1.starttime.values,
                                                    temp1.stoptime.values,
                                                    temp1.bikeid.values,
                                                    temp1.to_station_id.values):
    #-1
    round_trip_records.append(dict(trip_id=trip_key, time=start, bikeid=bike,
                                   destination_id=destination, incrementation=-1))
    #+1
    round_trip_records.append(dict(trip_id=trip_key, time=stop, bikeid=bike,
                                   destination_id=destination, incrementation=1))

instanteanous_variation = pd.DataFrame(round_trip_records,
                                       columns=['trip_id','time', 'bikeid', 'destination_id', 'incrementation'])
instanteanous_variation = instanteanous_variation.set_index('trip_id')
# Keeps an integer index even when there is no round trip (empty frame).
instanteanous_variation.index = instanteanous_variation.index.astype(int)

#Stop
# Arrivals (+1) happen at stoptime; `destination_id` then holds the station at
# the other end of the trip, i.e. from_station_id.
temp2 = extract.loc[(extract.incrementation==1.0),['stoptime','bikeid','from_station_id','incrementation']]
temp2.columns=['time','bikeid','destination_id','incrementation']

#Start
# Departures (-1) happen at starttime and head to to_station_id.
temp3 = extract.loc[(extract.incrementation==-1.0),['starttime','bikeid','to_station_id','incrementation']]
temp3.columns=['time','bikeid','destination_id','incrementation']

# One concatenation, in the same order as before (existing rows, arrivals,
# departures); pd.concat replaces DataFrame.append, removed in pandas 2.0.
instanteanous_variation = pd.concat([instanteanous_variation, temp2, temp3])

# Events must be in chronological order before any running total is taken.
instanteanous_variation['time'] = pd.to_datetime(instanteanous_variation['time'])
instanteanous_variation = instanteanous_variation.sort_values(by='time')

# Running sum of the +1/-1 events over the whole observation period.
instanteanous_variation['total_variation'] = instanteanous_variation.incrementation.cumsum()

In [ ]:
# Observation window (inclusive) and the list of every calendar day in it.
date_start = date(2014,10,13)
date_end = date(2016,8,31)
dates = [date_start + timedelta(days=x) for x in range((date_end-date_start).days + 1)]

# Running total of the events restarted at every midnight.
# A grouped cumsum replaces the per-day loop, which rescanned the whole frame
# for each date and failed with a length mismatch as soon as one event fell
# outside [date_start, date_end]. The rows are expected to be sorted by time
# already, so the cumulative sum inside each day is chronological.
event_day = instanteanous_variation['time'].dt.normalize().values
instanteanous_variation['daily_variation'] = (
    pd.to_numeric(instanteanous_variation['incrementation'])
    .groupby(event_day)
    .cumsum()
    .astype(float)   # same dtype as the former np.append-based result
    .values          # positional assignment: trip_id labels are not unique
)

In [ ]:
###### Resample of instanteanous_variation towards regular time step #####

# Regular 15-minute grid covering every whole day from date_start to date_end
# (96 slots per day). The end bound is the midnight after date_end, so the
# last generated stamp is dropped.
full_grid = pd.date_range(start=pd.Timestamp(date_start),
                          end=pd.Timestamp(date_end) + pd.Timedelta(days=1),
                          freq='15min', name='time')[:-1]

# Net number of bikes gained/lost in each 15-minute slot.
# resample() replaces pd.TimeGrouper, which no longer exists in pandas.
# Reindexing on the full grid guarantees 96 slots for every day, even when the
# first/last event of the station is not on date_start/date_end; slots without
# any event get 0. Events outside the window are left out.
net_flow = pd.to_numeric(instanteanous_variation.set_index('time')['incrementation'])
sampled_variation = (
    net_flow.resample('15min').sum()
    .reindex(full_grid)
    .fillna(value=0) #transform NaN to 0
    .astype(float)
    .to_frame(name='incrementation') #to dataframe
)
sampled_variation['date'] = sampled_variation.index.date #faster access to date in the following

#From incrementation to daily_variation
# Cumulative sum restarted at each midnight (grouped, instead of one pass per day).
sampled_variation['incrementation'] = (
    sampled_variation['incrementation']
    .groupby(sampled_variation.index.normalize())
    .cumsum()
)
sampled_variation.columns = ['daily_variation','date']

#Removal of first and last day to have full periods of 24h (96 by day)
sampled_variation=sampled_variation.loc[(sampled_variation.date!=date_start)&(sampled_variation.date!=date_end),:]

In [ ]:
###### Concatenate dataframes to regress on later ####
columns_weather = ['Events','Mean_Temperature_F','Precipitation_In '] #weather data we will use for the regression

# One weather row per day, first and last day excluded like in sampled_variation.
weather_days = weather.loc[(weather.index.date!=date_start)&(weather.index.date!=date_end),columns_weather].sort_index(axis=0)
# Each daily row is repeated 96 times (24 h x four 15-minute slots) in date
# order, instead of concatenating 96 copies of the table and sorting them.
repeat_weather = weather_days.iloc[np.repeat(np.arange(len(weather_days)), 96)]
# Positional alignment: raises if the two tables do not have the same length.
repeat_weather.index=sampled_variation.index #same index to ease concatenation

# original data to regress on (need then to be numerized); the helper 'date'
# column of sampled_variation is not carried over.
data_to_regress = pd.concat([repeat_weather,sampled_variation[['daily_variation']]],axis=1)

###### Adding a few useful features ######
# Columns are created under their final names rather than renamed by position.
data_to_regress['month'] = data_to_regress.index.month
data_to_regress['weekday'] = data_to_regress.index.dayofweek
data_to_regress['hour'] = data_to_regress.index.hour + data_to_regress.index.minute/60

###### Numerizing Events #####
# 0 = no event, 1 = fog, 2 = rain (with or without fog/thunderstorm), 3 = snow.
event_codes = {
    'Fog': 1,
    'Rain': 2, 'Fog , Rain': 2, 'Fog-Rain': 2, 'Rain-Thunderstorm': 2, 'Rain , Thunderstorm': 2,
    'Snow': 3, 'Rain-Snow': 3, 'Rain , Snow': 3,
}
# Labels missing from the mapping are kept unchanged, as before.
data_to_regress['Events'] = (
    data_to_regress['Events']
    .fillna(value=0)
    .map(lambda label: event_codes.get(label, label))
)

###### More cleaning ^^ #####
# Drop every row that still has a missing value in any column.
data_to_regress = data_to_regress.dropna(axis=0)
#print(data_to_regress.isnull().any())

In [ ]:
# Preview the first rows of the regression table and report its dimensions.
display(data_to_regress.head(n=5))
regression_shape = data_to_regress.shape
print('DataFrame shape used for regression: {}'.format(regression_shape))