In [ ]:
# Build the table to regress on: attach the selected daily weather columns to
# every row of `sampled_variation`, then add calendar features.
# Names read from earlier cells: `weather`, `dates`, `date_start`, `date_end`,
# `sampled_variation`, `station_ids` (plus `pd` and `display`).
columns_weather = ['Events','Mean_Temperature_F','Precipitation_In '] #weather data we will use for the regression

# Boolean vector selecting the weather rows whose calendar day is in `dates`.
# The day array is materialised once and tested with a single membership
# lookup instead of OR-ing one full-length comparison per date, so the mask is
# always an array (even when `dates` is empty).
# NOTE(review): assumes `dates` holds datetime.date objects, like the values
# produced by `weather.index.date` -- confirm against the cell that builds it.
weather_days = weather.index.date
boolean_selection = pd.Index(weather_days).isin(list(dates))

extract_weather = weather.loc[boolean_selection, columns_weather]

# Drop the two boundary days, then repeat every remaining day once per sample
# slot; sorting groups the copies of each day together.
# `samples_per_day` must equal the number of rows one station has per day,
# otherwise the index assignment below fails on a length mismatch
# (96 rows/day corresponds to 15-minute slots if the slots are evenly spaced).
samples_per_day = 96
extract_days = extract_weather.index.date
interior_weather = extract_weather.loc[(extract_days != date_start) & (extract_days != date_end), :]
repeat_weather = pd.concat([interior_weather] * samples_per_day).sort_index(axis=0)

# Re-label the repeated weather rows with the first station's timestamps
# (same index to ease concatenation), then tile the frame once per station.
# NOTE(review): this is a positional relabelling -- it presumes the first
# station's rows are in chronological order and that every station shares the
# same timestamps; verify upstream.
first_station_index = sampled_variation.loc[sampled_variation.station_id == station_ids[0], :].index
repeat_weather.index = first_station_index
repeat_weather = pd.concat([repeat_weather] * len(station_ids))

# Original data to regress on (still needs to be converted to numeric values)
data_to_regress = pd.concat([repeat_weather, sampled_variation], axis=1)

# Adding a few useful features derived from the timestamp index
data_to_regress['month'] = data_to_regress.index.month
data_to_regress['weekday'] = data_to_regress.index.dayofweek
# Hour of day as a fraction, e.g. 13:30 -> 13.5
data_to_regress['hour'] = data_to_regress.index.hour + data_to_regress.index.minute/60

display(data_to_regress.head())

In [ ]:
# Visualization (disabled, not used here): uncomment to plot one station's mean daily_variation per month and per hour
# station_to_visualize = station_ids[1]
# view_per_station = data_to_regress.loc[(data_to_regress.station_id==station_to_visualize),:]

# plt.figure()
# plt.title('Average Daily Variation per Month @ ' + station_to_visualize, fontsize=20)
# view_per_station.daily_variation.groupby(view_per_station.index.month).mean().plot(kind='bar', figsize=(30,10), fontsize=20);
# plt.figure()
# plt.title('Average Daily Variation per Hour (mean) @ ' + station_to_visualize, fontsize=20)
# view_per_station.daily_variation.groupby(view_per_station.index.hour).mean().plot(kind='bar', figsize=(30,10), fontsize=20);