In [ ]:
# Numerizing Events
# Encode the weather `Events` column as integer codes: missing values become 0
# and every label listed in a group below is replaced by that group's code.
# Labels that appear in none of the groups are left untouched.
data_to_regress['Events'] = data_to_regress['Events'].fillna(0)
to_one = ['Fog']
to_two = ['Rain','Fog , Rain','Fog-Rain', 'Rain-Thunderstorm','Rain , Thunderstorm']
to_three = ['Snow','Rain-Snow','Rain , Snow']
# Group position (starting at 1) is the code written for its labels.
for event_code, event_labels in enumerate((to_one, to_two, to_three), start=1):
    in_group = data_to_regress['Events'].isin(event_labels)
    data_to_regress.loc[in_group, ['Events']] = event_code
# Numerizing Stations
# Build both lookup tables between a station id and its position in
# `station_ids`, then recode the `station_id` column with those positions.
station_to_int = {st: k for k, st in enumerate(station_ids)}  # station id -> int
int_to_station = dict(enumerate(station_ids))  # int -> station id
# Recode in one pass over the column. Replacing one station at a time in place
# can remap a row twice when a station id equals an index that was already
# written (e.g. station_ids == [3, 0]), and it rescans the column per station.
# Ids missing from `station_ids` are kept unchanged.
data_to_regress['station_id'] = data_to_regress['station_id'].map(
    lambda station: station_to_int.get(station, station)
)
# More cleaning: drop every row that still contains a missing value.
data_to_regress = data_to_regress.dropna(axis=0)
# Numerizing the entire dataframe.
# `DataFrame.as_matrix` was removed in pandas 1.0; selecting the columns and
# reading `.values` yields the same arrays on both old and new versions.
# NOTE: the trailing space in 'Precipitation_In ' is part of the column name.
feature_columns = ['Events','Mean_Temperature_F','Precipitation_In ','month','weekday','hour','station_id']
Xtot = data_to_regress[feature_columns].values
# Double brackets keep ytot two-dimensional (n_rows, 1), as as_matrix did.
ytot = data_to_regress[['daily_variation']].values
# Saving data since it takes too much time to compute
np.save('Xtot',Xtot)
np.save('ytot',ytot)
print('Database shape used for regression: {}'.format(Xtot.shape))