In [ ]:
# Numerizing Events: turn the textual weather labels into integer codes.
# Missing (NaN) entries become 0 first; labels that appear in none of the
# lists below are left exactly as they are.
data_to_regress['Events'] = data_to_regress['Events'].fillna(0)

to_one = ['Fog']
to_two = [
    'Rain',
    'Fog , Rain',
    'Fog-Rain',
    'Rain-Thunderstorm',
    'Rain , Thunderstorm',
]
to_three = ['Snow', 'Rain-Snow', 'Rain , Snow']

# One pass per integer code: every row whose label belongs to the group is
# overwritten with that group's code.
for event_code, event_labels in ((1, to_one), (2, to_two), (3, to_three)):
    rows_in_group = data_to_regress['Events'].isin(event_labels)
    data_to_regress.loc[rows_in_group, ['Events']] = event_code

# Numerizing Stations: replace each station identifier by its position in
# station_ids, and keep both directions of the mapping for later lookups.
station_to_int = {}  # station identifier -> integer code
int_to_station = {}  # integer code -> station identifier

# Masks are computed against a snapshot of the original identifiers. Comparing
# against the column while it is being overwritten would let an already
# written code k be mistaken for a later station whose identifier equals k
# (possible whenever the identifiers are themselves integers), silently
# recoding those rows a second time.
original_station_ids = data_to_regress['station_id'].copy()
for k, st in enumerate(station_ids):
    station_to_int[st] = k
    int_to_station[k] = st
    data_to_regress.loc[(original_station_ids == st), ['station_id']] = k
    
# More cleaning: drop every row that still has a missing value in any column
data_to_regress = data_to_regress.dropna(axis=0)


# Numerizing the entire dataframe
# DataFrame.as_matrix was deprecated in pandas 0.23 and removed in 1.0;
# selecting the columns and reading .values yields the same 2-D array on
# both old and current pandas versions.
# NOTE(review): 'Precipitation_In ' carries a trailing space -- presumably it
# matches the raw column header; confirm before normalizing it.
feature_columns = [
    'Events',
    'Mean_Temperature_F',
    'Precipitation_In ',
    'month',
    'weekday',
    'hour',
    'station_id',
]
Xtot = data_to_regress[feature_columns].values
# Double brackets keep ytot two-dimensional (n_rows, 1), as as_matrix did.
ytot = data_to_regress[['daily_variation']].values

# Saving data since it takes too much time to compute
# (np.save appends the '.npy' extension to these names)
np.save('Xtot', Xtot)
np.save('ytot', ytot)

print('Database shape used for regression: {}'.format(Xtot.shape))