In [113]:
from datetime import datetime,timedelta, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data_helper_functions import *
from IPython.display import display
pd.options.display.max_columns = 999
%matplotlib inline
desired_channel = 'BAND_01'
desired_date = datetime(2014, 4, 8)
desired_timedelta = timedelta(hours = 18)
desired_datetime = desired_date + desired_timedelta
satellite_filefolder = 'data/satellite/colorado/summer6months/data/'
sensor_filefolder = 'data/sensor_data/colorado6months/'
pvoutput_filefolder = 'data/pvoutput/pvoutput6months/'
#satellite data
satellite_filename = find_filename(desired_datetime, desired_channel, satellite_filefolder)
lons, lats, data = return_satellite_data(satellite_filename, satellite_filefolder)
plt.figure(figsize=(8, 8))
imgplot = plt.imshow(data)
imgplot.set_interpolation('none')
plt.show()
#sensor data
sensor_filename = find_file_from_date(desired_date, sensor_filefolder)
df_sensor = return_sensor_data(sensor_filename, sensor_filefolder).ix[:,-15:-1]
df_sensor[df_sensor.index == desired_datetime]
display(df_sensor[df_sensor.index == desired_datetime])
In [3]:
np.ravel(data).shape
Out[3]:
In [7]:
# Iterate over datetimes:
mytime = datetime(2014, 4, 1, 13)
times = make_time(mytime)
# Now that we can load data for any datetime and have a list of datetimes of interest,
# we can construct an X matrix and a Y matrix for regression.
desired_channel = 'BAND_01'
satellite_filefolder = 'data/satellite/colorado/summer6months/data/'
sensor_filefolder = 'data/sensor_data/colorado6months/'
X = []
Y = []
skipped_datetimes = []  # datetimes dropped because a file or sensor row could not be loaded
for desired_datetime in times:
    try:
        # Shift back 6 hours before taking the date so the sensor-file lookup uses the
        # intended day (presumably a UTC-to-local offset -- confirm).
        desired_date = (desired_datetime - timedelta(hours=6)).date()
        desired_date = datetime.combine(desired_date, time.min)  # back to a datetime at midnight
        satellite_filename = find_filename(desired_datetime, desired_channel, satellite_filefolder)
        lons, lats, data = return_satellite_data(satellite_filename, satellite_filefolder)
        sensor_filename = find_file_from_date(desired_date, sensor_filefolder)
        # .iloc replaces the removed .ix indexer; keeps 14 columns, excluding the last one.
        df_sensor = return_sensor_data(sensor_filename, sensor_filefolder).iloc[:, -15:-1]
        # pvoutput_filename = find_file_from_date(desired_date, pvoutput_filefolder)
        # df_pvoutput = return_pvoutput_data(pvoutput_filename, pvoutput_filefolder)
        features = np.ravel(data)
        # .values[0] raises IndexError when no sensor row matches this datetime.
        targets = df_sensor[df_sensor.index == desired_datetime].values[0]
    except Exception:
        # Best-effort: skip this datetime, but remember it instead of silently passing.
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt the loop.
        skipped_datetimes.append(desired_datetime)
        continue
    # Append only after BOTH lookups succeeded so that X and Y rows stay aligned.
    X.append(features)
    Y.append(targets)
print('skipped %d of %d datetimes' % (len(skipped_datetimes), len(times)))
In [43]:
X = np.array(X)
Y = np.array(Y)
In [44]:
print X.shape
print Y.shape
In [123]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale, StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import *
from lasagne import layers
from lasagne.nonlinearities import softmax, rectify, sigmoid, linear, very_leaky_rectify, tanh
from lasagne.updates import nesterov_momentum, adagrad, momentum
from nolearn.lasagne import NeuralNet
In [19]:
y = Y.astype('float32') #big Y to little y
x = X.astype('float32') #big X to little x
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(scaled_x, y, test_size = 0.2, random_state = 12)
In [25]:
x.shape[1]
Out[25]:
In [33]:
scaled_x[3]
Out[33]:
In [32]:
# Single-hidden-layer regression network over the flattened, standardized pixels.
net1 = NeuralNet(
    # topology: input -> one dense hidden layer -> dense output
    layers=[
        ('input', layers.InputLayer),
        ('hidden', layers.DenseLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, x.shape[1]),  # variable batch size, one input per feature
    hidden_num_units=100,
    output_num_units=14,             # one output per sensor target column
    output_nonlinearity=None,        # identity output for regression
    regression=True,
    # optimizer settings
    update=nesterov_momentum,
    update_learning_rate=0.01,
    # NOTE(review): 0.09 is an unusually small momentum; possibly meant 0.9 -- confirm.
    update_momentum=0.09,
    # training schedule / logging
    max_epochs=400,
    verbose=1,
)
# Trained on the full standardized matrix (not on x_train from the earlier split).
net1.fit(scaled_x, y)
Out[32]:
In [55]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(oob_score=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 14)
In [56]:
rfr.fit(X_train,Y_train)
Out[56]:
In [57]:
rfr.score(X_test,Y_test)
Out[57]:
In [58]:
rfr.oob_score_
Out[58]:
In [59]:
Y_pred = rfr.predict(X_test)
In [68]:
val = 100
print Y_pred[val]
print Y_test[val]
In [253]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 14)
In [254]:
lr.fit(X_train,Y_train)
Out[254]:
In [255]:
lr.score(X_test,Y_test)
Out[255]:
In [256]:
Y_pred = rfr.predict(X_test)
In [268]:
from random import randint
val = randint(0,508)
print Y_pred[val]
print Y_test[val]
In [289]:
from sklearn.linear_model import Ridge
ridge = Ridge()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.5, random_state = 14)
In [ ]:
In [290]:
ridge.fit(X_train, Y_train)
Out[290]:
In [291]:
ridge.score(X_test,Y_test)
Out[291]:
In [292]:
Y_pred = ridge.predict(X_test)
In [293]:
val = randint(0,508)
print Y_pred[val]
print Y_test[val]
Surprisingly, this is very fast and works the best due to its speed!
In [1]:
from datetime import datetime,timedelta, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data_helper_functions import *
from IPython.display import display
pd.options.display.max_columns = 999
from __future__ import division
%matplotlib inline
# Iterate over datetimes:
mytime = datetime(2014, 4, 1, 13)
times = make_time(mytime)
satellite_filefolder = 'data/satellite/colorado/summer6months/data/'
sensor_filefolder = 'data/sensor_data/colorado6months/'
# Channels stacked into each feature vector, in this order (note: no BAND_05).
desired_channels = ['BAND_01', 'BAND_02', 'BAND_03', 'BAND_04', 'BAND_06']
X = []
Y = []
skipped_datetimes = []  # datetimes dropped because a file or sensor row could not be loaded
for desired_datetime in times:
    try:
        # Shift back 6 hours before taking the date so the sensor-file lookup uses the
        # intended day (presumably a UTC-to-local offset -- confirm).
        desired_date = (desired_datetime - timedelta(hours=6)).date()
        desired_date = datetime.combine(desired_date, time.min)  # back to a datetime at midnight
        # Load every channel for this datetime and flatten each grid to 1-D.
        band_pixels = []
        for desired_channel in desired_channels:
            satellite_filename = find_filename(desired_datetime, desired_channel, satellite_filefolder)
            lons, lats, data = return_satellite_data(satellite_filename, satellite_filefolder)
            band_pixels.append(np.ravel(data))
        sensor_filename = find_file_from_date(desired_date, sensor_filefolder)
        # .iloc replaces the removed .ix indexer; keeps 14 columns, excluding the last one.
        df_sensor = return_sensor_data(sensor_filename, sensor_filefolder).iloc[:, -15:-1]
        # .values[0] raises IndexError when no sensor row matches this datetime.
        targets = df_sensor[df_sensor.index == desired_datetime].values[0]
        features = np.hstack(band_pixels)
    except Exception:
        # Best-effort: skip this datetime, but remember it instead of silently passing.
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt the loop.
        skipped_datetimes.append(desired_datetime)
        continue
    # Append only after everything succeeded so that X and Y rows stay aligned.
    X.append(features)
    Y.append(targets)
print('skipped %d of %d datetimes' % (len(skipped_datetimes), len(times)))
In [4]:
X = np.array(X)
Y = np.array(Y)
In [5]:
X.shape
Out[5]:
In [6]:
Y.shape
Out[6]:
In [7]:
#np.savetxt('data/X.txt', X)
#np.savetxt('data/Y.txt', Y)
#np.savez_compressed('data/Y_all_channels.npz',Y=Y)
#np.savez_compressed('data/X_all_channels.npz',X=X)
In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
lr = LinearRegression()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.5, random_state = 14)
In [10]:
lr.fit(X_train,Y_train)
Out[10]:
In [11]:
lr.score(X_test,Y_test)
Out[11]:
In [19]:
Y_pred = lr.predict(X_test)
from random import randint
val = randint(0,508)
print Y_pred[val]
print Y_test[val]
In [24]:
from sklearn.linear_model import Ridge
ridge = Ridge(solver = 'svd')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.5, random_state = 14)
In [25]:
ridge.fit(X_train,Y_train)
Out[25]:
In [26]:
ridge.score(X_test,Y_test)
Out[26]:
In [33]:
Y_pred = ridge.predict(X_test)
from random import randint
val = randint(0,508)
print Y_pred[val]
print Y_test[val]
In [34]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(oob_score=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.8, random_state = 14)
In [35]:
rfr.fit(X_train,Y_train)
Out[35]:
In [59]:
rfr.score(X_test,Y_test)
Out[59]:
In [71]:
Y_pred = rfr.predict(X_test)
from random import randint
val = randint(0,508)
print Y_pred[val]
print Y_test[val]
In [32]:
import pandas as pd
# Parse the log of failed lookups: the file is space-delimited with no header row.
df = pd.read_csv('documentation/failed_times.txt',header=None,delimiter=" ")
# Columns 11 and 12 are joined and parsed as "YYYY-MM-DD HH:MM:SS".
# (The former bare `df.loc[:,11:12]` expression was removed: it was not the last
# statement of the cell, so it neither displayed nor changed anything.)
df_failed_times = pd.to_datetime(df[11] +" "+ df[12], format="%Y-%m-%d %H:%M:%S")
In [25]:
from datetime import datetime,timedelta, time
import numpy as np
import matplotlib.pyplot as plt
from data_helper_functions import *
from IPython.display import display
pd.options.display.max_columns = 999
from __future__ import division
%matplotlib inline
#iterate over datetimes:
mytime = datetime(2014, 4, 1, 13)
times = make_time(mytime)
In [91]:
failed_datetimes = [] #make a list from all failed datetimes from df above
for i in xrange(len(df_failed_times)):
failed_datetimes.append(datetime.utcfromtimestamp(
df_failed_times.values.astype(int)[i]*1e-9))
Out[91]:
In [100]:
time_mask = [True]*len(times) #allocate array and assume good until proven bad
for i,time in enumerate(times):
for bad_time in failed_datetimes:
if time==bad_time:
time_mask[i] = False
In [103]:
sum(time_mask) #good, same as input to satellite to sensor
Out[103]:
In [110]:
good_times = np.array(times)[np.array(time_mask)]
good_times
Out[110]:
In [111]:
np.savez_compressed('data/good_times.npz',good_times=good_times) #save
In [ ]: