In [2]:
%pwd
%pylab inline
In [3]:
data_dir = "./Data/Weather/"
%matplotlib inline
!mkdir -p $data_dir
!ls $data_dir
In [3]:
!curl -o $data_dir/SampleStations.pickle http://mas-dse-open.s3.amazonaws.com/Weather/SampleStations.pickle
From each measurement, we take 2 sample stations with low residual error and 2 sample stations with high residual error and plot the following:
Read data from the pickle files SampleStations.pickle
and STAT.pickle
SampleStations.pickle
contains information about the 6 measurements taken from some sample stations. You have been provided the code to process this file. It converts the seemingly complicated structure into a list of lists where each sublist has the following information:['station','measurement','year','1', '2', '3', ...... , '365']
STAT.pickle
contains statistics about the weather data for each of the 6 measurements and its description.
In [183]:
import pickle
Data=pickle.load(open('./Data/Weather/SampleStations.pickle','r'))
STAT,STAT_description=pickle.load(open('./Data/Weather/STAT.pickle','r'))
In [184]:
FlatData=[]
for station in Data:
stationname=station[0]
for measurements in station[1]:
measurement,year=measurements[0]
yeardata=list(measurements[1])
rowData=[stationname]+[measurement]+[year]+yeardata
FlatData.append(rowData)
In [185]:
import pandas as pd
frameheader=['station','measurement','year']+range(1,366)
df=pd.DataFrame(FlatData,columns=frameheader)
m_df={}
for m in ['TMIN', 'TOBS', 'TMAX', 'SNOW', 'SNWD', 'PRCP']:
t_df=df[df['measurement']==m]
m_df[m]=t_df
In [186]:
station_selector = m_df['TMAX']['station']=="USC00457015"
year_selector=m_df['TMAX']['year']==1996 #this year has no nans
station_sample = m_df['TMAX'][year_selector & station_selector]
T = station_sample.ix[:,3:].values[0]
You need to plot reconstructions for two stations with low reconstruction error and two stations with high reconstruction error. To do this, you will need to do the following:
In [ ]:
In [186]:
evecs = STAT['TMAX']['eigvec'] #(365, 365)
evals = STAT['TMAX']['eigval'] #(365, 0)
U=evecs
X = np.dot(np.dot(U, U.T), T)
U = STAT['TMAX']['eigvec']
np.array(U[:,:3])
Out[186]:
In [220]:
# I have to do this 3 times: once for 1 then 2 then 3 evecs
m="TMIN"
stations = m_df[m]
T = samples = np.array(stations.ix[:,3:])
evecs = STAT[m]['eigvec']
Reconstructed={}
Reconstructed_errors={}
for i in range(1,4): #1,2,3
U = evecs[:,0:i]
UUT = np.dot(U, U.T) #(365,365)
X = np.dot(UUT, T.T) #(365, 5830)
Reconstructed[i] = X
error = (T - X.T)**2 #(5830, 365)
mse = np.mean(np.nan_to_num(error), axis=1) #(5830,)
mse[mse==0]=numpy.nan #convert 0s back to nan
Reconstructed_errors[i] = mse
#now find the 2 smallest and largest errors
errors = Reconstructed_errors[3]
mask = np.isnan(errors)
errors[mask] = np.interp(np.flatnonzero(mask), np.flatnonzero(~mask), errors[~mask])
sorted_indicies = errors.argsort()
low_i = sorted_indicies[0:2]
high_i = sorted_indicies[-2:]
# 2772 2782
# 2408 3928
print low_i
print high_i
In [143]:
errors
Out[143]:
In [223]:
def create_reconstructions(m):
## Put your code for computating reconstructions here
stations = m_df[m]
T = samples = np.array(stations.ix[:,3:])
norm_data = T - STAT[m]['Mean']
evecs = STAT[m]['eigvec']
Reconstructed={}
Reconstructed_errors={}
original = T
for i in range(1,4): #1,2,3
U = evecs[:,0:i]
UUT = np.dot(U, U.T) #(365,365)
X = np.dot(UUT, norm_data.T)
X = (X.T + STAT[m]['Mean']).T #add back mean here #(365, 5830)
Reconstructed[i] = X
error = (norm_data - X.T)**2 #(5830, 365)
mse = np.mean(np.nan_to_num(error), axis=1) #(5830,)
mse[mse==0]=numpy.nan #convert 0s back to nan
Reconstructed_errors[i] = mse
#now find the 2 smallest and largest errors
errors = Reconstructed_errors[3]
mask = np.isnan(errors)
errors[mask] = np.interp(np.flatnonzero(mask), np.flatnonzero(~mask), errors[~mask])
sorted_indicies = errors.argsort()
low_i = sorted_indicies[0:2]
high_i = sorted_indicies[-2:]
lower = low_i
upper = high_i
yeardays=[i for i in (1,366)]
plt.figure(figsize=(20,30),dpi=300)
j=1
c=0
for l in lower:
subplot(4,2,j)
j+=1
c+=1
plot(original[l])
plot(STAT[m]['Mean'])
plot(Reconstructed[1][:,l])
plot(Reconstructed[2][:,l])
plot(Reconstructed[3][:,l])
title('#' + str(c) + ' Sample for ' + m + ' (low residual error)')
xlim([0,365])
legend(['original','Mean','1','2','3'],loc=2)
c=0
for l in upper:
subplot(4,2,j)
j+=1
c+=1
plot(original[l])
plot(STAT[m]['Mean'])
plot(Reconstructed[1][:,l])
plot(Reconstructed[2][:,l])
plot(Reconstructed[3][:,l])
title('#' + str(c) + ' Sample for ' + m + ' (high residual error)')
xlim([0,365])
legend(['original','Mean','1','2','3'],loc=2)
In [224]:
for m in ['TMAX','SNWD']:
print 'Reconstruction Plots for '+ m
create_reconstructions(m)
In [ ]: