In [2]:
%pwd
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [3]:
data_dir = "./Data/Weather/"
%matplotlib inline
!mkdir -p $data_dir
!ls $data_dir


STAT.pickle                 data-source.txt             ghcnd-stations.txt          ghcnd-version.txt
SampleStations.pickle       ghcnd-readme.txt            ghcnd-stations_buffered.txt stations.pkl

Downloading Pickled data from S3

If SampleStations.pickle is not in the directory, get it using the following command


In [3]:
!curl -o $data_dir/SampleStations.pickle http://mas-dse-open.s3.amazonaws.com/Weather/SampleStations.pickle


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  398M  100  398M    0     0  7147k      0  0:00:57  0:00:57 --:--:-- 8749k

Plot Reconstructions

For each measurement, we take two sample stations with low residual error and two with high residual error and plot the following (a short sketch of the reconstruction computation follows this list):

  • Original Data Vector
  • Mean Vector
  • Reconstructed Data Vector using mean and top 1 eigenvector
  • Reconstructed Data Vector using mean and top 2 eigenvectors
  • Reconstructed Data Vector using mean and top 3 eigenvectors
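For reference, the reconstruction used throughout this notebook centers a data vector with the mean, projects it onto the top-k eigenvectors, and adds the mean back. Below is a minimal sketch of that computation; the helper name reconstruct is ours, and the example call uses STAT and a single-station vector T as they are defined in later cells.

import numpy as np

def reconstruct(x, mean, eigvec, k):
    """Rank-k reconstruction: mean + U_k U_k^T (x - mean)."""
    U = eigvec[:, :k]                               # top-k eigenvectors, shape (365, k)
    return mean + np.dot(np.dot(U, U.T), x - mean)  # project the centered vector, then add the mean back

# example (names defined later in the notebook):
# x_hat = reconstruct(T, STAT['TMAX']['Mean'], STAT['TMAX']['eigvec'], 3)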

Read and Preprocess data

Read data from the pickle files SampleStations.pickle and STAT.pickle

  • SampleStations.pickle contains information about the 6 measurements taken at a set of sample stations. You have been provided the code to process this file. It converts the nested structure into a list of lists, where each sublist has the following information:

['station','measurement','year','1', '2', '3', ...... , '365']

  • station - The station ID
  • measurement - One of the 6 types of measurements
  • year - The year in which the measurements were recorded
  • 1-365 - The actual value of measurement for each day of the year
  • STAT.pickle contains statistics about the weather data for each of the 6 measurements, together with a description of each statistic (a short inspection sketch follows).
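STAT is a dictionary keyed by measurement; the fields used later in this notebook are 'Mean', 'eigvec' and 'eigval'. A quick, optional way to inspect it, assuming it loads as in the next cell (the exact set of keys may differ):

import pickle
STAT, STAT_description = pickle.load(open(data_dir + 'STAT.pickle', 'r'))
print STAT.keys()              # the 6 measurement types
print STAT['TMAX'].keys()      # per-measurement statistics, e.g. 'Mean', 'eigvec', 'eigval'
print STAT_description         # human-readable description of each statistic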

In [183]:
import pickle
Data=pickle.load(open('./Data/Weather/SampleStations.pickle','r'))
STAT,STAT_description=pickle.load(open('./Data/Weather/STAT.pickle','r'))

In [184]:
FlatData=[]
for station in Data:
    stationname=station[0]                  # station ID
    for measurements in station[1]:
        measurement,year=measurements[0]    # measurement type and year, e.g. ('TMAX', 1996)
        yeardata=list(measurements[1])      # the 365 daily values
        rowData=[stationname]+[measurement]+[year]+yeardata
        FlatData.append(rowData)

In [185]:
import pandas as pd

frameheader=['station','measurement','year']+range(1,366)
df=pd.DataFrame(FlatData,columns=frameheader)

m_df={}
# split the flat frame into one DataFrame per measurement type
for m in ['TMIN', 'TOBS', 'TMAX', 'SNOW', 'SNWD', 'PRCP']:
    t_df=df[df['measurement']==m]
    m_df[m]=t_df

In [186]:
station_selector = m_df['TMAX']['station']=="USC00457015"
year_selector=m_df['TMAX']['year']==1996 #this year has no NaNs
station_sample = m_df['TMAX'][year_selector & station_selector]
T = station_sample.ix[:,3:].values[0] #one station's 365-day TMAX vector

Define Reconstruction Function

You need to plot reconstructions for two stations with low reconstruction error and two stations with high reconstruction error. To do this, you will need to do the following (a minimal sketch follows the list):

  1. Calculate the reconstruction error for the data vector reconstructed using mean and top-3 eigenvectors.
  2. Remove the rows for which the reconstruction error is NaN.
  3. Choose two with the lowest and two with the highest reconstruction error for plotting.
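A minimal sketch of these three steps for a single measurement (TMAX is used only as an example). It fills missing days with the mean before projecting and uses np.nanmean for the per-row error; the cells below take a different route (nan_to_num plus interpolation), so the selected indices may differ.

import numpy as np

m = 'TMAX'
X = np.array(m_df[m].ix[:, 3:], dtype=float)   # (n_rows, 365), may contain NaNs
mean = STAT[m]['Mean']
U3 = STAT[m]['eigvec'][:, :3]                  # top-3 eigenvectors

# Step 1: rank-3 reconstruction error for every row
Xf = np.where(np.isnan(X), mean, X)            # fill missing days with the mean
recon = mean + np.dot(np.dot(U3, U3.T), (Xf - mean).T).T
err = np.nanmean((X - recon)**2, axis=1)       # MSE over observed days only

# Step 2: drop rows whose error is NaN (rows with no observed days)
valid = np.flatnonzero(~np.isnan(err))

# Step 3: two rows with the lowest and two with the highest error
order = valid[np.argsort(err[valid])]
low_i, high_i = order[:2], order[-2:]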

In [ ]:


In [186]:
evecs = STAT['TMAX']['eigvec'] #(365, 365)
evals = STAT['TMAX']['eigval'] #(365,)
U=evecs

# with all 365 eigenvectors, U U^T is (numerically) the identity, so X simply reproduces T
X = np.dot(np.dot(U, U.T), T)

U = STAT['TMAX']['eigvec']
np.array(U[:,:3])


Out[186]:
array([[ 0.06778057, -0.05895165,  0.01073846],
       [ 0.06727524, -0.06248513,  0.01057388],
       [ 0.06744335, -0.06353619,  0.02212949],
       ..., 
       [ 0.0655475 , -0.05457212,  0.08083343],
       [ 0.0672239 , -0.05235861,  0.0786577 ],
       [ 0.06702058, -0.05001858,  0.08235398]])

In [220]:
# compute reconstructions using the top 1, 2 and 3 eigenvectors


m="TMIN"
stations = m_df[m]

T = samples = np.array(stations.ix[:,3:])
evecs = STAT[m]['eigvec']
Reconstructed={}
Reconstructed_errors={}

# note: no mean subtraction here; the final version in create_reconstructions below centers the data first
for i in range(1,4): #1,2,3
    U = evecs[:,0:i]
    UUT = np.dot(U, U.T) #(365,365)
    X = np.dot(UUT, T.T) #(365, 5830)
    Reconstructed[i] = X
    error = (T - X.T)**2 #(5830, 365)
    mse = np.mean(np.nan_to_num(error), axis=1) #(5830,)
    mse[mse==0]=np.nan #convert 0s back to nan
    Reconstructed_errors[i] = mse


#now find the 2 smallest and 2 largest errors
errors = Reconstructed_errors[3]
mask = np.isnan(errors)
errors[mask] = np.interp(np.flatnonzero(mask), np.flatnonzero(~mask), errors[~mask])
sorted_indices = errors.argsort()

low_i = sorted_indices[0:2]
high_i = sorted_indices[-2:]
print low_i
print high_i


[3177 4092]
[2402 2403]

In [143]:
errors


Out[143]:
array([ 3077.02251599,  3060.54665177,  3175.96512361, ...,   999.26158198,
        1026.38850635,  1026.38850635])

In [223]:
def create_reconstructions(m):

    ## Compute reconstructions using the mean and the top 1, 2 and 3 eigenvectors
    stations = m_df[m]
    T = samples = np.array(stations.ix[:,3:])
    norm_data = T - STAT[m]['Mean'] #center the data
    evecs = STAT[m]['eigvec']
    Reconstructed={}
    Reconstructed_errors={}
    original = T

    for i in range(1,4): #1,2,3
        U = evecs[:,0:i]
        UUT = np.dot(U, U.T) #(365,365)
        proj = np.dot(UUT, norm_data.T) #projection onto the top-i eigenvectors, still mean-centered #(365, n_rows)
        X = (proj.T + STAT[m]['Mean']).T #add the mean back for plotting #(365, n_rows)
        Reconstructed[i] = X

        error = (norm_data - proj.T)**2 #squared residual, computed in the centered space #(n_rows, 365)
        mse = np.mean(np.nan_to_num(error), axis=1) #(n_rows,)
        mse[mse==0]=np.nan #convert 0s back to nan
        Reconstructed_errors[i] = mse


    #now find the 2 smallest and 2 largest errors, based on the 3-eigenvector reconstruction
    errors = Reconstructed_errors[3]
    mask = np.isnan(errors)
    errors[mask] = np.interp(np.flatnonzero(mask), np.flatnonzero(~mask), errors[~mask])
    sorted_indices = errors.argsort()

    lower = sorted_indices[0:2]
    upper = sorted_indices[-2:]

    plt.figure(figsize=(20,30),dpi=300)
    j=1
    c=0
    for l in lower:
        subplot(4,2,j)
        j+=1
        c+=1

        plot(original[l])
        plot(STAT[m]['Mean'])
        plot(Reconstructed[1][:,l])
        plot(Reconstructed[2][:,l])
        plot(Reconstructed[3][:,l])
        title('#' + str(c) + ' Sample for ' + m + ' (low residual error)')
        xlim([0,365])
        legend(['original','Mean','1','2','3'],loc=2)

    c=0
    for l in upper:
        subplot(4,2,j)
        j+=1
        c+=1
        plot(original[l])
        plot(STAT[m]['Mean'])
        plot(Reconstructed[1][:,l])
        plot(Reconstructed[2][:,l])
        plot(Reconstructed[3][:,l])
        title('#' + str(c) + ' Sample for ' + m + ' (high residual error)')
        xlim([0,365])
        legend(['original','Mean','1','2','3'],loc=2)

In [224]:
for m in ['TMAX','SNWD']:
    print 'Reconstruction Plots for '+ m
    create_reconstructions(m)


Reconstruction Plots for TMAX
[2782 5079]
Reconstruction Plots for SNWD
[ 632 6063]

In [ ]: