In [1]:
#Follows example of tSNE in scikit learn for MNIST dataset
#http://alexanderfabisch.github.io/t-sne-in-scikit-learn.html
#With extra dimensionality reduction using PCA before tSNE calculation
#Created: 10.03.2016
#Last modified: 11.03.2016 Added decade column to use as labels for plotting tSNE results
# 14.03.2016 Added fn to select a cluster from the scatterplot, display it, and plot corresponding mean SLP
In [1]:
#%pylab inline
%pylab
In [3]:
#from matplotlib.pyplot import *
#%matplotlib inline
In [4]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
In [5]:
#pandas
import pandas as pd
import collections
import os
In [6]:
#netCDF4 is no longer installed in this environment, so the time axis was extracted
#offline (see the next cell); these imports would otherwise be used to read netCDF time:
#import netCDF4
#from netCDF4 import num2date, date2num, date2index
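In [ ]:
#If netCDF4 were available, the time axis could be read directly along these lines
#(a sketch only; the file path and the 'time' variable name are assumptions, not from the original run):
#import netCDF4
#from netCDF4 import num2date
#nc = netCDF4.Dataset('/path/to/slp_file.nc')
#t = nc.variables['time']
#dates = num2date(t[:], units=t.units, calendar=getattr(t, 'calendar', 'standard'))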
In [7]:
#READ IN TIME PRODUCED BY /home/users/cnangini/MIRROR/analogues/get_time.py
dt = pd.read_csv('/home/nils/data/cathy/time_out.dat',delim_whitespace=True,header=None)
dt.columns = ['Date', 'Time']
#Make 2 new cols for month and year obtained from col[0] date string
dt['Month'] = dt['Date'].str.split('-').str.get(1) #.astype(int)
dt['Year'] = dt['Date'].str.split('-').str.get(0).astype(int)
dt.head(5)
Out[7]:
In [8]:
#MAKE A SEASONS COL (1 = DJF, 2 = MAM, 3 = JJA, 4 = SON); a more concise mapping is sketched after this cell
#dt['Season'] = np.where(df['Month']=='Z', 'green', 'red')
#http://stackoverflow.com/questions/21733893/pandas-dataframe-add-a-field-based-on-multiple-if-statements
dt.loc[dt['Month'] == '12' ,'Season'] = 1
dt.loc[dt['Month'] == '01' ,'Season'] = 1
dt.loc[dt['Month'] == '02' ,'Season'] = 1
dt.loc[dt['Month'] == '03' ,'Season'] = 2
dt.loc[dt['Month'] == '04' ,'Season'] = 2
dt.loc[dt['Month'] == '05' ,'Season'] = 2
dt.loc[dt['Month'] == '06' ,'Season'] = 3
dt.loc[dt['Month'] == '07' ,'Season'] = 3
dt.loc[dt['Month'] == '08' ,'Season'] = 3
dt.loc[dt['Month'] == '09' ,'Season'] = 4
dt.loc[dt['Month'] == '10' ,'Season'] = 4
dt.loc[dt['Month'] == '11' ,'Season'] = 4
#dt.head(75)
dt.shape #(24800, 5)
Out[8]:
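In [ ]:
#A more concise equivalent of the month-to-season mapping above (a sketch, not run in the
#original session); it assumes 'Month' is the zero-padded string produced earlier.
month_to_season = {'12': 1, '01': 1, '02': 1,   #DJF
                   '03': 2, '04': 2, '05': 2,   #MAM
                   '06': 3, '07': 3, '08': 3,   #JJA
                   '09': 4, '10': 4, '11': 4}   #SON
#dt['Season'] = dt['Month'].map(month_to_season)   #yields ints rather than the floats produced by .loc above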
In [9]:
#MAKE A COL FOR ARCHIVE PERIOD
dt['archivePeriod'] = np.where(dt['Year'] < 1986, 0, 1)
dt.head(5)
#dt.shape #(24800, 6)
Out[9]:
In [10]:
arange(1956,1965).tolist() #quick check of arange behaviour: the upper bound is exclusive (1956..1964)
Out[10]:
In [11]:
#MAKE A COL FOR ARCHIVE DECADE (0 = pre-1956, 1 = 1956-1965, ..., 6 = 2006-2015)
#Per-range isin() lists are used here; an arithmetic alternative is sketched after this cell
#http://stackoverflow.com/questions/23461502/evaluating-pandas-series-values-with-logical-expressions-and-if-statements
dt.loc[dt['Year'] <= 1955 ,'Decade'] = 0
dt.loc[dt['Year'].isin(arange(1956,1966).tolist()) ,'Decade'] = 1
dt.loc[dt['Year'].isin(arange(1966,1976).tolist()) ,'Decade'] = 2
dt.loc[dt['Year'].isin(arange(1976,1986).tolist()) ,'Decade'] = 3
dt.loc[dt['Year'].isin(arange(1986,1996).tolist()) ,'Decade'] = 4
dt.loc[dt['Year'].isin(arange(1996,2006).tolist()) ,'Decade'] = 5
dt.loc[dt['Year'].isin(arange(2006,2016).tolist()) ,'Decade'] = 6
dt.shape #(24800, 7)
dt.iloc[[20000]]
dt.iloc[[12300]]
dt.loc[10000:10010]
dt.tail(10)
Out[11]:
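In [ ]:
#Equivalent decade labelling without the per-range isin() calls (a sketch for comparison,
#not run in the original session): valid for years up to 2015, with pre-1956 years clipped into bin 0.
#dt['Decade'] = np.clip((dt['Year'] - 1946) // 10, 0, 6)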
In [12]:
#WRITE SEASON, ARCHIVE PERIOD AND DECADE COLUMNS TO NDARRAY TO USE AS LABELS
season = dt.iloc[:,4].values
season = season.astype(np.int64)
len(season) #24800
season[len(season) - 1]
archivePeriod = dt.iloc[:,5].values
archivePeriod = archivePeriod.astype(np.int64)
len(archivePeriod) #24800
archivePeriod[20000]
decade = dt.iloc[:,6].values
decade = decade.astype(np.int64)
len(decade) #24800
decade[20000]
Out[12]:
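In [ ]:
#Equivalent, name-based extraction of the label arrays (a sketch for readability; the
#positional iloc version above gives the same result as long as the column order is unchanged).
#season = dt['Season'].values.astype(np.int64)
#archivePeriod = dt['archivePeriod'].values.astype(np.int64)
#decade = dt['Decade'].values.astype(np.int64)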
In [13]:
#MAP IMAGE INFO
#Each map is a matrix of 53 LON x 20 LAT
#Full dataset has 53 LON x 20 LAT x 24800 MAPS (1 map per date)
In [14]:
53*20*24800 #total number of SLP values expected in the full dataset (26,288,000)
Out[14]:
In [15]:
53*20 #grid points per map (1060)
Out[15]:
In [17]:
# Create dataframe of SLP data from netCDF file
#File testI8.dat (header manually removed and stored in testI8_header.dat)
# 5,456,000 lines x 5 cols => every 11 lines (55 values, 2 of them NaN padding) hold the 53 LON values for 1 LAT row.
# Thus there are 5,456,000/11 = 496,000 LAT values in total, corresponding to
# 496,000/20 = 24,800 images (or dates)
# Reshape to 24,800 DATE rows x (53*20 = 1060) cols = 26,288,000 values
#Target (labels) file therefore is a 1D array of 24,800 elements
df = pd.read_csv('/home/nils/data/cathy/testI8.dat',delim_whitespace=True,header=None)
In [18]:
df.head(5)
#df.index.name
Out[18]:
In [19]:
#testI8.dat: 5,456,000 rows x 5 cols
df.shape
Out[19]:
In [20]:
df.shape[0]//(11*20) #number of maps/dates: 5,456,000/(11*20) = 24,800
Out[20]:
In [21]:
#CONVERT TO (numDATE = 24800) x (53 LON * 20 LAT = 1060) MATRIX
row = df.shape[0]//(11*20) #number of dates in dataset (11 lines/lat, 20 lat/date); integer division so reshape gets an int
col = 53 * 20
#CONVERT DF TO MATRIX AND RESHAPE, OMITTING NaNs
adata = df.values[np.logical_not(np.isnan(df.values))].reshape(row,col)
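In [ ]:
#Quick sanity checks on the reshape above (not part of the original session): there should be
#one 1060-value row per date, with all of the NaN padding removed.
assert adata.shape == (row, col)   #(24800, 1060)
assert not np.isnan(adata).any()   #all padding NaNs were stripped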
In [63]:
len(season)
Out[63]:
In [23]:
#CONVERT adata (read from testI8.dat) TO A DATAFRAME SO THAT SLP ANOMALIES CAN BE COMPUTED
dA = pd.DataFrame(adata)
#column-wise mean over the full record for each grid point: dA.mean(axis=0) #shape (1060,)
#SLP "anomaly" here = ratio of each value to that full-record mean (not a difference from it)
dA_yrAnom = dA.div(dA.mean(axis=0)) #shape (24800, 1060)
dA.head(5)
Out[23]:
In [24]:
#LOOK AT INDIVIDUAL IMAGES OF RAW SLP
#SLP, RAW, IMAGE 0
img_all = df.values[np.logical_not(np.isnan(df.values))].reshape(row,20,53)
img_all.shape
fig = figure(figsize=(10, 10))
img1 = img_all[0]
title('SLP RAW, IMG 0')
plt.imshow(img1)
plt.colorbar()
plt.gca().invert_yaxis()
#plt.gca().invert_xaxis()
#Correct. Compare with http://webportals.ipsl.jussieu.fr/ScientificApps/dev/forge_cathy/analogues/maps2_run.php?command=fill/line/lev=%28-inf%29%28995,1025,5%29%28inf%29/color=1/pal=mpl_PSU_plasma/key&dateStart=01-JAN-1948&dateEnd=01-JAN-1948
#SLP, RAW, IMAGE 20000
#fig = figure(figsize=(10, 10))
#title('SLP RAW, IMG 20000')
#imgn = img_all[20000]
#plt.imshow(imgn)
#plt.colorbar()
#plt.gca().invert_yaxis()
#plt.show()
In [25]:
#LOOK AT INDIVIDUAL IMAGES OF SLP ANOMALY (NORMALIZED BY MEAN OVER REF PERIOD)
fig = figure(figsize=(10, 10))
img0 = dA_yrAnom.iloc[0,:].values.reshape(20,53)
title('SLP YEARLY ANOMALY, IMG 0')
plt.imshow(img0)
plt.colorbar()
plt.gca().invert_yaxis()
fig = figure(figsize=(10, 10))
imgn = dA_yrAnom.iloc[20000,:].values.reshape(20,53)
title('SLP YEARLY ANOMALY, IMG 20000')
plt.imshow(imgn)
plt.colorbar()
plt.gca().invert_yaxis()
In [26]:
#ADD SEASON TO ANOMALY DF TO CALCULATE SEASONAL ANOMALIES
dA['Season'] = season
dA_DJF = dA.loc[dA['Season'] == 1] #shape (6106, 1061)
dA_MAM = dA.loc[dA['Season'] == 2] #shape (6256, 1061)
dA_JJA = dA.loc[dA['Season'] == 3] #shape (6256, 1061)
dA_SON = dA.loc[dA['Season'] == 4] #shape (6182, 1061)
dA_DJF = dA_DJF.drop(['Season'], axis=1)
dA_MAM = dA_MAM.drop(['Season'], axis=1)
dA_JJA = dA_JJA.drop(['Season'], axis=1)
dA_SON = dA_SON.drop(['Season'], axis=1)
#column-wise seasonal mean for each grid point: dA_DJF.mean(axis=0) #shape (1060,)
#seasonal SLP anomaly (ratio to the seasonal mean)
dA_DJFAnom = dA_DJF.div(dA_DJF.mean(axis=0)) #shape (6106, 1060)
dA_MAMAnom = dA_MAM.div(dA_MAM.mean(axis=0)) #shape (6256, 1060)
dA_JJAAnom = dA_JJA.div(dA_JJA.mean(axis=0)) #shape (6256, 1060)
dA_SONAnom = dA_SON.div(dA_SON.mean(axis=0)) #shape (6182, 1060)
dA_JJA.head(5)
Out[26]:
In [27]:
#LOOK AT INDIVIDUAL IMAGES OF DJF SLP ANOMALY
fig = figure(figsize=(10, 10))
img0 = dA_DJFAnom.iloc[0,:].values.reshape(20,53)
title('SLP DJF ANOMALY, IMG 0')
plt.imshow(img0)
plt.colorbar()
plt.gca().invert_yaxis()
In [28]:
# Plotting function (simplified from the MNIST t-SNE example linked at the top;
# X, name and min_dist are kept for signature compatibility but are not used here)
def plot_mnist(X, y, X_embedded, name, min_dist=10.0):
    fig = figure(figsize=(10, 10))
    ax = axes(frameon=True)
    subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                    wspace=0.0, hspace=0.0)
    scatter(X_embedded[:, 0], X_embedded[:, 1],
            c=y, marker="x")
In [29]:
#Define arrays for tSNE calculation
#X, y = mnist.data / 255.0, mnist.target #data, labels
X = adata #24800 x 1060
y = season #24800 x 1
In [30]:
# Create subset and reduce to first 50 dimensions using PCA
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
n_train_samples = 5000
X_pca = PCA(n_components=50).fit_transform(X) #shape (24800, 50)
X_train = X_pca[indices[:n_train_samples]] #shape (5000, 50)
y_train = y[indices[:n_train_samples]] #shape (5000,)
y_train_archivePeriod = archivePeriod[indices[:n_train_samples]]
y_train_decade = decade[indices[:n_train_samples]]
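In [ ]:
#Reproducibility note (a sketch, not run in the original session): the shuffle above is
#unseeded, so the 5000-point subset differs between runs; seeding before shuffling would fix it.
#np.random.seed(0)   #0 is an arbitrary choice
#np.random.shuffle(indices)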
In [42]:
#RUN TSNE ON SUBSET OF 5000 PTS
X_train_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_train) #shape (5000, 2)
In [ ]:
#RUN TSNE ON SLP DATA
#Use all datapoints
#NB: do this in ipython, not this notebook!
X_pca = PCA(n_components=50).fit_transform(X)
X_all = X_pca
y_all = season
X_all_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_all)
#plot
plot_mnist(X_all, y_all, X_all_embedded,"t-SNE", min_dist=20.0)
In [59]:
PCA?
In [55]:
X_all
Out[55]:
In [58]:
# Save to file
filename = '/home/nils/data/cathy/tsne_scikitlearn_analogues_allpts.npy'
#np.save(filename, X_all_embedded)
# Read from file (filename must be defined for np.load even when the save is commented out)
X_tsne = np.load(filename) #shape (24800, 2)
plot_mnist(X_all, y_all, X_tsne,"t-SNE", min_dist=20.0)
In [40]:
#RUN TSNE ON YEARLY SLP ANOMALIES
#X_yrAnom = dA_yrAnom.values #24800 x 1060
label_season = season #24800 x 1
#performed in ipython
### TODO: also run with k-means clustering (sklearn.cluster.KMeans); a sketch follows this cell
#X_yrAnom_pca = PCA(n_components=50).fit_transform(X_yrAnom)
#X_yrAnom_tsne = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_yrAnom)
#[t-SNE] Computing pairwise distances...
#[t-SNE] Computing 121 nearest neighbors...
#[t-SNE] Computed conditional probabilities for samples 1000 ... 24800 / 24800 (progress lines condensed)
#[t-SNE] Mean sigma: 0.067055
#[t-SNE] Iteration 25: error = 0.8297511, gradient norm = 0.0000106
#[t-SNE] Iteration 25: gradient norm 0.000011. Finished.
#[t-SNE] Iteration 50: error = 0.8297430, gradient norm = 0.0007347
#[t-SNE] Iteration 50: gradient norm 0.000735. Finished.
#[t-SNE] Error after 50 iterations with early exaggeration: 0.829743
#[t-SNE] Iteration 75: error = 0.8283891, gradient norm = 0.0033106
#[t-SNE] Iteration 100: error = 0.8274307, gradient norm = 0.0025482
#[t-SNE] Iteration 125: error = 0.8271999, gradient norm = 0.0024152
#[t-SNE] Iteration 150: error = 0.8271373, gradient norm = 0.0023812
#[t-SNE] Iteration 175: error = 0.8271213, gradient norm = 0.0023721
#[t-SNE] Iteration 200: error = 0.8271173, gradient norm = 0.0023697
#[t-SNE] Iteration 200: error difference 0.000000. Finished.
#[t-SNE] Error after 200 iterations: 0.827117
filename = '/home/nils/data/cathy/tsne_yrAnom.npy'
#np.save(filename, X_yrAnom_tsne)
# Load TSNE results from file (filename must be defined for np.load even when the save is commented out)
X_yrAnom_tsne = np.load(filename) #shape (24800, 2)
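In [ ]:
#A minimal sketch of the k-means clustering mentioned above (not run in the original session):
#cluster the 2-D t-SNE embedding of the yearly anomalies. n_clusters=4 is an assumption made
#here for illustration, not a value taken from the original analysis.
from sklearn.cluster import KMeans
km = KMeans(n_clusters=4, random_state=0)
cluster_labels = km.fit_predict(X_yrAnom_tsne)   #shape (24800,)
#optional visualization of the clusters on the embedding:
#scatter(X_yrAnom_tsne[:, 0], X_yrAnom_tsne[:, 1], c=cluster_labels, marker="x")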
In [43]:
#PLOT TSNE EMBEDDING, ALL SEASONS
#NB: do not overwrite X_yrAnom_tsne with X_train_embedded (the 5000-point subset) here:
#label_season and the DataFrames built below expect the full 24800-point embedding loaded above
#ALL SEASONS
fig = figure(figsize=(10, 10))
ax = axes(frameon=True)
title("TSNE with PCA reduction on yearly SLP anomalies")
scatter(X_yrAnom_tsne[:, 0], X_yrAnom_tsne[:, 1], c=label_season, marker="x")
#scatter(X_tsne[:, 0], X_tsne[:, 1], c=season, marker="x") #RAW SLP DATA, NOT ANOMALIES
colorbar(ticks=range(4))
clim(1, 4)
text(5.5, 3.9, "SON", fontsize=12)
text(5.5, 1.3, "JJA", fontsize=12)
text(5.5, -1.4, "MAM", fontsize=12)
text(5.5, -4, "DJF", fontsize=12)
In [39]:
#CONVERT TSNE MATRIX TO DATAFRAME SO THAT IT CAN BE FILTERED
#Add Date, Season, archivePeriod and Decade (from dt) as columns for filtering
dTSNE = pd.DataFrame(X_yrAnom_tsne) #pd.DataFrame(X_tsne) #RAW SLP
dTSNE['Date'] = dt['Date']
dTSNE['Season'] = dt['Season']
dTSNE['archivePeriod'] = archivePeriod
dTSNE['Decade'] = decade
dTSNE.columns = ['xcoord', 'ycoord', 'Date', 'Season', 'archivePeriod', 'Decade']
dTSNE.head(10)
Out[39]:
In [40]:
#ADD COLUMNS TO dA_yrAnom DATAFRAME TO ALLOW FILTERING
#VARIOUS LABEL COLUMNS
dA_yrAnom['Date'] = dt['Date']
dA_yrAnom['Season'] = dt['Season']
dA_yrAnom['archivePeriod'] = archivePeriod
dA_yrAnom['Decade'] = decade
#X, Y COORDS OF THE TSNE EMBEDDING SO EVERYTHING IS IN ONE DATAFRAME AND CAN BE FILTERED AS DESIRED
dA_yrAnom['x_TSNE'] = dTSNE['xcoord']
dA_yrAnom['y_TSNE'] = dTSNE['ycoord']
In [188]:
dA_yrAnom.tail(5)
Out[188]:
In [41]:
#PLOT TSNE BY SEASON
#DJF
fig = figure(figsize=(10, 10))
title("TSNE FROM YEARLY ANOMALY, DJF only")
scatter(dTSNE.loc[dTSNE['Season'] == 1, 'xcoord'], dTSNE.loc[dTSNE['Season'] == 1, 'ycoord'], marker="x")
#savefig("TSNE_DJF.png")
#JJA
fig = figure(figsize=(10, 10))
title("TSNE FROM YEARLY ANOMALY, JJA only")
scatter(dTSNE.loc[dTSNE['Season'] == 3, 'xcoord'], dTSNE.loc[dTSNE['Season'] == 3, 'ycoord'], marker="x")
#savefig("TSNE_JJA.png")
Out[41]:
In [42]:
#PLOTTING FNS
#PLOT POINTS IN A SPECIFIED BOUNDING BOX ON THE EMBEDDING PLOT
def plot_cluster(df, xbox, ybox, colour):
    #select the rows whose TSNE coordinates fall inside the box
    df_cl = df.loc[(df['x_TSNE'] > xbox[0]) & (df['x_TSNE'] < xbox[1]) & (df['y_TSNE'] > ybox[0]) & (df['y_TSNE'] < ybox[1])]
    scatter(df_cl['x_TSNE'], df_cl['y_TSNE'], c=colour)
    #print(df_cl)
#PLOT MEAN CLUSTER SLP
def plot_clusterSLP(df, xbox, ybox, title1, idx):
    df_cl = df.loc[(df['x_TSNE'] > xbox[0]) & (df['x_TSNE'] < xbox[1]) & (df['y_TSNE'] > ybox[0]) & (df['y_TSNE'] < ybox[1])]
    subplot(4,1,idx)
    title(title1)
    #drop the label/coordinate columns, average the 1060 grid-point columns and show as a 20x53 map
    plt.imshow(df_cl.drop(['Date', 'Season', 'archivePeriod', 'Decade', 'x_TSNE', 'y_TSNE'], axis=1).mean().values.reshape(20,53))
    colorbar()
In [43]:
#PLOT TSNE DJF CLUSTERS
TSNE_DJF = dA_yrAnom.loc[dA_yrAnom['Season'] == 1]
fig = figure(figsize=(10, 10))
title('TSNE FROM YEARLY ANOMALY, DJF')
scatter(TSNE_DJF['x_TSNE'].values, TSNE_DJF['y_TSNE'].values, marker="x")
plot_cluster(TSNE_DJF, [-1.5, -0.5], [-2.4, -1.9], 'r')
plot_cluster(TSNE_DJF, [-2.6, -1.2], [1.8, 2.6], 'g')
plot_cluster(TSNE_DJF, [1.8, 2.7], [0.7, 1.5], 'y')
plot_cluster(TSNE_DJF, [2.1, 3.0], [-0.8, 0.2], 'm')
#outliers
plot_cluster(TSNE_DJF, [-0.5, -0.3], [-3.1, -3.0], 'r') #2 dates: 1980-02-19, 1996-01-15
plot_cluster(TSNE_DJF, [-2.20, -2.10], [-2.60, -2.45], 'y') #4 dates: 1976-01-25, 1989-01-14, 2003-01-30, 2015-01-08
In [44]:
#PLOT TSNE DJF MEAN CLUSTER SLP
fig = figure(figsize=(10, 10))
plot_clusterSLP(TSNE_DJF, [-1.5, -0.5], [-2.4, -1.9], 'Mean DJF SLP, TSNE red cluster', 1)
plot_clusterSLP(TSNE_DJF, [-2.6, -1.2], [1.8, 2.6], 'Mean DJF SLP, TSNE green cluster', 2)
plot_clusterSLP(TSNE_DJF, [1.8, 2.7], [0.7, 1.5], 'Mean DJF SLP, TSNE yellow cluster', 3)
plot_clusterSLP(TSNE_DJF, [2.1, 3.0], [-0.8, 0.2], 'Mean DJF SLP, TSNE magenta cluster', 4)
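In [ ]:
#Hedged helper sketch (not in the original notebook): list the dates that fall inside a cluster
#box, using the same bounding-box filter as plot_cluster / plot_clusterSLP above; the box below
#is the red DJF cluster plotted earlier, and cluster_dates is a name introduced here.
def cluster_dates(df, xbox, ybox):
    m = ((df['x_TSNE'] > xbox[0]) & (df['x_TSNE'] < xbox[1]) &
         (df['y_TSNE'] > ybox[0]) & (df['y_TSNE'] < ybox[1]))
    return df.loc[m, 'Date'].tolist()
cluster_dates(TSNE_DJF, [-1.5, -0.5], [-2.4, -1.9])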