In [1]:
#Follows example of tSNE in scikit learn for MNIST dataset
#http://alexanderfabisch.github.io/t-sne-in-scikit-learn.html
#With extra dimensionality reduction using PCA before tSNE calculation
#Created: 10.03.2016
#Last modified: 11.03.2016 Added decade column to use as labels for plotting tSNE results
# 14.03.2016 Added fn to select a cluster from the scatterplot, display it, and plot corresponding mean SLP
In [1]:
#%pylab inline
%pylab
In [3]:
#from matplotlib.pyplot import *
#%matplotlib inline
In [4]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
In [5]:
#pandas
import pandas as pd
import collections
import os
In [6]:
#netCDF4 is no longer installed in this environment, so the time axis was extracted
#offline (see the next cell); these imports would otherwise be used to read netCDF time:
#import netCDF4
#from netCDF4 import num2date, date2num, date2index
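In [ ]:
#If netCDF4 were available, the time axis could be read directly along these lines
#(a sketch only; the file path and the 'time' variable name are assumptions, not from the original run):
#import netCDF4
#from netCDF4 import num2date
#nc = netCDF4.Dataset('/path/to/slp_file.nc')
#t = nc.variables['time']
#dates = num2date(t[:], units=t.units, calendar=getattr(t, 'calendar', 'standard'))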
In [7]:
#READ IN TIME PRODUCED BY /home/users/cnangini/MIRROR/analogues/get_time.py
dt = pd.read_csv('/home/nils/data/cathy/time_out.dat',delim_whitespace=True,header=None)
dt.columns = ['Date', 'Time']
#Make 2 new cols for month and year obtained from col[0] date string
dt['Month'] = dt['Date'].str.split('-').str.get(1) #.astype(int)
dt['Year'] = dt['Date'].str.split('-').str.get(0).astype(int)
dt.head(5)
Out[7]:
In [8]:
#MAKE A SEASONS COL (1 = DJF, 2 = MAM, 3 = JJA, 4 = SON); a more concise mapping is sketched after this cell
#dt['Season'] = np.where(df['Month']=='Z', 'green', 'red')
#http://stackoverflow.com/questions/21733893/pandas-dataframe-add-a-field-based-on-multiple-if-statements
dt.loc[dt['Month'] == '12' ,'Season'] = 1
dt.loc[dt['Month'] == '01' ,'Season'] = 1
dt.loc[dt['Month'] == '02' ,'Season'] = 1
dt.loc[dt['Month'] == '03' ,'Season'] = 2
dt.loc[dt['Month'] == '04' ,'Season'] = 2
dt.loc[dt['Month'] == '05' ,'Season'] = 2
dt.loc[dt['Month'] == '06' ,'Season'] = 3
dt.loc[dt['Month'] == '07' ,'Season'] = 3
dt.loc[dt['Month'] == '08' ,'Season'] = 3
dt.loc[dt['Month'] == '09' ,'Season'] = 4
dt.loc[dt['Month'] == '10' ,'Season'] = 4
dt.loc[dt['Month'] == '11' ,'Season'] = 4
#dt.head(75)
dt.shape #(24800, 5)
Out[8]:
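In [ ]:
#A more concise equivalent of the month-to-season mapping above (a sketch, not run in the
#original session); it assumes 'Month' is the zero-padded string produced earlier.
month_to_season = {'12': 1, '01': 1, '02': 1,   #DJF
                   '03': 2, '04': 2, '05': 2,   #MAM
                   '06': 3, '07': 3, '08': 3,   #JJA
                   '09': 4, '10': 4, '11': 4}   #SON
#dt['Season'] = dt['Month'].map(month_to_season)   #yields ints rather than the floats produced by .loc above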
In [9]:
#MAKE A COL FOR ARCHIVE PERIOD
dt['archivePeriod'] = np.where(dt['Year'] < 1986, 0, 1)
dt.head(5)
#dt.shape #(24800, 6)
Out[9]:
In [10]:
arange(1956,1965).tolist() #quick check of arange behaviour: the upper bound is exclusive (1956..1964)
Out[10]:
In [11]:
#MAKE A COL FOR ARCHIVE DECADE (0 = pre-1956, 1 = 1956-1965, ..., 6 = 2006-2015)
#Per-range isin() lists are used here; an arithmetic alternative is sketched after this cell
#http://stackoverflow.com/questions/23461502/evaluating-pandas-series-values-with-logical-expressions-and-if-statements
dt.loc[dt['Year'] <= 1955 ,'Decade'] = 0
dt.loc[dt['Year'].isin(arange(1956,1966).tolist()) ,'Decade'] = 1
dt.loc[dt['Year'].isin(arange(1966,1976).tolist()) ,'Decade'] = 2
dt.loc[dt['Year'].isin(arange(1976,1986).tolist()) ,'Decade'] = 3
dt.loc[dt['Year'].isin(arange(1986,1996).tolist()) ,'Decade'] = 4
dt.loc[dt['Year'].isin(arange(1996,2006).tolist()) ,'Decade'] = 5
dt.loc[dt['Year'].isin(arange(2006,2016).tolist()) ,'Decade'] = 6
dt.shape #(24800, 7)
dt.iloc[[20000]]
dt.iloc[[12300]]
dt.loc[10000:10010]
dt.tail(10)
Out[11]:
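In [ ]:
#Equivalent decade labelling without the per-range isin() calls (a sketch for comparison,
#not run in the original session): valid for years up to 2015, with pre-1956 years clipped into bin 0.
#dt['Decade'] = np.clip((dt['Year'] - 1946) // 10, 0, 6)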
In [12]:
#WRITE SEASON, ARCHIVE PERIOD AND DECADE COLUMNS TO NDARRAY TO USE AS LABELS
season = dt.iloc[:,4].values
season = season.astype(np.int64)
len(season) #24800
season[len(season) - 1]
archivePeriod = dt.iloc[:,5].values
archivePeriod = archivePeriod.astype(np.int64)
len(archivePeriod) #24800
archivePeriod[20000]
decade = dt.iloc[:,6].values
decade = decade.astype(np.int64)
len(decade) #24800
decade[20000]
Out[12]:
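In [ ]:
#Equivalent, name-based extraction of the label arrays (a sketch for readability; the
#positional iloc version above gives the same result as long as the column order is unchanged).
#season = dt['Season'].values.astype(np.int64)
#archivePeriod = dt['archivePeriod'].values.astype(np.int64)
#decade = dt['Decade'].values.astype(np.int64)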
In [13]:
#MAP IMAGE INFO
#Each map is a matrix of 53 LON x 20 LAT
#Full dataset has 53 LON x 20 LAT x 24800 MAPS (1 map per date)
In [14]:
53*20*24800 #total number of SLP values expected in the full dataset (26,288,000)
Out[14]:
In [15]:
53*20 #grid points per map (1060)
Out[15]:
In [17]:
# Create dataframe of SLP data from netCDF file
#File testI8.dat (header manually removed and stored in testI8_header.dat)
# 5,456,000 lines x 5 cols => every 11 lines (55 values, 2 of them NaN padding) hold the 53 LON values for 1 LAT row.
# Thus there are 5,456,000/11 = 496,000 LAT values in total, corresponding to
# 496,000/20 = 24,800 images (or dates)
# Reshape to 24,800 DATE rows x (53*20 = 1060) cols = 26,288,000 values
#Target (labels) file therefore is a 1D array of 24,800 elements
df = pd.read_csv('/home/nils/data/cathy/testI8.dat',delim_whitespace=True,header=None)
In [18]:
df.head(5)
#df.index.name
Out[18]:
In [19]:
#testI8.dat: 5,456,000 rows x 5 cols
df.shape
Out[19]:
In [20]:
df.shape[0]//(11*20) #number of maps/dates: 5,456,000/(11*20) = 24,800
Out[20]:
In [21]:
#CONVERT TO (numDATE = 24800) x (53 LON * 20 LAT = 1060) MATRIX
row = df.shape[0]//(11*20) #number of dates in dataset (11 lines/lat, 20 lat/date); integer division so reshape gets an int
col = 53 * 20
#CONVERT DF TO MATRIX AND RESHAPE, OMITTING NaNs
adata = df.values[np.logical_not(np.isnan(df.values))].reshape(row,col)
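In [ ]:
#Quick sanity checks on the reshape above (not part of the original session): there should be
#one 1060-value row per date, with all of the NaN padding removed.
assert adata.shape == (row, col)   #(24800, 1060)
assert not np.isnan(adata).any()   #all padding NaNs were stripped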
In [63]:
len(season)
Out[63]:
In [23]:
#CONVERT adata (read from testI8.dat) TO A DATAFRAME SO THAT SLP ANOMALIES CAN BE COMPUTED
dA = pd.DataFrame(adata)
#column-wise mean over the full record for each grid point: dA.mean(axis=0) #shape (1060,)
#SLP "anomaly" here = ratio of each value to that full-record mean (not a difference from it)
dA_yrAnom = dA.div(dA.mean(axis=0)) #shape (24800, 1060)
dA.head(5)
Out[23]:
In [24]:
#LOOK AT INDIVIDUAL IMAGES OF RAW SLP
#SLP, RAW, IMAGE 0
img_all = df.values[np.logical_not(np.isnan(df.values))].reshape(row,20,53)
img_all.shape
fig = figure(figsize=(10, 10))
img1 = img_all[0]
title('SLP RAW, IMG 0')
plt.imshow(img1)
plt.colorbar()
plt.gca().invert_yaxis()
#plt.gca().invert_xaxis()
#Correct. Compare with http://webportals.ipsl.jussieu.fr/ScientificApps/dev/forge_cathy/analogues/maps2_run.php?command=fill/line/lev=%28-inf%29%28995,1025,5%29%28inf%29/color=1/pal=mpl_PSU_plasma/key&dateStart=01-JAN-1948&dateEnd=01-JAN-1948
#SLP, RAW, IMAGE 20000
#fig = figure(figsize=(10, 10))
#title('SLP RAW, IMG 20000')
#imgn = img_all[20000]
#plt.imshow(imgn)
#plt.colorbar()
#plt.gca().invert_yaxis()
#plt.show()
In [25]:
#LOOK AT INDIVIDUAL IMAGES OF SLP ANOMALY (NORMALIZED BY MEAN OVER REF PERIOD)
fig = figure(figsize=(10, 10))
img0 = dA_yrAnom.iloc[0,:].values.reshape(20,53)
title('SLP YEARLY ANOMALY, IMG 0')
plt.imshow(img0)
plt.colorbar()
plt.gca().invert_yaxis()
fig = figure(figsize=(10, 10))
imgn = dA_yrAnom.iloc[20000,:].values.reshape(20,53)
title('SLP YEARLY ANOMALY, IMG 20000')
plt.imshow(imgn)
plt.colorbar()
plt.gca().invert_yaxis()
In [26]:
#ADD SEASON TO ANOMALY DF TO CALCULATE SEASONAL ANOMALIES
dA['Season'] = season
dA_DJF = dA.loc[dA['Season'] == 1] #shape (6106, 1061)
dA_MAM = dA.loc[dA['Season'] == 2] #shape (6256, 1061)
dA_JJA = dA.loc[dA['Season'] == 3] #shape (6256, 1061)
dA_SON = dA.loc[dA['Season'] == 4] #shape (6182, 1061)
dA_DJF = dA_DJF.drop(['Season'], axis=1)
dA_MAM = dA_MAM.drop(['Season'], axis=1)
dA_JJA = dA_JJA.drop(['Season'], axis=1)
dA_SON = dA_SON.drop(['Season'], axis=1)
#column-wise seasonal mean for each grid point: dA_DJF.mean(axis=0) #shape (1060,)
#seasonal SLP anomaly (ratio to the seasonal mean)
dA_DJFAnom = dA_DJF.div(dA_DJF.mean(axis=0)) #shape (6106, 1060)
dA_MAMAnom = dA_MAM.div(dA_MAM.mean(axis=0)) #shape (6256, 1060)
dA_JJAAnom = dA_JJA.div(dA_JJA.mean(axis=0)) #shape (6256, 1060)
dA_SONAnom = dA_SON.div(dA_SON.mean(axis=0)) #shape (6182, 1060)
dA_JJA.head(5)
Out[26]:
In [27]:
#LOOK AT INDIVIDUAL IMAGES OF DJF SLP ANOMALY
fig = figure(figsize=(10, 10))
img0 = dA_DJFAnom.iloc[0,:].values.reshape(20,53)
title('SLP DJF ANOMALY, IMG 0')
plt.imshow(img0)
plt.colorbar()
plt.gca().invert_yaxis()
In [28]:
# Plotting function (simplified from the MNIST t-SNE example linked at the top;
# X, name and min_dist are kept for signature compatibility but are not used here)
def plot_mnist(X, y, X_embedded, name, min_dist=10.0):
    fig = figure(figsize=(10, 10))
    ax = axes(frameon=True)
    subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                    wspace=0.0, hspace=0.0)
    scatter(X_embedded[:, 0], X_embedded[:, 1],
            c=y, marker="x")
In [29]:
#Define arrays for tSNE calculation
#X, y = mnist.data / 255.0, mnist.target #data, labels
X = adata #24800 x 1060
y = season #24800 x 1
In [30]:
# Create subset and reduce to first 50 dimensions using PCA
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
n_train_samples = 5000
X_pca = PCA(n_components=50).fit_transform(X) #shape (24800, 50)
X_train = X_pca[indices[:n_train_samples]] #shape (5000, 50)
y_train = y[indices[:n_train_samples]] #shape (5000,)
y_train_archivePeriod = archivePeriod[indices[:n_train_samples]]
y_train_decade = decade[indices[:n_train_samples]]
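In [ ]:
#Reproducibility note (a sketch, not run in the original session): the shuffle above is
#unseeded, so the 5000-point subset differs between runs; seeding before shuffling would fix it.
#np.random.seed(0)   #0 is an arbitrary choice
#np.random.shuffle(indices)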
In [42]:
#RUN TSNE ON SUBSET OF 5000 PTS
X_train_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_train) #shape (5000, 2)
In [ ]:
#RUN TSNE ON SLP DATA
#Use all datapoints
#NB: do this in ipython, not this notebook!
X_pca = PCA(n_components=50).fit_transform(X)
X_all = X_pca
y_all = season
X_all_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_all)
#plot
plot_mnist(X_all, y_all, X_all_embedded,"t-SNE", min_dist=20.0)
In [59]:
PCA?
In [55]:
X_all
Out[55]:
In [58]:
# Save to file
filename = '/home/nils/data/cathy/tsne_scikitlearn_analogues_allpts.npy'
#np.save(filename, X_all_embedded)
# Read from file (filename must be defined for np.load even when the save is commented out)
X_tsne = np.load(filename) #shape (24800, 2)
plot_mnist(X_all, y_all, X_tsne,"t-SNE", min_dist=20.0)
In [40]:
#RUN TSNE ON YEARLY SLP ANOMALIES
#X_yrAnom = dA_yrAnom.values #24800 x 1060
label_season = season #24800 x 1
#performed in ipython
### TODO: also run with k-means clustering (sklearn.cluster.KMeans); a sketch follows this cell
#X_yrAnom_pca = PCA(n_components=50).fit_transform(X_yrAnom)
#X_yrAnom_tsne = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_yrAnom)
#[t-SNE] Computing pairwise distances...
#[t-SNE] Computing 121 nearest neighbors...
#[t-SNE] Computed conditional probabilities for samples 1000 ... 24800 / 24800 (progress lines condensed)
#[t-SNE] Mean sigma: 0.067055
#[t-SNE] Iteration 25: error = 0.8297511, gradient norm = 0.0000106
#[t-SNE] Iteration 25: gradient norm 0.000011. Finished.
#[t-SNE] Iteration 50: error = 0.8297430, gradient norm = 0.0007347
#[t-SNE] Iteration 50: gradient norm 0.000735. Finished.
#[t-SNE] Error after 50 iterations with early exaggeration: 0.829743
#[t-SNE] Iteration 75: error = 0.8283891, gradient norm = 0.0033106
#[t-SNE] Iteration 100: error = 0.8274307, gradient norm = 0.0025482
#[t-SNE] Iteration 125: error = 0.8271999, gradient norm = 0.0024152
#[t-SNE] Iteration 150: error = 0.8271373, gradient norm = 0.0023812
#[t-SNE] Iteration 175: error = 0.8271213, gradient norm = 0.0023721
#[t-SNE] Iteration 200: error = 0.8271173, gradient norm = 0.0023697
#[t-SNE] Iteration 200: error difference 0.000000. Finished.
#[t-SNE] Error after 200 iterations: 0.827117
filename = '/home/nils/data/cathy/tsne_yrAnom.npy'
#np.save(filename, X_yrAnom_tsne)
# Load TSNE results from file (filename must be defined for np.load even when the save is commented out)
X_yrAnom_tsne = np.load(filename) #shape (24800, 2)
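In [ ]:
#A minimal sketch of the k-means clustering mentioned above (not run in the original session):
#cluster the 2-D t-SNE embedding of the yearly anomalies. n_clusters=4 is an assumption made
#here for illustration, not a value taken from the original analysis.
from sklearn.cluster import KMeans
km = KMeans(n_clusters=4, random_state=0)
cluster_labels = km.fit_predict(X_yrAnom_tsne)   #shape (24800,)
#optional visualization of the clusters on the embedding:
#scatter(X_yrAnom_tsne[:, 0], X_yrAnom_tsne[:, 1], c=cluster_labels, marker="x")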
In [43]:
#PLOT TSNE EMBEDDING, ALL SEASONS
#NB: do not overwrite X_yrAnom_tsne with X_train_embedded (the 5000-point subset) here:
#label_season and the DataFrames built below expect the full 24800-point embedding loaded above
#ALL SEASONS
fig = figure(figsize=(10, 10))
ax = axes(frameon=True)
title("TSNE with PCA reduction on yearly SLP anomalies")
scatter(X_yrAnom_tsne[:, 0], X_yrAnom_tsne[:, 1], c=label_season, marker="x")
#scatter(X_tsne[:, 0], X_tsne[:, 1], c=season, marker="x") #RAW SLP DATA, NOT ANOMALIES
colorbar(ticks=range(4))
clim(1, 4)
text(5.5, 3.9, "SON", fontsize=12)
text(5.5, 1.3, "JJA", fontsize=12)
text(5.5, -1.4, "MAM", fontsize=12)
text(5.5, -4, "DJF", fontsize=12)
In [39]:
#CONVERT TSNE MATRIX TO DATAFRAME SO THAT IT CAN BE FILTERED
#Add Date, Season, archivePeriod and Decade (from dt) as columns for filtering
dTSNE = pd.DataFrame(X_yrAnom_tsne) #pd.DataFrame(X_tsne) #RAW SLP
dTSNE['Date'] = dt['Date']
dTSNE['Season'] = dt['Season']
dTSNE['archivePeriod'] = archivePeriod
dTSNE['Decade'] = decade
dTSNE.columns = ['xcoord', 'ycoord', 'Date', 'Season', 'archivePeriod', 'Decade']
dTSNE.head(10)
Out[39]:
In [40]:
#ADD COLUMNS TO dA_yrAnom DATAFRAME TO ALLOW FILTERING
#VARIOUS LABEL COLUMNS
dA_yrAnom['Date'] = dt['Date']
dA_yrAnom['Season'] = dt['Season']
dA_yrAnom['archivePeriod'] = archivePeriod
dA_yrAnom['Decade'] = decade
#X, Y COORDS OF THE TSNE EMBEDDING SO EVERYTHING IS IN ONE DATAFRAME AND CAN BE FILTERED AS DESIRED
dA_yrAnom['x_TSNE'] = dTSNE['xcoord']
dA_yrAnom['y_TSNE'] = dTSNE['ycoord']
In [188]:
dA_yrAnom.tail(5)
Out[188]:
In [41]:
#PLOT TSNE BY SEASON
#DJF
fig = figure(figsize=(10, 10))
title("TSNE FROM YEARLY ANOMALY, DJF only")
scatter(dTSNE.loc[dTSNE['Season'] == 1, 'xcoord'], dTSNE.loc[dTSNE['Season'] == 1, 'ycoord'], marker="x")
#savefig("TSNE_DJF.png")
#JJA
fig = figure(figsize=(10, 10))
title("TSNE FROM YEARLY ANOMALY, JJA only")
scatter(dTSNE.loc[dTSNE['Season'] == 3, 'xcoord'], dTSNE.loc[dTSNE['Season'] == 3, 'ycoord'], marker="x")
#savefig("TSNE_JJA.png")
Out[41]:
In [42]:
#PLOTTING FNS
#PLOT POINTS IN A SPECIFIED BOUNDING BOX ON THE EMBEDDING PLOT
def plot_cluster(df, xbox, ybox, colour):
    #select the rows whose TSNE coordinates fall inside the box
    df_cl = df.loc[(df['x_TSNE'] > xbox[0]) & (df['x_TSNE'] < xbox[1]) & (df['y_TSNE'] > ybox[0]) & (df['y_TSNE'] < ybox[1])]
    scatter(df_cl['x_TSNE'], df_cl['y_TSNE'], c=colour)
    #print(df_cl)
#PLOT MEAN CLUSTER SLP
def plot_clusterSLP(df, xbox, ybox, title1, idx):
    df_cl = df.loc[(df['x_TSNE'] > xbox[0]) & (df['x_TSNE'] < xbox[1]) & (df['y_TSNE'] > ybox[0]) & (df['y_TSNE'] < ybox[1])]
    subplot(4,1,idx)
    title(title1)
    #drop the label/coordinate columns, average the 1060 grid-point columns and show as a 20x53 map
    plt.imshow(df_cl.drop(['Date', 'Season', 'archivePeriod', 'Decade', 'x_TSNE', 'y_TSNE'], axis=1).mean().values.reshape(20,53))
    colorbar()
In [43]:
#PLOT TSNE DJF CLUSTERS
TSNE_DJF = dA_yrAnom.loc[dA_yrAnom['Season'] == 1]
fig = figure(figsize=(10, 10))
title('TSNE FROM YEARLY ANOMALY, DJF')
scatter(TSNE_DJF['x_TSNE'].values, TSNE_DJF['y_TSNE'].values, marker="x")
plot_cluster(TSNE_DJF, [-1.5, -0.5], [-2.4, -1.9], 'r')
plot_cluster(TSNE_DJF, [-2.6, -1.2], [1.8, 2.6], 'g')
plot_cluster(TSNE_DJF, [1.8, 2.7], [0.7, 1.5], 'y')
plot_cluster(TSNE_DJF, [2.1, 3.0], [-0.8, 0.2], 'm')
#outliers
plot_cluster(TSNE_DJF, [-0.5, -0.3], [-3.1, -3.0], 'r') #2 dates: 1980-02-19, 1996-01-15
plot_cluster(TSNE_DJF, [-2.20, -2.10], [-2.60, -2.45], 'y') #4 dates: 1976-01-25, 1989-01-14, 2003-01-30, 2015-01-08
In [44]:
#PLOT TSNE DJF MEAN CLUSTER SLP
fig = figure(figsize=(10, 10))
plot_clusterSLP(TSNE_DJF, [-1.5, -0.5], [-2.4, -1.9], 'Mean DJF SLP, TSNE red cluster', 1)
plot_clusterSLP(TSNE_DJF, [-2.6, -1.2], [1.8, 2.6], 'Mean DJF SLP, TSNE green cluster', 2)
plot_clusterSLP(TSNE_DJF, [1.8, 2.7], [0.7, 1.5], 'Mean DJF SLP, TSNE yellow cluster', 3)
plot_clusterSLP(TSNE_DJF, [2.1, 3.0], [-0.8, 0.2], 'Mean DJF SLP, TSNE magenta cluster', 4)
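In [ ]:
#Hedged helper sketch (not in the original notebook): list the dates that fall inside a cluster
#box, using the same bounding-box filter as plot_cluster / plot_clusterSLP above; the box below
#is the red DJF cluster plotted earlier, and cluster_dates is a name introduced here.
def cluster_dates(df, xbox, ybox):
    m = ((df['x_TSNE'] > xbox[0]) & (df['x_TSNE'] < xbox[1]) &
         (df['y_TSNE'] > ybox[0]) & (df['y_TSNE'] < ybox[1]))
    return df.loc[m, 'Date'].tolist()
cluster_dates(TSNE_DJF, [-1.5, -0.5], [-2.4, -1.9])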