In [1]:
from datetime import datetime
import urllib2
import pandas as pd
import pandas.io.data
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
pd.set_option('max_columns', 50)
%matplotlib inline

In [2]:
# Notebook configuration flags.
run_console = True       # open a qtconsole in the final cell
update_from_web = True   # re-download the NOAA index files (otherwise reuse cached CSVs)
plot_on = True           # generate the exploratory plots

Extract data from noaa.gov and store in csv files


In [3]:
def _fetch_noaa_table(url, index_col, usecols, out_csv):
    """Download a whitespace-delimited NOAA index table and cache it as a CSV file."""
    req = urllib2.Request(url)
    csvio = urllib2.urlopen(req)
    table = pd.read_csv(csvio, sep='\s+', index_col=index_col, usecols=usecols)
    table.to_csv(out_csv)

if update_from_web:
    # Oceanic Nino Index: 3-month-season SST totals and anomalies.
    _fetch_noaa_table('http://www.cpc.ncep.noaa.gov/data/indices/oni.ascii.txt',
                      ['YR', 'SEAS'], ['SEAS', 'YR', 'TOTAL', 'ANOM'], 'ONI.csv')
    # Detrended Nino 3.4 monthly totals, climatology adjustment and anomalies.
    _fetch_noaa_table('http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/detrend.nino34.ascii.txt',
                      ['YR', 'MON'], ['MON', 'YR', 'TOTAL', 'ANOM', 'ClimAdjust'], 'NINO34.csv')

Read data from csv files


In [4]:
# Load the cached index tables and the local station observations.
oni = pd.read_csv('ONI.csv')      # Oceanic Nino Index (written by the download cell)
nino = pd.read_csv('NINO34.csv')  # Nino 3.4 monthly series
# Annual hydro-climatological observations (precipitation, snow, streamflow);
# presumably the Elqui river basin given the column names — verify with data owner.
data = pd.read_excel('DGAregionIV.xls', 'Hoja1', index_col=0, parse_dates=True)

Parse index to datetime format


In [5]:
# Build a DatetimeIndex from the MON and YR integer columns of the NINO34 table.
date_format = "%m/%Y"  # renamed from `format` to avoid shadowing the builtin
date = pd.to_datetime(nino.MON.astype(str) + '/' + nino.YR.astype(str) , format=date_format)
nino.set_index(date, inplace=True)
# and cleanup: MON/YR are now redundant with the index
nino.drop(['MON','YR'], axis=1, inplace=True)

In [6]:
# ONI has no plain month column (SEAS is a 3-month season label), so synthesize
# a monthly index instead.
# NOTE(review): assumes the ONI table starts at Jan 1950 with no gaps — the
# .info() output below (start 1950-01-01, freq MS) is consistent with this,
# but a new download with a different start date would silently misalign.
rng = pd.date_range('1/1/1950', periods=len(oni), freq='MS')
oni.set_index(rng, inplace=True)
# and cleanup
oni.drop(['SEAS','YR'], axis=1, inplace=True)

Plot data


In [7]:
oni.plot(figsize=(10, 10), subplots=True, sharex=True)


Out[7]:
array([<matplotlib.axes.AxesSubplot object at 0x0957EC10>,
       <matplotlib.axes.AxesSubplot object at 0x09591D10>], dtype=object)

In [8]:
nino.plot(figsize=(10, 10), subplots=True, sharex=True)


Out[8]:
array([<matplotlib.axes.AxesSubplot object at 0x0970D670>,
       <matplotlib.axes.AxesSubplot object at 0x09785850>,
       <matplotlib.axes.AxesSubplot object at 0x0992BA30>], dtype=object)

In [9]:
data.plot(figsize=(10, 10), subplots=True, sharex=True)


Out[9]:
array([<matplotlib.axes.AxesSubplot object at 0x09AD2290>,
       <matplotlib.axes.AxesSubplot object at 0x09C33F90>,
       <matplotlib.axes.AxesSubplot object at 0x09D141B0>,
       <matplotlib.axes.AxesSubplot object at 0x09DC4050>], dtype=object)

Data characterization


In [10]:
# dtype / non-null summaries for the three working frames, separated by blank lines.
oni.info()
print '\n'
nino.info()
print '\n'
data.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 771 entries, 1950-01-01 00:00:00 to 2014-03-01 00:00:00
Freq: MS
Data columns (total 2 columns):
TOTAL    771 non-null float64
ANOM     771 non-null float64
dtypes: float64(2)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 772 entries, 1950-01-01 00:00:00 to 2014-04-01 00:00:00
Data columns (total 3 columns):
TOTAL         772 non-null float64
ClimAdjust    772 non-null float64
ANOM          772 non-null float64
dtypes: float64(3)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39 entries, 1971-01-01 00:00:00 to 2009-01-01 00:00:00
Data columns (total 4 columns):
Prec_INIA      37 non-null float64
Snow_Laguna    39 non-null int64
Prec_Laguna    36 non-null float64
Q_Elqui        36 non-null float64
dtypes: float64(3), int64(1)

In [11]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for each frame.
print oni.describe()
print nino.describe()
print data.describe()


            TOTAL        ANOM
count  771.000000  771.000000
mean    26.965292    0.002853
std      0.898122    0.799846
min     24.490000   -2.040000
25%     26.335000   -0.540000
50%     27.040000   -0.050000
75%     27.600000    0.510000
max     29.100000    2.360000

[8 rows x 2 columns]
            TOTAL  ClimAdjust        ANOM
count  772.000000  772.000000  772.000000
mean    26.965868   26.963277    0.002785
std      0.940379    0.473537    0.821303
min     24.290000   26.210000   -2.080000
25%     26.300000   26.620000   -0.572500
50%     27.025000   26.830000   -0.045000
75%     27.640000   27.340000    0.530000
max     29.130000   27.910000    2.380000

[8 rows x 3 columns]
        Prec_INIA  Snow_Laguna  Prec_Laguna     Q_Elqui
count   37.000000    39.000000    36.000000   36.000000
mean    98.778378   248.102564   167.622222  143.491944
std     75.783270   212.903956   104.863196  107.843943
min      1.300000    23.000000    21.000000   28.930000
25%     51.700000   102.500000    88.250000   77.327500
50%     77.500000   188.000000   141.250000  103.040000
75%    132.100000   312.000000   217.250000  152.415000
max    298.200000  1011.000000   406.000000  473.920000

[8 rows x 4 columns]

Warm (red) and cold (blue) episodes based on a threshold of ±0.5 °C for the Oceanic Niño Index (ONI)

http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/ensoyears.shtml

Wet years (values above the 75th-percentile threshold)


In [12]:
data[data.Snow_Laguna > data.Snow_Laguna.quantile(0.75)]


Out[12]:
Prec_INIA Snow_Laguna Prec_Laguna Q_Elqui
Year
1972-01-01 148.8 567 NaN 256.82
1978-01-01 65.0 479 NaN 201.13
1982-01-01 88.1 437 330.5 148.88
1984-01-01 258.5 681 352.5 417.34
1987-01-01 198.1 745 406.0 408.85
1991-01-01 196.1 313 169.5 103.72
1997-01-01 269.5 1011 404.0 473.92
2002-01-01 298.2 427 314.0 317.01
2005-01-01 65.6 355 219.5 136.69
2007-01-01 17.8 315 216.5 NaN

10 rows × 4 columns


In [13]:
data[data.Q_Elqui > data.Q_Elqui.quantile(0.75)]


Out[13]:
Prec_INIA Snow_Laguna Prec_Laguna Q_Elqui
Year
1972-01-01 148.8 567 NaN 256.82
1978-01-01 65.0 479 NaN 201.13
1983-01-01 187.5 273 167.0 185.94
1984-01-01 258.5 681 352.5 417.34
1987-01-01 198.1 745 406.0 408.85
1992-01-01 180.2 245 143.5 198.57
1997-01-01 269.5 1011 404.0 473.92
1998-01-01 25.9 117 51.0 152.94
2002-01-01 298.2 427 314.0 317.01

9 rows × 4 columns

Dry years (values below the 25th-percentile threshold)


In [14]:
data[data.Snow_Laguna < data.Snow_Laguna.quantile(0.25)]


Out[14]:
Prec_INIA Snow_Laguna Prec_Laguna Q_Elqui
Year
1971-01-01 86.4 98 124.0 28.93
1973-01-01 56.8 77 21.0 132.50
1975-01-01 110.7 81 93.0 51.84
1979-01-01 4.7 71 71.5 100.53
1981-01-01 80.6 95 68.0 88.84
1988-01-01 9.9 46 45.5 142.23
1993-01-01 39.5 63 74.0 102.36
1995-01-01 1.3 23 65.5 55.22
1996-01-01 43.4 63 43.0 38.28
2009-01-01 NaN 54 NaN NaN

10 rows × 4 columns


In [15]:
data[data.Q_Elqui < data.Q_Elqui.quantile(0.25)]


Out[15]:
Prec_INIA Snow_Laguna Prec_Laguna Q_Elqui
Year
1971-01-01 86.4 98 124.0 28.93
1974-01-01 46.6 212 225.0 74.90
1975-01-01 110.7 81 93.0 51.84
1976-01-01 88.3 148 207.5 44.40
1977-01-01 52.8 282 319.2 76.99
1990-01-01 64.5 111 67.5 57.54
1995-01-01 1.3 23 65.5 55.22
1996-01-01 43.4 63 43.0 38.28
2004-01-01 109.3 110 110.0 74.18

9 rows × 4 columns

Create ONI DataFrame with Month values in columns


In [16]:
# Reshape the monthly ONI anomalies into a Year x Month matrix (one column per month).
df = oni.drop('TOTAL', axis=1)  # keyword axis: the positional axis argument is deprecated
df.index.name = 'date'
df['Year'] = df.index.year
df['Month'] = df.index.month
df = df.pivot(index='Year', columns='Month', values='ANOM')
df.head()


Out[16]:
Month 1 2 3 4 5 6 7 8 9 10 11 12
Year
1950 -1.39 -1.26 -1.16 -1.18 -1.10 -0.89 -0.59 -0.50 -0.43 -0.46 -0.55 -0.72
1951 -0.77 -0.62 -0.44 -0.16 0.01 0.36 0.61 0.95 1.08 1.22 1.11 0.90
1952 0.57 0.37 0.29 0.34 0.26 0.07 -0.07 0.01 0.17 0.23 0.23 0.33
1953 0.50 0.61 0.64 0.66 0.70 0.67 0.66 0.70 0.76 0.79 0.76 0.75
1954 0.72 0.52 0.06 -0.35 -0.52 -0.49 -0.57 -0.74 -0.80 -0.73 -0.72 -0.72

5 rows × 12 columns

Proposed candidate features for the feature matrix:

  1. ANOM trend from April to January
  2. ANOM value in April

In [17]:
# Jan->Apr ANOM trend for the snow-rich ("wet") years.
snow_q75 = data.Snow_Laguna.quantile(0.75)
wet_years = data[data.Snow_Laguna > snow_q75].index.year
trend = df[4] - df[1]
trend.loc[wet_years]


Out[17]:
1972    0.99
1978   -0.86
1982    0.40
1984    0.13
1987   -0.17
1991   -0.07
1997    0.74
2002    0.47
2005   -0.26
2007   -0.92
dtype: float64

In [18]:
df[4].loc[wet_years]


Out[18]:
1972    0.35
1978   -0.15
1982    0.32
1984   -0.36
1987    1.07
1991    0.27
1997    0.23
2002    0.29
2005    0.31
2007   -0.23
Name: 4, dtype: float64

In [19]:
# Jan->Apr ANOM trend for the snow-poor ("dry") years.
snow_q25 = data.Snow_Laguna.quantile(0.25)
dry_years = data[data.Snow_Laguna < snow_q25].index.year
trend = df[4] - df[1]
trend.loc[dry_years]


Out[19]:
1971    0.43
1973   -1.87
1975   -0.11
1979    0.31
1981   -0.03
1988   -0.98
1993    0.47
1995   -0.73
1996    0.44
2009    0.68
dtype: float64

In [20]:
df[4].loc[dry_years]


Out[20]:
1971   -0.81
1973   -0.05
1975   -0.65
1979    0.26
1981   -0.41
1988   -0.23
1993    0.63
1995    0.30
1996   -0.41
2009   -0.15
Name: 4, dtype: float64

In [21]:
# Assemble the candidate features and the target into one frame, indexed by year.
dataset = pd.DataFrame()
dataset['Trend'] = df[4] - df[1]   # Jan->Apr anomaly trend
dataset['April'] = df[4]           # April anomaly value
# Convert the integer Year index to datetimes so it aligns with data's index.
dataset.index = pd.to_datetime(dataset.index, format='%Y')
dataset['Snow_Laguna'] = data.Snow_Laguna
# Keep only years present in every column.
dataset = dataset.dropna()
dataset.tail()


Out[21]:
Trend April Snow_Laguna
2005-01-01 -0.26 0.31 355
2006-01-01 0.60 -0.26 188
2007-01-01 -0.92 -0.23 315
2008-01-01 0.61 -0.90 223
2009-01-01 0.68 -0.15 54

5 rows × 3 columns


In [22]:
dataset.plot(figsize=(10, 10), subplots=True, sharex=True)


Out[22]:
array([<matplotlib.axes.AxesSubplot object at 0x0B8ED130>,
       <matplotlib.axes.AxesSubplot object at 0x09AB08D0>,
       <matplotlib.axes.AxesSubplot object at 0x0BA49F90>], dtype=object)

Define feature matrix and a label vector


In [23]:
# Feature matrix (n_samples x 2): [Trend, April]; label vector: annual Laguna
# snow totals. NOTE(review): the "labels" are continuous measurements, not
# discrete classes — relevant for the classifier experiment below.
X = dataset[['Trend','April']].values
y = dataset[['Snow_Laguna']].values.ravel()

print X.shape
print y.shape


(39, 2)
(39,)

In [24]:
from sklearn import metrics

In [25]:
from sklearn.naive_bayes import GaussianNB

# NOTE(review): GaussianNB is a *classifier*; since Snow_Laguna is continuous,
# every distinct training value becomes its own class, so exact-match accuracy
# on unseen values is essentially guaranteed to be 0 (confirmed by the output
# below). The regression models in the following cells are the appropriate fit.
# Instantiate the estimator
clf = GaussianNB()
# Fit the estimator to the data, leaving out the last five samples
clf.fit(X[:-5], y[:-5])
# Use the model to predict the last several labels
y_pred = clf.predict(X[-5:])

print y_pred
print y[-5:]
print 'Accuracy: %s' % metrics.accuracy_score(y[-5:], y_pred)


[ 63.  63.  63.  63.  63.]
[ 355.  188.  315.  223.   54.]
Accuracy: 0.0

In [26]:
from sklearn import linear_model

# Bayesian ridge regression: a regularised linear model, suitable for the
# continuous snow target (unlike the classifier above).
# Instantiate the estimator
clf = linear_model.BayesianRidge()
# Fit the estimator to the data, leaving out the last five samples
clf.fit(X[:-5], y[:-5])
# Use the model to predict the last several labels
y_pred = clf.predict(X[-5:])

print y_pred
print y[-5:]


[ 260.80677595  273.73059599  144.41926154  204.88992399  292.69034999]
[ 355.  188.  315.  223.   54.]

In [27]:
from sklearn import linear_model

# Ordinary least-squares baseline for comparison with the Bayesian ridge above.
# Instantiate the estimator
clf = linear_model.LinearRegression()
# Fit the estimator to the data, leaving out the last five samples
clf.fit(X[:-5], y[:-5])
# Use the model to predict the last several labels
y_pred = clf.predict(X[-5:])

print y_pred
print y[-5:]


[ 266.05511733  281.65587673  100.4668978   182.29601674  308.7279753 ]
[ 355.  188.  315.  223.   54.]

Implement Supervised learning with neural networks in PyBrain


In [28]:
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import TanhLayer
from pybrain.datasets import SupervisedDataSet
from pybrain.supervised.trainers import BackpropTrainer

In [29]:
# Feed-forward network: 2 inputs -> 3 tanh hidden units -> 1 output, no bias.
# NOTE(review): weights are randomly initialised and no seed is set, so this
# activation (and all subsequent training results) vary run to run.
net = buildNetwork(2, 3, 1, bias=False, hiddenclass=TanhLayer)
net.activate([2, 1])


Out[29]:
array([-0.83467921])

In [30]:
# Load the training portion (all but the last five samples) into a PyBrain
# supervised dataset with 2-dimensional inputs and a scalar target.
ds = SupervisedDataSet(2, 1)
for features, target in zip(X[:-5], y[:-5]):
    ds.addSample(features, target)

In [31]:
# Echo the (input, target) pairs actually stored in the PyBrain dataset.
for inpt, target in ds:
    print inpt, target


[ 0.43 -0.81] [ 98.]
[ 0.99  0.35] [ 567.]
[-1.87 -0.05] [ 77.]
[ 0.9  -0.98] [ 212.]
[-0.11 -0.65] [ 81.]
[ 1.08 -0.47] [ 148.]
[-0.36  0.28] [ 282.]
[-0.86 -0.15] [ 479.]
[ 0.31  0.26] [ 71.]
[-0.22  0.28] [ 311.]
[-0.03 -0.41] [ 95.]
[ 0.4   0.32] [ 437.]
[-0.97  1.19] [ 273.]
[ 0.13 -0.36] [ 681.]
[ 0.29 -0.73] [ 107.]
[ 0.32 -0.15] [ 253.]
[-0.17  1.07] [ 745.]
[-0.98 -0.23] [ 46.]
[ 0.87 -0.84] [ 179.]
[ 0.12  0.25] [ 111.]
[-0.07  0.27] [ 313.]
[-0.36  1.22] [ 245.]
[ 0.47  0.63] [ 63.]
[ 0.22  0.29] [ 154.]
[-0.73  0.3 ] [ 23.]
[ 0.44 -0.41] [ 63.]
[ 0.74  0.23] [ 1011.]
[-1.26  0.92] [ 117.]
[ 0.62 -0.89] [ 138.]
[ 0.81 -0.89] [ 257.]
[ 0.37 -0.35] [ 201.]
[ 0.47  0.29] [ 427.]
[-1.08 -0.01] [ 166.]
[-0.18  0.11] [ 110.]

In [32]:
# Backpropagation trainer on the supervised dataset. A single train() call runs
# one pass over the data and returns its error (the large Out[32] value) — one
# epoch is presumably far too little for convergence, which would explain the
# poor holdout predictions below; TODO confirm with trainUntilConvergence.
trainer = BackpropTrainer(net, ds)
trainer.train()


Out[32]:
56748.544441673825

In [33]:
print 'predicted vs real'
for i in len(y) - np.arange(1,len(y[-5:])):
    print '%s vs %s' % (net.activate(X[i]), y[i])


predicted vs real
[ 6.98894746] vs 54.0
[ 6.98894734] vs 223.0
[-6.98894735] vs 315.0
[ 6.98894767] vs 188.0

In [34]:
# Optionally attach a Qt console to this kernel for interactive exploration.
# (%qtconsole is an IPython line magic; this cell only runs inside the notebook.)
if run_console:
    %qtconsole