In [1]:
from datetime import datetime
import urllib2
import pandas as pd
import pandas.io.data
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
pd.set_option('max_columns', 50)
%matplotlib inline

In [2]:
# Notebook configuration flags.
run_console = True       # open a qtconsole in the final cell
update_from_web = True   # re-download the NOAA index files (otherwise reuse cached CSVs)
plot_on = True           # generate the exploratory plots

Extract data from noaa.gov and store in csv files


In [3]:
def _fetch_noaa_table(url, index_col, usecols, out_csv):
    """Download a whitespace-delimited NOAA index table and cache it as a CSV file."""
    req = urllib2.Request(url)
    csvio = urllib2.urlopen(req)
    table = pd.read_csv(csvio, sep='\s+', index_col=index_col, usecols=usecols)
    table.to_csv(out_csv)

if update_from_web:
    # Oceanic Nino Index: 3-month-season SST totals and anomalies.
    _fetch_noaa_table('http://www.cpc.ncep.noaa.gov/data/indices/oni.ascii.txt',
                      ['YR', 'SEAS'], ['SEAS', 'YR', 'TOTAL', 'ANOM'], 'ONI.csv')
    # Detrended Nino 3.4 monthly totals, climatology adjustment and anomalies.
    _fetch_noaa_table('http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/detrend.nino34.ascii.txt',
                      ['YR', 'MON'], ['MON', 'YR', 'TOTAL', 'ANOM', 'ClimAdjust'], 'NINO34.csv')

Read data from csv files


In [4]:
# Load the cached index tables and the local station observations.
oni = pd.read_csv('ONI.csv')      # Oceanic Nino Index (written by the download cell)
nino = pd.read_csv('NINO34.csv')  # Nino 3.4 monthly series
# Annual hydro-climatological observations (precipitation, snow, streamflow);
# presumably the Elqui river basin given the column names — verify with data owner.
data = pd.read_excel('DGAregionIV.xls', 'Hoja1', index_col=0, parse_dates=True)

Parse index to datetime format


In [5]:
# Build a DatetimeIndex from the MON and YR integer columns of the NINO34 table.
date_format = "%m/%Y"  # renamed from `format` to avoid shadowing the builtin
date = pd.to_datetime(nino.MON.astype(str) + '/' + nino.YR.astype(str) , format=date_format)
nino.set_index(date, inplace=True)
# and cleanup: MON/YR are now redundant with the index
nino.drop(['MON','YR'], axis=1, inplace=True)

In [6]:
# ONI has no plain month column (SEAS is a 3-month season label), so synthesize
# a monthly index instead.
# NOTE(review): assumes the ONI table starts at Jan 1950 with no gaps — the
# .info() output below (start 1950-01-01, freq MS) is consistent with this,
# but a new download with a different start date would silently misalign.
rng = pd.date_range('1/1/1950', periods=len(oni), freq='MS')
oni.set_index(rng, inplace=True)
# and cleanup
oni.drop(['SEAS','YR'], axis=1, inplace=True)

Plot data


In [7]:
oni.plot(figsize=(10, 10), subplots=True, sharex=True)


Out[7]:
array([<matplotlib.axes.AxesSubplot object at 0x0957EC10>,
       <matplotlib.axes.AxesSubplot object at 0x09591D10>], dtype=object)

In [8]:
nino.plot(figsize=(10, 10), subplots=True, sharex=True)


Out[8]:
array([<matplotlib.axes.AxesSubplot object at 0x0970D670>,
       <matplotlib.axes.AxesSubplot object at 0x09785850>,
       <matplotlib.axes.AxesSubplot object at 0x0992BA30>], dtype=object)

In [9]:
data.plot(figsize=(10, 10), subplots=True, sharex=True)


Out[9]:
array([<matplotlib.axes.AxesSubplot object at 0x09AD2290>,
       <matplotlib.axes.AxesSubplot object at 0x09C33F90>,
       <matplotlib.axes.AxesSubplot object at 0x09D141B0>,
       <matplotlib.axes.AxesSubplot object at 0x09DC4050>], dtype=object)

Data characterization


In [10]:
# dtype / non-null summaries for the three working frames, separated by blank lines.
oni.info()
print '\n'
nino.info()
print '\n'
data.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 771 entries, 1950-01-01 00:00:00 to 2014-03-01 00:00:00
Freq: MS
Data columns (total 2 columns):
TOTAL    771 non-null float64
ANOM     771 non-null float64
dtypes: float64(2)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 772 entries, 1950-01-01 00:00:00 to 2014-04-01 00:00:00
Data columns (total 3 columns):
TOTAL         772 non-null float64
ClimAdjust    772 non-null float64
ANOM          772 non-null float64
dtypes: float64(3)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 39 entries, 1971-01-01 00:00:00 to 2009-01-01 00:00:00
Data columns (total 4 columns):
Prec_INIA      37 non-null float64
Snow_Laguna    39 non-null int64
Prec_Laguna    36 non-null float64
Q_Elqui        36 non-null float64
dtypes: float64(3), int64(1)

In [11]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for each frame.
print oni.describe()
print nino.describe()
print data.describe()


            TOTAL        ANOM
count  771.000000  771.000000
mean    26.965292    0.002853
std      0.898122    0.799846
min     24.490000   -2.040000
25%     26.335000   -0.540000
50%     27.040000   -0.050000
75%     27.600000    0.510000
max     29.100000    2.360000

[8 rows x 2 columns]
            TOTAL  ClimAdjust        ANOM
count  772.000000  772.000000  772.000000
mean    26.965868   26.963277    0.002785
std      0.940379    0.473537    0.821303
min     24.290000   26.210000   -2.080000
25%     26.300000   26.620000   -0.572500
50%     27.025000   26.830000   -0.045000
75%     27.640000   27.340000    0.530000
max     29.130000   27.910000    2.380000

[8 rows x 3 columns]
        Prec_INIA  Snow_Laguna  Prec_Laguna     Q_Elqui
count   37.000000    39.000000    36.000000   36.000000
mean    98.778378   248.102564   167.622222  143.491944
std     75.783270   212.903956   104.863196  107.843943
min      1.300000    23.000000    21.000000   28.930000
25%     51.700000   102.500000    88.250000   77.327500
50%     77.500000   188.000000   141.250000  103.040000
75%    132.100000   312.000000   217.250000  152.415000
max    298.200000  1011.000000   406.000000  473.920000

[8 rows x 4 columns]

Warm (red) and cold (blue) episodes based on a threshold of ±0.5 °C for the Oceanic Niño Index (ONI)

http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/ensoyears.shtml

Wet years (values above the 75th-percentile threshold)


In [12]:
data[data.Snow_Laguna > data.Snow_Laguna.quantile(0.75)]


Out[12]:
Prec_INIA Snow_Laguna Prec_Laguna Q_Elqui
Year
1972-01-01 148.8 567 NaN 256.82
1978-01-01 65.0 479 NaN 201.13
1982-01-01 88.1 437 330.5 148.88
1984-01-01 258.5 681 352.5 417.34
1987-01-01 198.1 745 406.0 408.85
1991-01-01 196.1 313 169.5 103.72
1997-01-01 269.5 1011 404.0 473.92
2002-01-01 298.2 427 314.0 317.01
2005-01-01 65.6 355 219.5 136.69
2007-01-01 17.8 315 216.5 NaN

10 rows × 4 columns


In [13]:
data[data.Q_Elqui > data.Q_Elqui.quantile(0.75)]


Out[13]:
Prec_INIA Snow_Laguna Prec_Laguna Q_Elqui
Year
1972-01-01 148.8 567 NaN 256.82
1978-01-01 65.0 479 NaN 201.13
1983-01-01 187.5 273 167.0 185.94
1984-01-01 258.5 681 352.5 417.34
1987-01-01 198.1 745 406.0 408.85
1992-01-01 180.2 245 143.5 198.57
1997-01-01 269.5 1011 404.0 473.92
1998-01-01 25.9 117 51.0 152.94
2002-01-01 298.2 427 314.0 317.01

9 rows × 4 columns

Dry years (values below the 25th-percentile threshold)


In [14]:
data[data.Snow_Laguna < data.Snow_Laguna.quantile(0.25)]


Out[14]:
Prec_INIA Snow_Laguna Prec_Laguna Q_Elqui
Year
1971-01-01 86.4 98 124.0 28.93
1973-01-01 56.8 77 21.0 132.50
1975-01-01 110.7 81 93.0 51.84
1979-01-01 4.7 71 71.5 100.53
1981-01-01 80.6 95 68.0 88.84
1988-01-01 9.9 46 45.5 142.23
1993-01-01 39.5 63 74.0 102.36
1995-01-01 1.3 23 65.5 55.22
1996-01-01 43.4 63 43.0 38.28
2009-01-01 NaN 54 NaN NaN

10 rows × 4 columns


In [15]:
data[data.Q_Elqui < data.Q_Elqui.quantile(0.25)]


Out[15]:
Prec_INIA Snow_Laguna Prec_Laguna Q_Elqui
Year
1971-01-01 86.4 98 124.0 28.93
1974-01-01 46.6 212 225.0 74.90
1975-01-01 110.7 81 93.0 51.84
1976-01-01 88.3 148 207.5 44.40
1977-01-01 52.8 282 319.2 76.99
1990-01-01 64.5 111 67.5 57.54
1995-01-01 1.3 23 65.5 55.22
1996-01-01 43.4 63 43.0 38.28
2004-01-01 109.3 110 110.0 74.18

9 rows × 4 columns

Create ONI DataFrame with Month values in columns


In [16]:
# Reshape the monthly ONI anomalies into a Year x Month matrix (one column per month).
df = oni.drop('TOTAL', axis=1)  # keyword axis: the positional axis argument is deprecated
df.index.name = 'date'
df['Year'] = df.index.year
df['Month'] = df.index.month
df = df.pivot(index='Year', columns='Month', values='ANOM')
df.head()


Out[16]:
Month 1 2 3 4 5 6 7 8 9 10 11 12
Year
1950 -1.39 -1.26 -1.16 -1.18 -1.10 -0.89 -0.59 -0.50 -0.43 -0.46 -0.55 -0.72
1951 -0.77 -0.62 -0.44 -0.16 0.01 0.36 0.61 0.95 1.08 1.22 1.11 0.90
1952 0.57 0.37 0.29 0.34 0.26 0.07 -0.07 0.01 0.17 0.23 0.23 0.33
1953 0.50 0.61 0.64 0.66 0.70 0.67 0.66 0.70 0.76 0.79 0.76 0.75
1954 0.72 0.52 0.06 -0.35 -0.52 -0.49 -0.57 -0.74 -0.80 -0.73 -0.72 -0.72

5 rows × 12 columns

Proposed candidate features for the feature matrix:

  1. ANOM trend from April to January
  2. ANOM value in April

In [17]:
# Jan->Apr ANOM trend for the snow-rich ("wet") years.
snow_q75 = data.Snow_Laguna.quantile(0.75)
wet_years = data[data.Snow_Laguna > snow_q75].index.year
trend = df[4] - df[1]
trend.loc[wet_years]


Out[17]:
1972    0.99
1978   -0.86
1982    0.40
1984    0.13
1987   -0.17
1991   -0.07
1997    0.74
2002    0.47
2005   -0.26
2007   -0.92
dtype: float64

In [18]:
df[4].loc[wet_years]


Out[18]:
1972    0.35
1978   -0.15
1982    0.32
1984   -0.36
1987    1.07
1991    0.27
1997    0.23
2002    0.29
2005    0.31
2007   -0.23
Name: 4, dtype: float64

In [19]:
# Jan->Apr ANOM trend for the snow-poor ("dry") years.
snow_q25 = data.Snow_Laguna.quantile(0.25)
dry_years = data[data.Snow_Laguna < snow_q25].index.year
trend = df[4] - df[1]
trend.loc[dry_years]


Out[19]:
1971    0.43
1973   -1.87
1975   -0.11
1979    0.31
1981   -0.03
1988   -0.98
1993    0.47
1995   -0.73
1996    0.44
2009    0.68
dtype: float64

In [20]:
df[4].loc[dry_years]


Out[20]:
1971   -0.81
1973   -0.05
1975   -0.65
1979    0.26
1981   -0.41
1988   -0.23
1993    0.63
1995    0.30
1996   -0.41
2009   -0.15
Name: 4, dtype: float64

In [21]:
# Assemble the candidate features and the target into one frame, indexed by year.
dataset = pd.DataFrame()
dataset['Trend'] = df[4] - df[1]   # Jan->Apr anomaly trend
dataset['April'] = df[4]           # April anomaly value
# Convert the integer Year index to datetimes so it aligns with data's index.
dataset.index = pd.to_datetime(dataset.index, format='%Y')
dataset['Snow_Laguna'] = data.Snow_Laguna
# Keep only years present in every column.
dataset = dataset.dropna()
dataset.tail()


Out[21]:
Trend April Snow_Laguna
2005-01-01 -0.26 0.31 355
2006-01-01 0.60 -0.26 188
2007-01-01 -0.92 -0.23 315
2008-01-01 0.61 -0.90 223
2009-01-01 0.68 -0.15 54

5 rows × 3 columns


In [22]:
dataset.plot(figsize=(10, 10), subplots=True, sharex=True)


Out[22]:
array([<matplotlib.axes.AxesSubplot object at 0x0B8ED130>,
       <matplotlib.axes.AxesSubplot object at 0x09AB08D0>,
       <matplotlib.axes.AxesSubplot object at 0x0BA49F90>], dtype=object)

Define feature matrix and a label vector


In [23]:
# Feature matrix (n_samples x 2): [Trend, April]; label vector: annual Laguna
# snow totals. NOTE(review): the "labels" are continuous measurements, not
# discrete classes — relevant for the classifier experiment below.
X = dataset[['Trend','April']].values
y = dataset[['Snow_Laguna']].values.ravel()

print X.shape
print y.shape


(39, 2)
(39,)

In [24]:
from sklearn import metrics

In [25]:
from sklearn.naive_bayes import GaussianNB

# NOTE(review): GaussianNB is a *classifier*; since Snow_Laguna is continuous,
# every distinct training value becomes its own class, so exact-match accuracy
# on unseen values is essentially guaranteed to be 0 (confirmed by the output
# below). The regression models in the following cells are the appropriate fit.
# Instantiate the estimator
clf = GaussianNB()
# Fit the estimator to the data, leaving out the last five samples
clf.fit(X[:-5], y[:-5])
# Use the model to predict the last several labels
y_pred = clf.predict(X[-5:])

print y_pred
print y[-5:]
print 'Accuracy: %s' % metrics.accuracy_score(y[-5:], y_pred)


[ 63.  63.  63.  63.  63.]
[ 355.  188.  315.  223.   54.]
Accuracy: 0.0

In [26]:
from sklearn import linear_model

# Bayesian ridge regression: a regularised linear model, suitable for the
# continuous snow target (unlike the classifier above).
# Instantiate the estimator
clf = linear_model.BayesianRidge()
# Fit the estimator to the data, leaving out the last five samples
clf.fit(X[:-5], y[:-5])
# Use the model to predict the last several labels
y_pred = clf.predict(X[-5:])

print y_pred
print y[-5:]


[ 260.80677595  273.73059599  144.41926154  204.88992399  292.69034999]
[ 355.  188.  315.  223.   54.]

In [27]:
from sklearn import linear_model

# Ordinary least-squares baseline for comparison with the Bayesian ridge above.
# Instantiate the estimator
clf = linear_model.LinearRegression()
# Fit the estimator to the data, leaving out the last five samples
clf.fit(X[:-5], y[:-5])
# Use the model to predict the last several labels
y_pred = clf.predict(X[-5:])

print y_pred
print y[-5:]


[ 266.05511733  281.65587673  100.4668978   182.29601674  308.7279753 ]
[ 355.  188.  315.  223.   54.]

Implement Supervised learning with neural networks in PyBrain


In [28]:
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure import TanhLayer
from pybrain.datasets import SupervisedDataSet
from pybrain.supervised.trainers import BackpropTrainer

In [29]:
# Feed-forward network: 2 inputs -> 3 tanh hidden units -> 1 output, no bias.
# NOTE(review): weights are randomly initialised and no seed is set, so this
# activation (and all subsequent training results) vary run to run.
net = buildNetwork(2, 3, 1, bias=False, hiddenclass=TanhLayer)
net.activate([2, 1])


Out[29]:
array([-0.83467921])

In [30]:
# Load the training portion (all but the last five samples) into a PyBrain
# supervised dataset with 2-dimensional inputs and a scalar target.
ds = SupervisedDataSet(2, 1)
for features, target in zip(X[:-5], y[:-5]):
    ds.addSample(features, target)

In [31]:
# Echo the (input, target) pairs actually stored in the PyBrain dataset.
for inpt, target in ds:
    print inpt, target


[ 0.43 -0.81] [ 98.]
[ 0.99  0.35] [ 567.]
[-1.87 -0.05] [ 77.]
[ 0.9  -0.98] [ 212.]
[-0.11 -0.65] [ 81.]
[ 1.08 -0.47] [ 148.]
[-0.36  0.28] [ 282.]
[-0.86 -0.15] [ 479.]
[ 0.31  0.26] [ 71.]
[-0.22  0.28] [ 311.]
[-0.03 -0.41] [ 95.]
[ 0.4   0.32] [ 437.]
[-0.97  1.19] [ 273.]
[ 0.13 -0.36] [ 681.]
[ 0.29 -0.73] [ 107.]
[ 0.32 -0.15] [ 253.]
[-0.17  1.07] [ 745.]
[-0.98 -0.23] [ 46.]
[ 0.87 -0.84] [ 179.]
[ 0.12  0.25] [ 111.]
[-0.07  0.27] [ 313.]
[-0.36  1.22] [ 245.]
[ 0.47  0.63] [ 63.]
[ 0.22  0.29] [ 154.]
[-0.73  0.3 ] [ 23.]
[ 0.44 -0.41] [ 63.]
[ 0.74  0.23] [ 1011.]
[-1.26  0.92] [ 117.]
[ 0.62 -0.89] [ 138.]
[ 0.81 -0.89] [ 257.]
[ 0.37 -0.35] [ 201.]
[ 0.47  0.29] [ 427.]
[-1.08 -0.01] [ 166.]
[-0.18  0.11] [ 110.]

In [32]:
# Backpropagation trainer on the supervised dataset. A single train() call runs
# one pass over the data and returns its error (the large Out[32] value) — one
# epoch is presumably far too little for convergence, which would explain the
# poor holdout predictions below; TODO confirm with trainUntilConvergence.
trainer = BackpropTrainer(net, ds)
trainer.train()


Out[32]:
56748.544441673825

In [33]:
print 'predicted vs real'
for i in len(y) - np.arange(1,len(y[-5:])):
    print '%s vs %s' % (net.activate(X[i]), y[i])


predicted vs real
[ 6.98894746] vs 54.0
[ 6.98894734] vs 223.0
[-6.98894735] vs 315.0
[ 6.98894767] vs 188.0

In [34]:
# Optionally attach a Qt console to this kernel for interactive exploration.
# (%qtconsole is an IPython line magic; this cell only runs inside the notebook.)
if run_console:
    %qtconsole