In [1]:
# from __future__ import exam_success
from __future__ import absolute_import
from __future__ import print_function
%matplotlib inline
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd
import scipy.stats as stats
# Sk cheats
from sklearn.cross_validation import cross_val_score # cross val
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import Imputer # get rid of nan
Load the reduced training set (100,000 rows).
In [2]:
%%time
#filename = "data/reduced_train_100000.csv"
#filename = "data/reduced_test_5000.csv"
filename = "data/reduced_train_100000.csv"
raw = pd.read_csv(filename)
raw = raw.set_index('Id')
#train = train.dropna()
In [3]:
# Fraction of non-null values per column, sorted from most to least complete
l = float(len(raw["minutes_past"]))
comp = []
for i in raw.columns:
    #print("%.03f, %s" % (1 - raw[i].isnull().sum() / l, i))
    comp.append([1 - raw[i].isnull().sum() / l, i])
comp.sort(key=lambda x: x[0], reverse=True)
comp
Out[3]:
In [4]:
raw = raw.dropna()
Check for aberrant values in the labels
In [5]:
raw.head()
Out[5]:
In [6]:
raw["Expected"].describe()
Out[6]:
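Beyond describe(), a quick log-scale histogram of the labels makes aberrant values easy to spot. A minimal sketch using the matplotlib import above (the bin count is arbitrary):
In [ ]:
# Distribution of the gauge readings; extreme values stand out on a log scale
plt.hist(raw["Expected"].values, bins=100, log=True)
plt.xlabel("Expected (mm)")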
Get rid of NaN values for now
In [7]:
#train_clean = train[[not i for i in np.isnan(train["Ref_5x5_10th"])]]
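The same filter can be written more directly with pandas' notnull; a sketch equivalent to the commented list comprehension (the dropna() above already removes these rows, so this is only illustrative):
In [ ]:
# Keep only rows where Ref_5x5_10th is present
train_clean = train[train["Ref_5x5_10th"].notnull()]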
The forums indicate that rainfall higher than 1 m (1000 mm) is probably an error, which is quite understandable. We filter those rows out.
In [8]:
raw = raw[raw['Expected'] < 1000]
In [9]:
raw['Expected'].describe()
Out[9]:
In [10]:
split = 0.2
train = raw.head(int(len(raw) * (1 - split)))  # first 80%
test = raw.tail(int(len(raw) * split))         # last 20%
In [11]:
#columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
# u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
# u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
# u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
# u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
# u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
# u'Kdp_5x5_50th', u'Kdp_5x5_90th']
#columns = [u'radardist_km', u'Ref', u'Ref_5x5_10th']
columns = [ u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
nb_features = len(columns)
data = raw[list(columns)]
data.head(5)
Out[11]:
In [12]:
data.head(20)
Out[12]:
In [13]:
%%time
#max_padding = 20
docX, docY = [], []
for i in raw.index.unique():
    if isinstance(raw.loc[i], pd.core.series.Series):
        # single observation for this Id; use the feature columns only, as in the multi-row branch
        m = [data.loc[i].as_matrix()]
        #pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')  # pre-padding
        docX.append(m)
        docY.append(float(raw.loc[i]["Expected"]))
    else:
        m = data.loc[i].as_matrix()
        #pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
        docX.append(m)
        docY.append(float(raw.loc[i][:1]["Expected"]))
        #docY.append(train.loc[i][:1]["Expected"].as_matrix)
X = np.array(docX)
y = np.array(docY)
In [20]:
np.shape(X)
Out[20]:
In [17]:
XX = [np.array(t).mean(0) for t in X]
In [19]:
np.shape(XX)
Out[19]:
In [243]:
XX[0]
Out[243]:
In [213]:
global_means = np.nanmean(data,0)
#global_means = data.mean(0).values
In [214]:
a = [
    [1, 2, np.nan],
    [3, 4, np.nan],
    [2, np.nan, np.nan],
]
In [215]:
n = np.nanmean(a,0)
In [216]:
[np.isnan(i) for i in n]
Out[216]:
In [217]:
n
Out[217]:
In [218]:
np.count_nonzero(~np.isnan(X[0])) / float(X[0].size)
Out[218]:
In [219]:
# Fraction of non-NaN entries in each sample
t = []
for i in X:
    t.append(np.count_nonzero(~np.isnan(i)) / float(i.size))
In [220]:
pd.DataFrame(np.array(t)).describe()
Out[220]:
In [231]:
# Per-sample column means; fall back to the global column mean where a column is all NaN
XX = []
for i in X:
    nm = np.nanmean(i, 0)
    for idx, j in enumerate(nm):
        if np.isnan(j):
            nm[idx] = global_means[idx]
    XX.append(np.array(nm))
In [226]:
XX = [np.array(t).mean(0) for t in X]
In [227]:
split = 0.2
ps = int(len(XX) * (1-split))
X_train = XX[:ps]
y_train = y[:ps]
X_test = XX[ps:]
y_test = y[ps:]
In [228]:
etreg = ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
In [229]:
y_train[0]
Out[229]:
In [238]:
#%%time
#etreg = etreg.fit(X_train,y_train)
etreg = etreg.fit(XX,y)
In [74]:
%%time
et_score = cross_val_score(etreg, XX, y, cv=5)
print("Features: %s\nScore: %s\tMean: %.03f"%(columns, et_score,et_score.mean()))
In [155]:
pred = etreg.predict(X_test)
In [156]:
#pred = len(XX)
# Fall back to a small constant when fewer than 70% of the values are present
for idx, i in enumerate(X_test):
    if (np.count_nonzero(~np.isnan(i)) / float(i.size)) < 0.7:
        pred[idx] = 0.2
In [157]:
pred[1]
Out[157]:
In [158]:
err = (pred-y_test)**2
err.sum()/len(err)
Out[158]:
In [129]:
r = random.randrange(len(pred))
print(r)
print(pred[r])
print(y_test[r])
In [96]:
def marshall_palmer(ref, minutes_past):
    #print("Estimating rainfall from {0} observations".format(len(minutes_past)))
    # how long is each observation valid?
    valid_time = np.zeros_like(minutes_past)
    valid_time[0] = minutes_past.iloc[0]
    for n in xrange(1, len(minutes_past)):
        valid_time[n] = minutes_past.iloc[n] - minutes_past.iloc[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    # sum up rainrate * validtime
    total = 0
    for dbz, hours in zip(ref, valid_time):
        # Marshall-Palmer Z-R relation: Z = 200 * R^1.6 with Z = 10^(dBZ/10)
        # See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            total = total + mmperhr * hours
    return total

def simplesum(ref, hour):
    return hour.sum()

# each unique Id is an hour of data at some gauge
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour['Ref'], hour['minutes_past'])
    return est
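As a sanity check on the Z-R relation used above (Z = 200 * R^1.6 with Z = 10^(dBZ/10)), a single 30 dBZ observation corresponds to roughly 2.7 mm/h:
In [ ]:
# Marshall-Palmer: R = (10^(dBZ/10) / 200) ** (1/1.6)
dbz = 30.0
print(pow(pow(10, dbz / 10) / 200, 0.625))  # ~2.73 mm/h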
In [99]:
estimates = raw.groupby(raw.index).apply(myfunc)
estimates.head(20)
Out[99]:
In [127]:
err = (estimates-(np.hstack((y_train,y_test))))**2
err.sum()/len(err)
Out[127]:
Memento (mauri)
In [66]:
etreg = ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
In [67]:
"""
columns = train_clean.columns
columns = ["minutes_past","radardist_km","Ref","Ref_5x5_10th", "Ref_5x5_50th"]
columns = [u'Id', u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th', u'Expected']
"""
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
labels = train["Expected"].values
features = train[list(columns)].values
In [68]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
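Imputer replaces each NaN with the column mean learned during fit. A tiny sketch on a toy matrix (hypothetical values):
In [ ]:
# Column means are 2.0 and 4.0, so the NaN in the second column becomes 4.0
toy = np.array([[1.0, np.nan], [3.0, 4.0]])
print(Imputer(missing_values='NaN', strategy='mean', axis=0).fit_transform(toy))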
In [69]:
len(features_trans)
Out[69]:
In [70]:
split = 0.2
ps = int(len(features_trans) * split)
ftrain = features_trans[:ps]
ltrain = labels[:ps]
ftest = features_trans[ps:]
ltest = labels[ps:]
In [71]:
%%time
etreg.fit(ftrain,ltrain)
Out[71]:
In [72]:
def scorer(estimator, X, y):
    return (estimator.predict(X[0]) - y) ** 2
In [57]:
%%time
et_score = cross_val_score(etreg, features_trans, labels, cv=3)
print("Features: %s\nScore: %s\tMean: %.03f"%(columns, et_score,et_score.mean()))
In [73]:
r = random.randrange(len(ltrain))
print(r)
print(etreg.predict(ftrain[r]))
print(ltrain[r])
In [94]:
r = random.randrange(len(ltest))
print(r)
print(etreg.predict(ftest[r]))
print(ltest[r])
In [95]:
err = (etreg.predict(ftest)-ltest)**2
In [96]:
err.sum()/len(err)
Out[96]:
Submit
In [154]:
filename = "data/reduced_test_5000.csv"
test = pd.read_csv(filename)
In [164]:
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
features = test[list(columns)].values
In [165]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
In [166]:
fall = test[test.columns].values
In [177]:
fall[20]
Out[177]:
In [173]:
features_trans[0]
Out[173]:
In [188]:
# Accumulate predictions over the consecutive rows whose Id is 1
i = 1
pred = 0
while fall[i][0] == 1:
    #print(fall[i])
    pred += etreg.predict(features_trans[i])[0]
    #print(etreg.predict(features_trans[i])[0])
    i += 1
print(i)
In [192]:
fall[-1][0]
Out[192]:
In [202]:
%%time
# Average the per-row predictions for each Id (first 10000 rows only)
res = []
i = 0
while i < len(fall) and i < 10000:
    pred = 0
    lenn = 0
    curr = fall[i][0]
    while i < len(fall) and fall[i][0] == curr:
        #print(fall[i])
        pred += etreg.predict(features_trans[i])[0]
        #print(etreg.predict(features_trans[i])[0])
        i += 1
        lenn += 1
    res.append((curr, pred / lenn))
    #print(i)
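The same per-Id averaging can be done in one vectorized pass over all rows (the loop above stops at 10,000). A sketch, assuming predicting the whole features_trans array at once fits in memory and that test still carries its Id column:
In [ ]:
# Predict every row, then average the predictions per Id
preds = pd.Series(etreg.predict(features_trans), index=test['Id'])
res_vec = preds.groupby(level=0).mean()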
In [199]:
len(res)
Out[199]:
In [203]:
res[:10]
Out[203]:
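To turn res into an actual submission, the (Id, Expected) pairs can be written to CSV; a sketch, with 'submission.csv' as a placeholder filename:
In [ ]:
# Kaggle expects two columns: Id and Expected
sub = pd.DataFrame(res, columns=['Id', 'Expected'])
sub['Id'] = sub['Id'].astype(int)
sub.to_csv('submission.csv', index=False)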
In [97]:
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    #est = (hour['Id'], random.random())
    est = random.random()
    return est
In [98]:
def marshall_palmer(ref, minutes_past):
    #print("Estimating rainfall from {0} observations".format(len(minutes_past)))
    # how long is each observation valid?
    valid_time = np.zeros_like(minutes_past)
    valid_time[0] = minutes_past.iloc[0]
    for n in xrange(1, len(minutes_past)):
        valid_time[n] = minutes_past.iloc[n] - minutes_past.iloc[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    # sum up rainrate * validtime
    total = 0
    for dbz, hours in zip(ref, valid_time):
        # Marshall-Palmer Z-R relation: Z = 200 * R^1.6 with Z = 10^(dBZ/10)
        # See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            total = total + mmperhr * hours
    return total

def simplesum(ref, hour):
    return hour.sum()

# each unique Id is an hour of data at some gauge
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour['Ref'], hour['minutes_past'])
    return est
In [122]:
estimates = test.groupby(test.index).apply(myfunc)
estimates.head(20)
In [123]:
estimates = train.groupby(train.index).apply(myfunc)
estimates.head(20)
Out[123]:
In [100]:
train["Expected"].head(20)
Out[100]:
In [102]:
print(features_trans[0])
print(etreg.predict(features_trans[0]))
In [16]:
def marshall_palmer(data):
    # Predict each observation with the trained forest and average over the hour
    res = []
    for n in data:
        res.append(etreg.predict(n)[0])
    return np.array(res).mean()

def simplesum(ref, hour):
    return hour.sum()

def myfunc(hour):
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour[train.columns])
    return est
In [302]:
estimates = train_clean.groupby(train_clean.index).apply(myfunc)
estimates.head(20)
RNN: pad each Id's sequence of radar observations to a fixed length and train an LSTM regressor on it.
In [134]:
import pandas as pd
from random import random
flow = (list(range(1,10,1)) + list(range(10,1,-1)))*1000
pdata = pd.DataFrame({"a":flow, "b":flow})
pdata.b = pdata.b.shift(9)
data = pdata.iloc[10:] * random() # some noise
In [135]:
#columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
# u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
# u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
# u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
# u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
# u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
# u'Kdp_5x5_50th', u'Kdp_5x5_90th']
columns = [u'radardist_km', u'Ref', u'Ref_5x5_10th']
nb_features = len(columns)
data = train[list(columns)]
data.head(10)
Out[135]:
In [136]:
data.iloc[0].as_matrix()
Out[136]:
In [137]:
train.head(5)
Out[137]:
In [138]:
train.loc[11]
Out[138]:
In [139]:
train.loc[11][:1]["Expected"].as_matrix
Out[139]:
In [140]:
#train.index.unique()
In [141]:
def _load_data(data, n_prev=100):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in range(len(data) - n_prev):
        docX.append(data.iloc[i:i+n_prev].as_matrix())
        docY.append(data.iloc[i+n_prev].as_matrix())
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = int(round(len(df) * (1 - test_size)))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)
(X_train, y_train), (X_test, y_test) = train_test_split(data)
In [142]:
np.shape(X_train)
Out[142]:
In [144]:
t = np.array([2,1])
t.shape = (1,2)
t.tolist()[0]
Out[144]:
In [145]:
np.shape(t)
Out[145]:
In [146]:
X_train[:2,:2]
Out[146]:
In [44]:
train.index.unique()
Out[44]:
In [148]:
max_padding = 20
In [149]:
%%time
docX, docY = [], []
for i in train.index.unique():
    if isinstance(train.loc[i], pd.core.series.Series):
        m = [data.loc[i].as_matrix()]
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')  # pre-padding
        docX.append(pad)
        docY.append(float(train.loc[i]["Expected"]))
    else:
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
        docX.append(pad)
        docY.append(float(train.loc[i][:1]["Expected"]))
        #docY.append(train.loc[i][:1]["Expected"].as_matrix)
XX = np.array(docX)
yy = np.array(docY)
In [151]:
np.shape(XX)
Out[151]:
In [154]:
XX[0].mean()
Out[154]:
In [36]:
#from keras.preprocessing import sequence
#sequence.pad_sequences(X_train, maxlen=maxlen)
In [37]:
def _load_data(data):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in data.index.unique():
        #np.pad(tmp, ((0, max_padding - len(tmp)), (0, 0)), 'constant')
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((0, max_padding - len(m)), (0, 0)), 'constant')
        docX.append(pad)
        if isinstance(train.loc[i], pd.core.series.Series):
            docY.append(float(train.loc[i]["Expected"]))
        else:
            docY.append(float(train.loc[i][:1]["Expected"]))
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = int(round(len(df) * (1 - test_size)))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)
(X_train, y_train), (X_test, y_test) = train_test_split(train)
In [38]:
len(X_train[0])
Out[38]:
In [39]:
train.head()
Out[39]:
In [40]:
X_train[0][:10]
Out[40]:
In [41]:
yt = []
for i in y_train:
    yt.append([i[0]])
In [42]:
yt[0]
Out[42]:
In [439]:
X_train.shape
Out[439]:
In [450]:
len(fea[0])
Out[450]:
In [449]:
len(X_train[0][0])
Out[449]:
In [442]:
f = np.array(fea)
In [443]:
f.shape
In [428]:
#(X_train, y_train), (X_test, y_test) = train_test_split(data) # retrieve data
# and now train the model
# batch_size should be appropriate to your memory size
# number of epochs should be higher for real world problems
model.fit(X_train, yt, batch_size=450, nb_epoch=2, validation_split=0.05)
Out[428]:
In [37]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
In [38]:
%%time
input_dim = nb_features
out_dim = 1
hidden_dim = 200
model = Sequential()
#Embedding(input_dim, hidden_dim, mask_zero=True)
#model.add(LSTM(hidden_dim, hidden_dim, return_sequences=False))
model.add(LSTM(input_dim, hidden_dim, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(hidden_dim, out_dim))
model.add(Activation("linear"))
model.compile(loss="mean_squared_error", optimizer="rmsprop")
In [39]:
model.fit(XX, yy, batch_size=10, nb_epoch=10, validation_split=0.1)
Out[39]:
In [131]:
r = random.randrange(len(XX))
print(model.predict(XX[r:r+1])[0][0])
print(yy[r])