In [1]:
# from __future__ import exam_success
from __future__ import absolute_import
from __future__ import print_function
%matplotlib inline
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd
# Sk cheats
from sklearn.cross_validation import cross_val_score # cross val
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import Imputer # get rid of nan
Load the training data, reduced to 10,000 rows
In [2]:
filename = "data/reduced_train_10000.csv"
train = pd.read_csv(filename)
train = train.set_index('Id')
train = train.dropna()
In [3]:
train.head()
Out[3]:
In [4]:
train["Expected"].describe()
Out[4]:
Get rid of NaN values for now (the dropna() call above already handles this, so the column mask below stays commented out)
In [5]:
#train_clean = train[[not i for i in np.isnan(train["Ref_5x5_10th"])]]
The forums indicate that rainfall above 1 m (1000 mm) is almost certainly a measurement error, which is quite understandable. We filter those rows out.
In [6]:
train = train[train['Expected'] < 1000]
In [7]:
train['Expected'].describe()
Out[7]:
Memento (mauri)
In [8]:
etreg = ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
In [9]:
"""
columns = train_clean.columns
columns = ["minutes_past","radardist_km","Ref","Ref_5x5_10th", "Ref_5x5_50th"]
columns = [u'Id', u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th', u'Expected']
"""
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
labels = train["Expected"].values
features = train[list(columns)].values
In [10]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
In [10]:
ftrain = features_trans[:3000]
ltrain = labels[:3000]
ftest = features_trans[3000:]
ltest = labels[3000:]
In [139]:
%%time
etreg.fit(ftrain,ltrain)
Out[139]:
In [89]:
def scorer(estimator, X, y):
    # custom callable scorer with the (estimator, X, y) signature: mean squared error
    return ((estimator.predict(X) - y) ** 2).mean()
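The scorer above is never actually handed to cross_val_score below, so the next cell falls back on the regressor's default R² score. A minimal sketch of wiring it in, assuming the callable scorer(estimator, X, y) interface (et_mse is just a throwaway name):
In [ ]:
et_mse = cross_val_score(etreg, features_trans, labels, cv=5, scoring=scorer)
print("Per-fold MSE: %s\tMean: %.03f" % (et_mse, et_mse.mean()))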
In [140]:
%%time
et_score = cross_val_score(etreg, features_trans, labels, cv=5)
print("Features: %s\nScore: %s\tMean: %.03f"%(columns, et_score,et_score.mean()))
In [141]:
r = random.randrange(len(ltrain))
print(r)
print(etreg.predict(ftrain[r]))
print(ltrain[r])
In [153]:
r = random.randrange(len(ltest))
print(r)
print(etreg.predict(ftest[r]))
print(ltest[r])
In [143]:
err = (etreg.predict(ftest)-ltest)**2
In [144]:
err.sum()/len(err)
Out[144]:
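As far as I recall, the leaderboard for this competition is scored on mean absolute error rather than squared error, so it is worth checking that metric on the same held-out slice (the metric is an assumption, not something stated in this notebook):
In [ ]:
# mean absolute error on the held-out rows
print(np.abs(etreg.predict(ftest) - ltest).mean())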
Submit: predict on the reduced test set and average the per-record predictions for each Id
In [154]:
filename = "data/reduced_test_5000.csv"
test = pd.read_csv(filename)
In [164]:
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
features = test[list(columns)].values
In [165]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
In [166]:
fall = test[test.columns].values
In [177]:
fall[20]
Out[177]:
In [173]:
features_trans[0]
Out[173]:
In [188]:
i = 1
pred = 0
while fall[i][0] == 1:
    #print(fall[i])
    pred += etreg.predict(features_trans[i])[0]
    #print(etreg.predict(features_trans[i])[0])
    i += 1
print(i)
In [192]:
fall[-1][0]
Out[192]:
In [202]:
%%time
res = []
i = 0
# walk the test rows, averaging the per-record predictions over each hour (Id)
while i < len(fall) and i < 10000:
    pred = 0
    lenn = 0
    curr = fall[i][0]
    while i < len(fall) and fall[i][0] == curr:
        #print(fall[i])
        pred += etreg.predict(features_trans[i])[0]
        #print(etreg.predict(features_trans[i])[0])
        i += 1
        lenn += 1
    res.append((curr, pred/lenn))
    #i+=1
    #print(i)
In [199]:
len(res)
Out[199]:
In [203]:
res[:10]
Out[203]:
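res now holds one (Id, averaged prediction) pair per hour. A minimal sketch of turning it into a submission file, assuming the usual two-column Id,Expected format (the header and output path are assumptions and should be checked against the sample submission file):
In [ ]:
# illustrative only: column order and output path are assumptions
sub = pd.DataFrame(res, columns=["Id", "Expected"])
sub["Id"] = sub["Id"].astype(int)
sub.to_csv("data/submission_etreg.csv", index=False)
sub.head()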
In [11]:
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    #est = (hour['Id'],random.random())
    est = random.random()
    return est
In [12]:
def marshall_palmer(ref, minutes_past):
    #print("Estimating rainfall from {0} observations".format(len(minutes_past)))
    # how long is each observation valid?
    valid_time = np.zeros_like(minutes_past)
    valid_time[0] = minutes_past.iloc[0]
    for n in xrange(1, len(minutes_past)):
        valid_time[n] = minutes_past.iloc[n] - minutes_past.iloc[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    # sum up rainrate * validtime
    sum = 0
    for dbz, hours in zip(ref, valid_time):
        # See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            sum = sum + mmperhr * hours
    return sum

def simplesum(ref, hour):
    hour.sum()

# each unique Id is an hour of data at some gauge
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour['Ref'], hour['minutes_past'])
    return est
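The line pow(pow(10, dbz/10)/200, 0.625) is the Marshall-Palmer Z-R relation Z = 200 * R^1.6 solved for the rain rate, R = (10^(dBZ/10) / 200)^(1/1.6), in mm/hr. A quick sanity check on a single, arbitrarily chosen reflectivity value:
In [ ]:
dbz = 30.0                     # arbitrary sample reflectivity in dBZ
z = 10 ** (dbz / 10)           # linear reflectivity factor
rate = (z / 200) ** 0.625      # Marshall-Palmer: Z = 200 * R**1.6
print(rate)                    # roughly 2.7 mm/hr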
In [13]:
estimates = train.groupby(train.index).apply(myfunc)
estimates.head(20)
Out[13]:
In [14]:
train["Expected"].head(20)
Out[14]:
In [15]:
res = []
i = 0
while i < len(fall):
    pred = 0
    lenn = 0
    curr = fall[i][0]
    while i < len(fall) and fall[i][0] == curr:
        #print(fall[i])
        pred += etreg.predict(features_trans[i])[0]
        #print(etreg.predict(features_trans[i])[0])
        i += 1
        lenn += 1
    res.append((curr, pred/lenn))
In [178]:
etreg.predict(features_trans[0])
Out[178]:
In [16]:
def marshall_palmer(data):
    # not the real Marshall-Palmer any more: average the tree predictions over the hour
    res = []
    for n in data.as_matrix():
        res.append(etreg.predict(n)[0])
    return np.array(res).mean()

def simplesum(ref, hour):
    hour.sum()

def myfunc(hour):
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour[list(columns)])
    return est
In [302]:
estimates = train.groupby(train.index).apply(myfunc)
estimates.head(20)
In [ ]:
In [ ]:
RNN: treat each hour of radar records as a padded sequence and regress the hourly rain total with an LSTM
In [11]:
import pandas as pd
from random import random
flow = (list(range(1,10,1)) + list(range(10,1,-1)))*1000
pdata = pd.DataFrame({"a":flow, "b":flow})
pdata.b = pdata.b.shift(9)
data = pdata.iloc[10:] * random() # some noise
In [12]:
#columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
# u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
# u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
# u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
# u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
# u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
# u'Kdp_5x5_50th', u'Kdp_5x5_90th']
columns = [u'radardist_km', u'Ref', u'Ref_5x5_10th']
nb_features = len(columns)
data = train[list(columns)]
data.head(10)
Out[12]:
In [13]:
data.iloc[0].as_matrix()
Out[13]:
In [14]:
train.head(5)
Out[14]:
In [15]:
train.loc[11]
Out[15]:
In [16]:
train.loc[11][:1]["Expected"].as_matrix()
Out[16]:
In [17]:
#train.index.unique()
In [18]:
def _load_data(data, n_prev = 100):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in range(len(data)-n_prev):
        docX.append(data.iloc[i:i+n_prev].as_matrix())
        docY.append(data.iloc[i+n_prev].as_matrix())
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = round(len(df) * (1 - test_size))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)

(X_train, y_train), (X_test, y_test) = train_test_split(data)
In [19]:
np.shape(X_train)
Out[19]:
In [20]:
t = np.array([2,1])
t.shape = (1,2)
t.tolist()[0]
Out[20]:
In [21]:
np.shape(t)
Out[21]:
In [22]:
X_train[:2,:2]
Out[22]:
In [23]:
XX[:2,:2]
In [29]:
XX[:2][:2]
In [30]:
np.shape(XX)
In [31]:
for i in XX:
print(np.shape(i))
In [24]:
np.shape(XX[0])
In [28]:
z = np.zeros([297,9,23])
In [29]:
np.shape(z)
Out[29]:
In [791]:
np.shape(np.reshape(XX,(297,1)))
Out[791]:
In [716]:
tl = train.loc[2][:1]["Expected"]
In [718]:
tl.as_blocks()
Out[718]:
In [719]:
tl.as_matrix()
Out[719]:
In [777]:
data.iloc[2:4].as_matrix()
Out[777]:
In [776]:
train.loc[2].as_matrix()
Out[776]:
In [46]:
m = data.loc[10].as_matrix()
pad = np.pad(m, ((0, max_padding -len(m) ),(0,0)), 'constant')
In [47]:
pad
Out[47]:
In [44]:
train.index.unique()
Out[44]:
In [25]:
max_padding = 20
In [26]:
%%time
docX, docY = [], []
for i in train.index.unique():
    if isinstance(train.loc[i], pd.core.series.Series):
        # a single-record hour comes back as a Series, so wrap it to keep 2-D shape
        m = [data.loc[i].as_matrix()]
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')  # pre-padding
        docX.append(pad)
        docY.append(float(train.loc[i]["Expected"]))
    else:
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
        docX.append(pad)
        docY.append(float(train.loc[i][:1]["Expected"]))
        #docY.append(train.loc[i][:1]["Expected"].as_matrix)
XX = np.array(docX)
yy = np.array(docY)
In [27]:
np.shape(XX)
Out[27]:
In [28]:
#from keras.preprocessing import sequence
#sequence.pad_sequences(X_train, maxlen=maxlen)
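Keras ships a padding helper that could replace the manual np.pad loop above; whether it accepts per-timestep feature vectors rather than plain integer sequences depends on the Keras version, so treat this as a sketch only (seqs and XX_alt are throwaway names):
In [ ]:
# assumption: this Keras version's pad_sequences handles 2-D (timesteps x features) sequences
from keras.preprocessing import sequence
seqs = [np.atleast_2d(data.loc[i].as_matrix()) for i in train.index.unique()]
XX_alt = sequence.pad_sequences(seqs, maxlen=max_padding, dtype='float32', padding='pre')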
In [29]:
def _load_data(data):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in data.index.unique():
        #np.pad(tmp, ((0, max_padding -len(tmp) ),(0,0)), 'constant')
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((0, max_padding - len(m)), (0, 0)), 'constant')
        docX.append(pad)
        if isinstance(train.loc[i], pd.core.series.Series):
            docY.append(float(train.loc[i]["Expected"]))
        else:
            docY.append(float(train.loc[i][:1]["Expected"]))
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = round(len(df) * (1 - test_size))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)

(X_train, y_train), (X_test, y_test) = train_test_split(train)
In [30]:
len(X_train[0])
Out[30]:
In [31]:
train.head()
Out[31]:
In [32]:
X_train[0][:10]
Out[32]:
In [33]:
yt = []
for i in y_train:
    yt.append([i[0]])
In [34]:
yt[0]
Out[34]:
In [35]:
X_train.shape
Out[35]:
In [36]:
len(fea[0])
In [37]:
len(X_train[0][0])
Out[37]:
In [38]:
f = np.array(fea)
In [443]:
f.shape
In [39]:
XX[0]
Out[39]:
In [428]:
#(X_train, y_train), (X_test, y_test) = train_test_split(data) # retrieve data
# and now train the model
# batch_size should be appropriate to your memory size
# number of epochs should be higher for real world problems
model.fit(X_train, yt, batch_size=450, nb_epoch=2, validation_split=0.05)
Out[428]:
In [43]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
In [44]:
%%time
input_dim = nb_features
out_dim = 1
hidden_dim = 200
model = Sequential()
#Embedding(input_dim, hidden_dim, mask_zero=True)
#model.add(LSTM(hidden_dim, hidden_dim, return_sequences=False))
model.add(LSTM(input_dim, hidden_dim, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(hidden_dim, out_dim))
model.add(Activation("linear"))
model.compile(loss="mean_squared_error", optimizer="rmsprop")
In [45]:
model.fit(XX, yy, batch_size=10, nb_epoch=10, validation_split=0.1)
Out[45]:
In [46]:
idx = np.random.randint(len(XX))  # pick a random hour; the name 'random' was shadowed by 'from random import random' above
print(model.predict(XX[idx:idx+1])[0][0])
print(yy[idx])
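To turn the LSTM into a submission, the test hours would need the same per-Id padding as the training data. A rough sketch, reusing the three RNN feature columns; the NaN handling, variable names, and output path are assumptions:
In [ ]:
# illustrative only: pad each test hour, predict one value per Id, write Id/Expected rows
test_data = test.set_index('Id')[list(columns)]
docX_test, ids = [], []
for i in test_data.index.unique():
    m = np.atleast_2d(test_data.loc[i].as_matrix())
    m = m[:max_padding]                                   # truncate hours longer than the padding window
    pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
    docX_test.append(pad)
    ids.append(i)
XX_test = np.nan_to_num(np.array(docX_test))              # the network cannot ingest NaNs
preds = model.predict(XX_test)[:, 0]
sub = pd.DataFrame({"Id": ids, "Expected": preds})[["Id", "Expected"]]
sub.to_csv("data/submission_lstm.csv", index=False)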
In [ ]: