Script



In [1]:

    
# from __future__ import exam_success
from __future__ import absolute_import 
from __future__ import print_function  

# Standard imports
%matplotlib inline
import os
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd
import scipy.stats as stats

# Sk cheats
from sklearn.cross_validation import cross_val_score
from sklearn import grid_search
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
#from sklearn.preprocessing import Imputer   # get rid of nan
from sklearn.decomposition import NMF        # to add features based on the latent representation
from sklearn.decomposition import ProjectedGradientNMF

# Faster gradient boosting
import xgboost as xgb

# For neural networks models
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation 
from keras.optimizers import SGD, RMSprop









    



/Library/Python/2.7/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))



In [2]:

    
print("STARTED...")









    



STARTED...



In [3]:

    
#filename = "data/train.csv"
filename = "data/reduced_train_10000.csv"
raw = pd.read_csv(filename)
raw = raw.set_index('Id')









    



CPU times: user 44.6 ms, sys: 11.9 ms, total: 56.5 ms
Wall time: 61.9 ms



In [4]:

    
print("LOADED")









    



LOADED



In [5]:

    
# Considering that the gauge may concentrate the rainfall, we set the cap to 1000
# Comment this line to analyse the complete dataset 
l = len(raw)
raw = raw[raw['Expected'] < 300]  #1000
print("Dropped %d (%0.2f%%)"%(l-len(raw),(l-len(raw))/float(l)*100))









    



Dropped 678 (6.78%)



In [6]:

    
# We select all features except for the minutes past,
# because we ignore the time repartition of the sequence for now

features_columns = list([u'Ref', u'Ref_5x5_10th',
       u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
       u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
       u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
       u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
       u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
       u'Kdp_5x5_50th', u'Kdp_5x5_90th'])

def getXy(raw):
    selected_columns = list([ u'minutes_past',u'radardist_km', u'Ref', u'Ref_5x5_10th',
       u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
       u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
       u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
       u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
       u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
       u'Kdp_5x5_50th', u'Kdp_5x5_90th'])
    
    data = raw[selected_columns]
    
    docX, docY = [], []
    for i in data.index.unique():
        if isinstance(data.loc[i],pd.core.series.Series):
            m = [data.loc[i].as_matrix()]
            docX.append(m)
            docY.append(float(raw.loc[i]["Expected"]))
        else:
            m = data.loc[i].as_matrix()
            docX.append(m)
            docY.append(float(raw.loc[i][:1]["Expected"]))
    X , y = np.array(docX) , np.array(docY)
    return X,y



In [7]:

    
noFullNan = raw.loc[raw[features_columns].dropna(how='all').index.unique()]



In [8]:

    
X,y=getXy(noFullNan)



In [9]:

    
global_means = np.nanmean(noFullNan,0)

XX=[]
for t in X:
    nm = np.nanmean(t,0)
    for idx,j in enumerate(nm):
        if np.isnan(j):
            nm[idx]=global_means[idx]
    XX.append(nm)
XX=np.array(XX)

# rescale to clip min at 0 (for non negative matrix factorization)
XX_rescaled=XX[:,:]-np.min(XX,0)









    



/Library/Python/2.7/site-packages/numpy/lib/nanfunctions.py:675: RuntimeWarning: Mean of empty slice
  warnings.warn("Mean of empty slice", RuntimeWarning)



In [10]:

    
nmf = NMF(max_iter=5000)
W = nmf.fit_transform(XX_rescaled)
#H = nn.components_









    



/Library/Python/2.7/site-packages/sklearn/decomposition/nmf.py:252: UserWarning: Iteration limit reached in nls subproblem.
  warnings.warn("Iteration limit reached in nls subproblem.")



In [11]:

    
# reduce the sequence structure of the data and produce
# new hopefully informatives features
def addFeatures(X,mf=0):
    # used to fill fully empty datas
    #global_means = np.nanmean(X,0)
    
    XX=[]
    nbFeatures=float(len(X[0][0]))
    for idxt,t in enumerate(X):
        
        # compute means, ignoring nan when possible, marking it when fully filled with nan
        nm = np.nanmean(t,0)
        tt=[]
        for idx,j in enumerate(nm):
            if np.isnan(j):
                nm[idx]=global_means[idx]
                tt.append(1)
            else:
                tt.append(0)
        tmp = np.append(nm,np.append(tt,tt.count(0)/nbFeatures))
        
        # faster if working on fully filled data:
        #tmp = np.append(np.nanmean(np.array(t),0),(np.array(t)[1:] - np.array(t)[:-1]).sum(0) )
        
        # add the percentiles
        tmp = np.append(tmp,np.nanpercentile(t,10,axis=0))
        tmp = np.append(tmp,np.nanpercentile(t,50,axis=0))
        tmp = np.append(tmp,np.nanpercentile(t,90,axis=0))
        
        for idx,i in enumerate(tmp):
            if np.isnan(i):
                tmp[idx]=0

        # adding the dbz as a feature
        test = t
        try:
            taa=test[:,0]
        except TypeError:
            taa=[test[0][0]]
        valid_time = np.zeros_like(taa)
        valid_time[0] = taa[0]
        for n in xrange(1,len(taa)):
            valid_time[n] = taa[n] - taa[n-1]
        valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
        valid_time = valid_time / 60.0


        sum=0
        try:
            column_ref=test[:,2]
        except TypeError:
            column_ref=[test[0][2]]
        for dbz, hours in zip(column_ref, valid_time):
            # See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
            if np.isfinite(dbz):
                mmperhr = pow(pow(10, dbz/10)/200, 0.625)
                sum = sum + mmperhr * hours
        
        if not(mf is 0):
            tmp = np.append(tmp,mf[idxt])

        XX.append(np.append(np.array(sum),tmp))
        #XX.append(np.array([sum]))
        #XX.append(tmp)
    return np.array(XX)



In [12]:

    
XX=addFeatures(X,mf=W)









    



/Library/Python/2.7/site-packages/numpy/lib/nanfunctions.py:993: RuntimeWarning: All-NaN slice encountered
  warnings.warn("All-NaN slice encountered", RuntimeWarning)



In [13]:

    
print("FEATURES EXTRACTED")









    



FEATURES EXTRACTED



In [14]:

    
etreg = ExtraTreesRegressor(n_estimators=200, max_depth=None, min_samples_split=1, n_jobs=8)
etreg = etreg.fit(XX,y)



In [15]:

    
print("MODEL 1 TRAINED")









    



MODEL 1 TRAINED



In [16]:

    
etreg2 = ExtraTreesRegressor(n_estimators=200, max_depth=None, min_samples_split=1, n_jobs=8)
etreg2 = etreg.fit(XX,y)



In [17]:

    
print("MODEL 2 TRAINED")









    



MODEL 2 TRAINED



In [18]:

    
etreg3 = ExtraTreesRegressor(n_estimators=200, max_depth=None, min_samples_split=1, n_jobs=8)
etreg3 = etreg.fit(XX,y)



In [19]:

    
print("MODEL 3 TRAINED")









    



MODEL 3 TRAINED



In [20]:

    
#knn,etreg,rfr,xgbr,gbr

Predict on testset



In [21]:

    
#filename = "data/test.csv"
filename = "data/reduced_test_5000.csv"
test = pd.read_csv(filename)
test = test.set_index('Id')



In [22]:

    
print("TEST LOADED")









    



TEST LOADED



In [23]:

    
features_columns = list([u'Ref', u'Ref_5x5_10th',
       u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
       u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
       u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
       u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
       u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
       u'Kdp_5x5_50th', u'Kdp_5x5_90th'])

def getX(raw):
    selected_columns = list([ u'minutes_past',u'radardist_km', u'Ref', u'Ref_5x5_10th',
       u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
       u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
       u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
       u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
       u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
       u'Kdp_5x5_50th', u'Kdp_5x5_90th'])
    
    data = raw[selected_columns]
    
    docX= []
    for i in data.index.unique():
        if isinstance(data.loc[i],pd.core.series.Series):
            m = [data.loc[i].as_matrix()]
            docX.append(m)
        else:
            m = data.loc[i].as_matrix()
            docX.append(m)
    X = np.array(docX)
    return X



In [24]:

    
#%%time
#X=getX(test)

#tmp = []
#for i in X:
#    tmp.append(len(i))
#tmp = np.array(tmp)
#sns.countplot(tmp,order=range(tmp.min(),tmp.max()+1))
#plt.title("Number of ID per number of observations\n(On test dataset)")
#plt.plot()



In [25]:

    
#testFull = test.dropna()
testNoFullNan = test.loc[test[features_columns].dropna(how='all').index.unique()]



In [26]:

    
test_X=getX(testNoFullNan)  # 1min
#XX = [np.array(t).mean(0) for t in X]  # 10s



In [27]:

    
test_XX=[]
for t in test_X:
    nm = np.nanmean(t,0)
    for idx,j in enumerate(nm):
        if np.isnan(j):
            nm[idx]=global_means[idx]
    test_XX.append(nm)
test_XX=np.array(test_XX)

# rescale to clip min at 0 (for non negative matrix factorization)
test_XX_rescaled=test_XX[:,:]-np.min(test_XX,0)



In [28]:

    
test_W = nmf.transform(test_XX_rescaled)



In [29]:

    
test_XX=addFeatures(test_X,mf=test_W)



In [30]:

    
print("PREDICTING...")









    



PREDICTING...



In [31]:

    
reducedModelList = [etreg,etreg2,etreg3]
test_globalPred = np.array([f.predict(test_XX) for f in reducedModelList]).T
predTest = test_globalPred.mean(1)



In [32]:

    
predFull = zip(testNoFullNan.index.unique(),predTest)



In [33]:

    
testNan = test.drop(test[features_columns].dropna(how='all').index)



In [34]:

    
tmp = np.empty(len(testNan))
tmp.fill(0.445000)   # 50th percentile of full Nan dataset
predNan = zip(testNan.index.unique(),tmp)



In [35]:

    
pred = predFull + predNan



In [36]:

    
pred.sort(key=lambda x: x[0], reverse=False)



In [37]:

    
submission = pd.DataFrame(pred)
submission.columns = ["Id","Expected"]
submission.head()



In [38]:

    
print("DUMPING")









    



DUMPING



In [39]:

    
submission.loc[submission['Expected']<0,'Expected'] = 0.01



In [40]:

    
submission.to_csv("submit6.csv",index=False)



In [41]:

    
print("OVER")









    



OVER



In [ ]:

	Id	Expected
0	1	2.161731
1	2	17.276049
2	3	3.113642
3	4	6.035318
4	5	0.445000