In [97]:
# from __future__ import exam_success
from __future__ import absolute_import
from __future__ import print_function
%matplotlib inline
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd
import scipy.stats as stats
# scikit-learn models and utilities
from sklearn.cross_validation import cross_val_score  # cross-validation (pre-0.20 sklearn API)
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import Imputer  # imputation of NaN values
from sklearn.neighbors import KNeighborsRegressor
from sklearn import grid_search  # GridSearchCV (pre-0.20 sklearn API)
import os
Predictions are made for the US corn-growing states (mainly Iowa, Illinois and Indiana) during the season with the highest rainfall (as illustrated by Iowa for the April to August months).
The Kaggle page indicates that the dataset has been shuffled, so working on a subset seems acceptable.
However, the test set is not extracted from the same data as the training set, which makes the evaluation trickier.
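For reference, a reduced file such as reduced_train_10000.csv can be built by reading the first rows of train.csv and dropping the last, possibly truncated, Id. This is only a minimal sketch; how the reduced files used below were actually generated is not documented here.
import pandas as pd

# Take the first 10,000 rows of the full training file (path assumed).
subset = pd.read_csv("data/train.csv", nrows=10000)

# The last Id may be cut mid-sequence; keep only complete Ids.
subset = subset[subset["Id"] != subset["Id"].iloc[-1]]
subset.to_csv("data/reduced_train_10000.csv", index=False)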
In [111]:
%%time
#filename = "data/train.csv"
filename = "data/reduced_train_10000.csv"
#filename = "data/reduced_train_1000000.csv"
raw = pd.read_csv(filename)
raw = raw.set_index('Id')
In [112]:
raw.columns
Out[112]:
In [110]:
testFull.columns
Out[110]:
In [3]:
raw['Expected'].describe()
Out[3]:
Per Wikipedia, a rain rate of more than 421 mm/h is considered "Extreme/large hail".
If we encounter a value of 327.40 metres per hour, we should probably start building Noah's ark.
Therefore, it seems reasonable to drop excessively large values and treat them as outliers.
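To pick the cutoff, it helps to look at the upper quantiles of 'Expected' first (a quick check on the loaded subset, not part of the original pipeline):
# Upper quantiles of the gauge readings: where the distribution stops looking
# like rain and starts looking like sensor error.
print(raw['Expected'].quantile([0.50, 0.90, 0.99, 0.999]))
print("Max: %.2f mm" % raw['Expected'].max())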
In [87]:
# Considering that the gauge may concentrate the rainfall, we cap 'Expected' at 300 mm
# (1000 is a looser alternative). Comment the filter out to analyse the complete dataset.
l = len(raw)
raw = raw[raw['Expected'] < 300]  # or < 1000
print("Dropped %d (%0.2f%%)"%(l-len(raw),(l-len(raw))/float(l)*100))
In [5]:
raw.head(5)
Out[5]:
In [6]:
raw.describe()
Out[6]:
We group the data by Id, building one observation sequence and one target per Id (the isinstance check below handles Ids with a single observation; see the toy example after the cell).
In [7]:
# We select all features except minutes_past,
# because we ignore the temporal structure of the sequence for now
features_columns = list([u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th'])
def getXy(raw):
selected_columns = list([ u'minutes_past',u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th'])
data = raw[selected_columns]
docX, docY = [], []
for i in data.index.unique():
if isinstance(data.loc[i],pd.core.series.Series):
m = [data.loc[i].as_matrix()]
docX.append(m)
docY.append(float(raw.loc[i]["Expected"]))
else:
m = data.loc[i].as_matrix()
docX.append(m)
docY.append(float(raw.loc[i][:1]["Expected"]))
X , y = np.array(docX) , np.array(docY)
return X,y
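The isinstance check above handles Ids with a single radar observation, where data.loc[i] returns a Series rather than a DataFrame. A toy illustration (hypothetical data, only to show the two cases):
import pandas as pd

toy = pd.DataFrame({'Ref': [10.0, 12.0, 15.0]}, index=[1, 1, 2])
print(type(toy.loc[1]))  # DataFrame: Id 1 has two observations
print(type(toy.loc[2]))  # Series: Id 2 has a single observation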
In [88]:
#noAnyNan = raw.loc[raw[features_columns].dropna(how='any').index.unique()]
noAnyNan = raw.dropna()
In [89]:
noFullNan = raw.loc[raw[features_columns].dropna(how='all').index.unique()]
In [90]:
fullNan = raw.drop(raw[features_columns].dropna(how='all').index)
As a first try, we train on the complete rows only, and fall back to the 50th percentile for Ids whose radar features are partially or fully missing.
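The intended routing can be summed up as follows. This is a sketch only; the fallback constants match the values used in the submission section below.
import numpy as np

def route_prediction(features, model, median_all_nan=0.445, median_partial_nan=1.27):
    features = np.asarray(features, dtype=float)
    if not np.isnan(features).any():    # complete observation -> model prediction
        return model.predict([features])[0]
    if np.isnan(features).all():        # nothing measured -> fully-missing fallback
        return median_all_nan
    return median_partial_nan           # partially measured -> second fallback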
In [91]:
%%time
X,y=getXy(noAnyNan)
In [12]:
%%time
#XX = [np.array(t).mean(0) for t in X]
XX = [np.append(np.nanmean(np.array(t),0),(np.array(t)[1:] - np.array(t)[:-1]).sum(0) ) for t in X]
In [92]:
%%time
XX = []
for t in X:
    arr = np.array(t)
    # Per-feature mean and summed first differences over the sequence
    tmp = np.append(np.nanmean(arr, 0), (arr[1:] - arr[:-1]).sum(0))
    # Per-feature 10th/50th/90th percentiles
    tmp = np.append(tmp, np.percentile(t, 10, axis=0))
    tmp = np.append(tmp, np.percentile(t, 50, axis=0))
    tmp = np.append(tmp, np.percentile(t, 90, axis=0))
    seq = t
    try:
        taa = seq[:, 0]            # minutes_past column
    except TypeError:
        taa = [seq[0][0]]          # single-observation Id: seq is a plain list
    # How long each observation is valid for, converted to hours
    valid_time = np.zeros_like(taa)
    valid_time[0] = taa[0]
    for n in xrange(1, len(taa)):
        valid_time[n] = taa[n] - taa[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    rain_sum = 0
    try:
        column_ref = seq[:, 2]     # Ref column
    except TypeError:
        column_ref = [seq[0][2]]
    for dbz, hours in zip(column_ref, valid_time):
        # Marshall-Palmer estimate, see: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            rain_sum = rain_sum + mmperhr * hours
    XX.append(np.append(np.array(rain_sum), tmp))
    #XX.append(np.array([rain_sum]))   # alternative: Marshall-Palmer sum only
    #XX.append(tmp)                    # alternative: summary statistics only
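The dBZ to rain-rate conversion used above is the Marshall-Palmer relation Z = 200 R^1.6, inverted as R = (10^(dBZ/10) / 200)^0.625 with R in mm/h. A standalone sanity check of the conversion (values purely illustrative):
def marshall_palmer_rate(dbz):
    # Rain rate in mm/h from reflectivity in dBZ, assuming Z = 200 * R**1.6
    z = 10.0 ** (dbz / 10.0)       # dBZ -> linear reflectivity Z
    return (z / 200.0) ** 0.625    # invert Z = 200 * R**1.6

for dbz in (20, 30, 40, 50):
    print("%d dBZ -> %.2f mm/h" % (dbz, marshall_palmer_rate(dbz)))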
In [14]:
XX[2]
Out[14]:
In [93]:
def splitTrainTest(X, y, split=0.2):
tmp1, tmp2 = [], []
ps = int(len(X) * (1-split))
index_shuf = range(len(X))
random.shuffle(index_shuf)
for i in index_shuf:
tmp1.append(X[i])
tmp2.append(y[i])
return tmp1[:ps], tmp2[:ps], tmp1[ps:], tmp2[ps:]
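For reference, the same shuffled split can be obtained with scikit-learn's built-in helper (still under sklearn.cross_validation in the pre-0.20 API used here). Shown only as an equivalent; the cells below keep using splitTrainTest.
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in 0.18+

X_tr, X_te, y_tr, y_te = train_test_split(XX, y, test_size=0.2, random_state=0)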
In [94]:
X_train,y_train, X_test, y_test = splitTrainTest(XX,y)
In [22]:
def manualScorer(estimator, X, y):
    # Negative mean squared error (GridSearchCV maximises the score)
    err = (estimator.predict(X) - y)**2
    return -err.sum()/len(err)
Best hyperparameters found so far: max depth 24, number of trees 84, min samples per leaf 17, min samples to split 51.
In [21]:
from sklearn import svm
In [18]:
svr = svm.SVR(C=100000)
In [ ]:
%%time
svr = svr.fit(X_train,y_train)
In [ ]:
err = (svr.predict(X_train)-y_train)**2
err.sum()/len(err)
In [ ]:
err = (svr.predict(X_test)-y_test)**2
err.sum()/len(err)
In [25]:
%%time
svr_score = cross_val_score(svr, XX, y, cv=5)
print("Score: %s\nMean: %.03f"%(svr_score,svr_score.mean()))
In [27]:
knn = KNeighborsRegressor(n_neighbors=6,weights='distance',algorithm='ball_tree')
In [28]:
#parameters = {'weights':('distance','uniform'),'algorithm':('auto', 'ball_tree', 'kd_tree', 'brute')}
parameters = {'n_neighbors':range(1,10,1)}
grid_knn = grid_search.GridSearchCV(knn, parameters,scoring=manualScorer)
In [29]:
%%time
grid_knn.fit(X_train,y_train)
Out[29]:
In [30]:
print(grid_knn.grid_scores_)
print("Best: ",grid_knn.best_params_)
In [31]:
knn = grid_knn.best_estimator_
In [32]:
knn= knn.fit(X_train,y_train)
In [33]:
err = (knn.predict(X_train)-y_train)**2
err.sum()/len(err)
Out[33]:
In [34]:
err = (knn.predict(X_test)-y_test)**2
err.sum()/len(err)
Out[34]:
In [35]:
etreg = ExtraTreesRegressor(n_estimators=200, max_depth=None, min_samples_split=1, random_state=0)
In [36]:
parameters = {'n_estimators':range(100,200,20)}
grid_rf = grid_search.GridSearchCV(etreg, parameters,n_jobs=2,scoring=manualScorer)
In [37]:
%%time
grid_rf.fit(X_train,y_train)
Out[37]:
In [38]:
print(grid_rf.grid_scores_)
print("Best: ",grid_rf.best_params_)
In [39]:
grid_rf.best_params_
Out[39]:
In [40]:
es = etreg
#es = grid_rf.best_estimator_
In [41]:
%%time
es = es.fit(X_train,y_train)
In [42]:
err = (es.predict(X_train)-y_train)**2
err.sum()/len(err)
Out[42]:
In [43]:
err = (es.predict(X_test)-y_test)**2
err.sum()/len(err)
Out[43]:
In [98]:
gbr = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=900,
subsample=1.0, min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0.0, max_depth=4, init=None,
random_state=None, max_features=None, alpha=0.5,
verbose=0, max_leaf_nodes=None, warm_start=False)
In [99]:
%%time
gbr = gbr.fit(X_train,y_train)
os.system('say "終わりだ"') # "it's over!" (macOS text-to-speech notification)
In [80]:
# Each grid below was tried at some point; as written (all commented out),
# grid_gbr reuses the last `parameters` defined above (the n_estimators grid).
#parameters = {'max_depth':range(2,5,1),'alpha':[0.5,0.6,0.7,0.8,0.9]}
#parameters = {'subsample':[0.2,0.4,0.5,0.6,0.8,1]}
#parameters = {'subsample':[0.2,0.5,0.6,0.8,1],'n_estimators':[800,1000,1200]}
#parameters = {'max_depth':range(2,4,1)}
#parameters = {'n_estimators':[600,800,900]}
#parameters = {'loss':['ls', 'lad', 'huber', 'quantile'],'alpha':[0.3,0.5,0.8,0.9]}
grid_gbr = grid_search.GridSearchCV(gbr, parameters,n_jobs=2,scoring=manualScorer)
In [81]:
%%time
grid_gbr = grid_gbr.fit(X_train,y_train)
In [82]:
print(grid_gbr.grid_scores_)
print("Best: ",grid_gbr.best_params_)
In [85]:
err = (gbr.predict(X_train)-y_train)**2
print(err.sum()/len(err))
err = (gbr.predict(X_test)-y_test)**2
print(err.sum()/len(err))
In [100]:
err = (gbr.predict(X_train)-y_train)**2
print(err.sum()/len(err))
err = (gbr.predict(X_test)-y_test)**2
print(err.sum()/len(err))
In [ ]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, RMSprop
in_dim = len(XX[0])
out_dim = 1
model = Sequential()
# Dense(128) is a fully-connected layer with 128 hidden units.
# The first layer must specify the expected input data shape:
# here, vectors of length in_dim.
model.add(Dense(128, input_shape=(in_dim,)))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(1, init='uniform'))
model.add(Activation('linear'))
#sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
#model.compile(loss='mean_squared_error', optimizer=sgd)
rms = RMSprop()
model.compile(loss='mean_squared_error', optimizer=rms)
#model.fit(X_train, y_train, nb_epoch=20, batch_size=16)
#score = model.evaluate(X_test, y_test, batch_size=16)
In [ ]:
prep = []
for i in y_train:
prep.append(min(i,20))
In [ ]:
prep=np.array(prep)
mi,ma = prep.min(),prep.max()
fy = (prep-mi) / (ma-mi)
#my = fy.max()
#fy = fy/fy.max()
In [ ]:
model.fit(np.array(X_train), fy, batch_size=10, nb_epoch=10, validation_split=0.1)
In [ ]:
pred = model.predict(np.array(X_test)).flatten()*(ma-mi)+mi  # invert the min-max scaling
In [ ]:
err = (pred-y_test)**2
err.sum()/len(err)
In [ ]:
r = random.randrange(len(X_train))
print("(Train) Prediction %0.4f, True: %0.4f"%(model.predict(np.array([X_train[r]]))[0][0]*(ma-mi)+mi,y_train[r]))
r = random.randrange(len(X_test))
print("(Test) Prediction %0.4f, True: %0.4f"%(model.predict(np.array([X_test[r]]))[0][0]*(ma-mi)+mi,y_test[r]))
In [ ]:
def marshall_palmer(ref, minutes_past):
#print("Estimating rainfall from {0} observations".format(len(minutes_past)))
# how long is each observation valid?
valid_time = np.zeros_like(minutes_past)
valid_time[0] = minutes_past.iloc[0]
for n in xrange(1, len(minutes_past)):
valid_time[n] = minutes_past.iloc[n] - minutes_past.iloc[n-1]
valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
valid_time = valid_time / 60.0
# sum up rainrate * validtime
sum = 0
for dbz, hours in zip(ref, valid_time):
# See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
if np.isfinite(dbz):
mmperhr = pow(pow(10, dbz/10)/200, 0.625)
sum = sum + mmperhr * hours
return sum
def simplesum(ref, hour):
    return hour.sum()
# each unique Id is an hour of data at some gauge
def myfunc(hour):
#rowid = hour['Id'].iloc[0]
# sort hour by minutes_past
hour = hour.sort('minutes_past', ascending=True)
est = marshall_palmer(hour['Ref'], hour['minutes_past'])
return est
In [ ]:
info = raw.groupby(raw.index)
In [ ]:
estimates = raw.groupby(raw.index).apply(myfunc)
estimates.head(20)
In [ ]:
%%time
etreg.fit(X_train,y_train)
In [ ]:
%%time
et_score = cross_val_score(etreg, XX, y, cv=5)
print("Score: %s\tMean: %.03f"%(et_score,et_score.mean()))
In [ ]:
err = (etreg.predict(X_test)-y_test)**2
err.sum()/len(err)
In [ ]:
r = random.randrange(len(X_train))
print(r)
print(etreg.predict(X_train[r]))
print(y_train[r])
r = random.randrange(len(X_test))
print(r)
print(etreg.predict(X_test[r]))
print(y_test[r])
In [102]:
%%time
filename = "data/reduced_test_5000.csv"
#filename = "data/test.csv"
test = pd.read_csv(filename)
test = test.set_index('Id')
In [113]:
features_columns = list([u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th'])
def getX(raw):
selected_columns = list([ u'minutes_past',u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th'])
data = raw[selected_columns]
docX= []
for i in data.index.unique():
if isinstance(data.loc[i],pd.core.series.Series):
m = [data.loc[i].as_matrix()]
docX.append(m)
else:
m = data.loc[i].as_matrix()
docX.append(m)
X = np.array(docX)
return X
In [114]:
X=getX(test)
tmp = []
for i in X:
tmp.append(len(i))
tmp = np.array(tmp)
sns.countplot(tmp,order=range(tmp.min(),tmp.max()+1))
plt.title("Number of ID per number of observations\n(On test dataset)")
plt.plot()
In [105]:
testFull = test.dropna()
In [106]:
%%time
X=getX(testFull) # 1min
XX = [np.array(t).mean(0) for t in X] # 10s
In [107]:
%%time
XX = []
for t in X:
    arr = np.array(t)
    # Per-feature mean and summed first differences over the sequence
    tmp = np.append(np.nanmean(arr, 0), (arr[1:] - arr[:-1]).sum(0))
    # Per-feature 10th/50th/90th percentiles
    tmp = np.append(tmp, np.percentile(t, 10, axis=0))
    tmp = np.append(tmp, np.percentile(t, 50, axis=0))
    tmp = np.append(tmp, np.percentile(t, 90, axis=0))
    seq = t   # use a local name so the `test` DataFrame is not overwritten
    try:
        taa = seq[:, 0]            # minutes_past column
    except TypeError:
        taa = [seq[0][0]]          # single-observation Id
    # How long each observation is valid for, converted to hours
    valid_time = np.zeros_like(taa)
    valid_time[0] = taa[0]
    for n in xrange(1, len(taa)):
        valid_time[n] = taa[n] - taa[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    rain_sum = 0
    try:
        column_ref = seq[:, 2]     # Ref column
    except TypeError:
        column_ref = [seq[0][2]]
    for dbz, hours in zip(column_ref, valid_time):
        # Marshall-Palmer estimate, see: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            rain_sum = rain_sum + mmperhr * hours
    XX.append(np.append(np.array(rain_sum), tmp))
In [108]:
pd.DataFrame(gbr.predict(XX)).describe()
In [ ]:
predFull = zip(testFull.index.unique(),etreg.predict(XX))
In [ ]:
testNan = test.drop(test[features_columns].dropna(how='all').index)
In [ ]:
tmp = np.empty(len(testNan))
tmp.fill(0.445000) # 50th percentile of full Nan dataset
predNan = zip(testNan.index.unique(),tmp)
In [ ]:
testLeft = test.drop(testNan.index.unique()).drop(testFull.index.unique())
In [ ]:
tmp = np.empty(len(testLeft))
tmp.fill(1.27) # 50th percentile of the partially-missing training subset
predLeft = zip(testLeft.index.unique(),tmp)
In [ ]:
len(testFull.index.unique())
In [ ]:
len(testNan.index.unique())
In [ ]:
len(testLeft.index.unique())
In [ ]:
pred = predFull + predNan + predLeft
In [ ]:
pred.sort(key=lambda x: x[0], reverse=False)
In [ ]:
submission = pd.DataFrame(pred)
submission.columns = ["Id","Expected"]
submission.head()
In [ ]:
submission.to_csv("first_submit.csv",index=False)
In [ ]:
filename = "data/sample_solution.csv"
sol = pd.read_csv(filename)
In [ ]:
ss = np.array(sol)
In [ ]:
%%time
for a,b in predFull:
ss[a-1][1]=b
In [ ]:
ss
In [ ]:
sub = pd.DataFrame(ss)
sub.columns = ["Id","Expected"]
sub.Id = sub.Id.astype(int)
sub.head()
In [ ]:
sub.to_csv("submit_2.csv",index=False)