## FDMS TME3
Kaggle [How Much Did It Rain? II](https://www.kaggle.com/c/how-much-did-it-rain-ii)
Florian Toque & Paul Willot
In [1]:
# from __future__ import exam_success
from __future__ import absolute_import
from __future__ import print_function
%matplotlib inline
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd
import scipy.stats as stats
# Sk cheats
from sklearn.cross_validation import cross_val_score # cross val
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import Imputer # get rid of nan
Load a reduced extract of the training set (100,000 rows) to keep iteration fast.
In [2]:
%%time
filename = "data/reduced_train_100000.csv"
#filename = "data/reduced_test_100000.csv"
raw = pd.read_csv(filename)
raw = raw.set_index('Id')
#train = train.dropna()
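For reference, a reduced file like this can be built from the full Kaggle dump by keeping only the first rows. A minimal sketch, assuming the full file lives at data/train.csv (the path and row count are assumptions):
In [ ]:
# Hypothetical: build the reduced training file from the full Kaggle train.csv
# (source path and row count are assumptions, not part of the original notebook)
full = pd.read_csv("data/train.csv", nrows=100000)
full.to_csv("data/reduced_train_100000.csv", index=False)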
In [3]:
l = float(len(raw["minutes_past"]))
comp = []
for i in raw.columns:
    #print("%.03f, %s" % (1 - raw[i].isnull().sum() / l, i))
    comp.append([1 - raw[i].isnull().sum() / l, i])
comp.sort(key=lambda x: x[0], reverse=True)
comp
Out[3]:
In [4]:
raw = raw.dropna()
In [5]:
raw.columns
Out[5]:
In [6]:
i = raw[list([u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th'])].dropna(how='all').index
In [7]:
# note: `fullna` is defined below in cell In [16]; this cell relies on it having been run first
fullna.difference(i)
In [8]:
idWithNoNan = np.unique(i)
In [9]:
idWithNoNan[:10]
Out[9]:
In [10]:
AllId = np.unique(raw.index)
In [11]:
AllId[:10]
Out[11]:
In [12]:
tr = [burg not in idWithNoNan for burg in AllId]
In [13]:
tmp = []
for idx, i in enumerate(tr):
    if i:
        tmp.append(idx + 1)
In [14]:
tmp[-10:]
Out[14]:
In [15]:
tttt = AllId*tr
In [16]:
fullna = raw.drop(raw[list([u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th'])].dropna(how='all').index).index
In [17]:
fullna[:100]  # fullna is an Index, which has no .head(); slice instead
Check the labels for aberrant values
In [18]:
raw.head(20)
Out[18]:
In [19]:
raw["Expected"].describe()
Out[19]:
Get rid of NaN values for now
In [20]:
#train_clean = train[[not i for i in np.isnan(train["Ref_5x5_10th"])]]
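The rows with missing values were already dropped above; an alternative, used later in this notebook, is to impute them with per-column means. A minimal sketch with sklearn's Imputer (already imported above):
In [ ]:
# Sketch: mean-imputation as an alternative to dropping rows with NaNs
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
raw_filled = imp.fit_transform(raw.values)  # NaNs replaced by per-column means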
The competition forums indicate that readings above 1 m of rainfall (1000 mm in an hour) are almost certainly gauge errors, which is quite understandable. We filter those rows out.
In [21]:
raw = raw[raw['Expected'] < 1000]
In [22]:
raw['Expected'].describe()
Out[22]:
In [23]:
split = 0.2
train = raw.head(int(len(raw) * (1 - split)))  # first 80% of the rows
test = raw.tail(int(len(raw) * split))         # last 20% of the rows
In [24]:
#columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
# u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
# u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
# u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
# u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
# u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
# u'Kdp_5x5_50th', u'Kdp_5x5_90th']
#columns = [u'radardist_km', u'Ref', u'Ref_5x5_10th']
columns = [ u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
nb_features = len(columns)
data = raw[list(columns)]
data.head(5)
Out[24]:
In [25]:
data.describe()
Out[25]:
In [32]:
data.head(40)
Out[32]:
In [27]:
#%%time
#max_padding = 20
docX, docY = [], []
for i in train.index.unique():
    if isinstance(train.loc[i], pd.core.series.Series):
        m = [data.loc[i].as_matrix()]
        #pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')  # pre-padding
        docX.append(m)
        docY.append(float(train.loc[i]["Expected"]))
    else:
        m = data.loc[i].as_matrix()
        #pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
        docX.append(m)
        docY.append(float(train.loc[i][:1]["Expected"]))
        #docY.append(train.loc[i][:1]["Expected"].as_matrix)
X = np.array(docX)
y = np.array(docY)
In [33]:
train.index.unique()
Out[33]:
In [28]:
np.shape(X)
Out[28]:
In [31]:
X[2]
Out[31]:
In [76]:
tmp = []
for i in X:
    tmp.append(len(i))
tmp = np.array(tmp)
pd.DataFrame(tmp).describe()
Out[76]:
In [77]:
sns.countplot(tmp,order=range(tmp.min(),tmp.max()+1))
plt.title("Number of ID per number of observations")
plt.plot()
Out[77]:
In [78]:
dicPerObs = {}
for idx, i in enumerate(X):
    l = len(i)
    try:
        dicPerObs[str(l)].append(y[idx])
    except KeyError:
        dicPerObs[str(l)] = [y[idx]]
In [79]:
for i in dicPerObs.keys():
    t = np.array(dicPerObs[i])
    print(t.mean())
In [80]:
dicPerObs = {}
for idx, i in enumerate(X):
    l = len(i)
    try:
        dicPerObs[str(l)].append(np.count_nonzero(~np.isnan(i)) / float(i.size))
    except KeyError:
        dicPerObs[str(l)] = [np.count_nonzero(~np.isnan(i)) / float(i.size)]
In [81]:
# percentage of data filled per number of observations (higher = less sparse)
for i in dicPerObs.keys():
    t = np.array(dicPerObs[i])
    print(t.mean())
In [82]:
sns.countplot(tmp, order=range(tmp.min(), tmp.max() + 1), zorder=1)
sns.countplot(tmp2, order=range(tmp.min(), tmp.max() + 1), zorder=10)  # note: tmp2 is not defined in this notebook excerpt
plt.title("Number of IDs per number of observations")
plt.plot()
In [83]:
XX = [np.array(t).mean(0) for t in X]
In [84]:
#XX[0]
In [85]:
np.shape(XX)
Out[85]:
In [86]:
global_means = np.nanmean(data,0)
##global_means = data.mean(0).values
t = []
for i in XX:
    t.append(np.count_nonzero(~np.isnan(i)) / float(i.size))
pd.DataFrame(np.array(t)).describe()
In [87]:
XX = []
for i in X:
    nm = np.nanmean(i, 0)
    for idx, j in enumerate(nm):
        if np.isnan(j):
            nm[idx] = global_means[idx]
    XX.append(np.array(nm))
In [88]:
# note: this recomputes plain per-ID means and overwrites the imputed XX from the previous cell
XX = [np.array(t).mean(0) for t in X]
In [89]:
split = 0.2
ps = int(len(XX) * (1-split))
X_train = XX[:ps]
y_train = y[:ps]
X_test = XX[ps:]
y_test = y[ps:]
In [90]:
etreg = ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
In [91]:
%%time
etreg.fit(X_train,y_train)
Out[91]:
In [92]:
%%time
et_score = cross_val_score(etreg, XX, y, cv=5)
print("Features: %s\nScore: %s\tMean: %.03f"%(columns, et_score,et_score.mean()))
In [93]:
err = (etreg.predict(X_test)-y_test)**2
err.sum()/len(err)
Out[93]:
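The score above is an average squared error on the held-out split; the competition leaderboard is scored on mean absolute error, so it can be useful to track that as well. A quick check on the same split:
In [ ]:
# Mean absolute error on the held-out split (the competition's scoring metric is MAE)
mae = np.abs(etreg.predict(X_test) - y_test).mean()
print(mae)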
In [ ]:
In [94]:
def marshall_palmer(ref, minutes_past):
    #print("Estimating rainfall from {0} observations".format(len(minutes_past)))
    # how long is each observation valid?
    valid_time = np.zeros_like(minutes_past)
    valid_time[0] = minutes_past.iloc[0]
    for n in xrange(1, len(minutes_past)):
        valid_time[n] = minutes_past.iloc[n] - minutes_past.iloc[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    # sum up rainrate * validtime
    sum = 0
    for dbz, hours in zip(ref, valid_time):
        # See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            sum = sum + mmperhr * hours
    return sum

def simplesum(ref, hour):
    hour.sum()

# each unique Id is an hour of data at some gauge
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour['Ref'], hour['minutes_past'])
    return est
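The conversion above is the Marshall-Palmer Z-R relation: Z = 200 · R^1.6 with Z = 10^(dBZ/10), so the rain rate is R = (10^(dBZ/10) / 200)^0.625 mm/hr. A quick sanity check of the formula as implemented:
In [ ]:
# Sanity check of the Z-R conversion used in marshall_palmer:
# 30 dBZ -> Z = 10**3 = 1000 -> R = (1000/200)**0.625 ≈ 2.73 mm/hr
dbz = 30.0
print(pow(pow(10, dbz / 10) / 200, 0.625))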
In [95]:
estimates = raw.groupby(raw.index).apply(myfunc)
estimates.head(20)
Out[95]:
In [96]:
err = (estimates-(np.hstack((y_train,y_test))))**2
err.sum()/len(err)
Out[96]:
Memento (mauri)
In [66]:
etreg = ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
In [67]:
"""
columns = train_clean.columns
columns = ["minutes_past","radardist_km","Ref","Ref_5x5_10th", "Ref_5x5_50th"]
columns = [u'Id', u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th', u'Expected']
"""
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
labels = train["Expected"].values
features = train[list(columns)].values
In [68]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
In [69]:
len(features_trans)
Out[69]:
In [70]:
split = 0.2
ps = int(len(features_trans) * split)
ftrain = features_trans[:ps]  # note: trains on only the first 20% of rows (unlike the 80/20 split above)
ltrain = labels[:ps]
ftest = features_trans[ps:]
ltest = labels[ps:]
In [71]:
%%time
etreg.fit(ftrain,ltrain)
Out[71]:
In [72]:
def scorer(estimator, X, y):
    return (estimator.predict(X[0]) - y) ** 2
In [57]:
%%time
et_score = cross_val_score(etreg, features_trans, labels, cv=3)
print("Features: %s\nScore: %s\tMean: %.03f"%(columns, et_score,et_score.mean()))
In [73]:
r = random.randrange(len(ltrain))
print(r)
print(etreg.predict(ftrain[r]))
print(ltrain[r])
In [94]:
r = random.randrange(len(ltest))
print(r)
print(etreg.predict(ftest[r]))
print(ltest[r])
In [95]:
err = (etreg.predict(ftest)-ltest)**2
In [96]:
err.sum()/len(err)
Out[96]:
Submit
In [154]:
filename = "data/reduced_test_5000.csv"
test = pd.read_csv(filename)
In [164]:
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
features = test[list(columns)].values
In [165]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
In [166]:
fall = test[test.columns].values
In [177]:
fall[20]
Out[177]:
In [173]:
features_trans[0]
Out[173]:
In [188]:
i = 1
pred = 0
while fall[i][0] == 1:
    #print(fall[i])
    pred += etreg.predict(features_trans[i])[0]
    #print(etreg.predict(features_trans[i])[0])
    i += 1
print(i)
In [192]:
fall[-1][0]
Out[192]:
In [202]:
%%time
res = []
i = 0
while i < len(fall) and i < 10000:
    pred = 0
    lenn = 0
    curr = fall[i][0]
    while i < len(fall) and fall[i][0] == curr:
        #print(fall[i])
        pred += etreg.predict(features_trans[i])[0]
        #print(etreg.predict(features_trans[i])[0])
        i += 1
        lenn += 1
    res.append((curr, pred / lenn))
    #i += 1
    #print(i)
In [199]:
len(res)
Out[199]:
In [203]:
res[:10]
Out[203]:
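To turn res (a list of (Id, prediction) pairs) into a submission file, something along these lines should do, assuming the usual "Id,Expected" submission format:
In [ ]:
# Sketch: write the per-Id predictions to a CSV in the expected submission format
sub = pd.DataFrame(res, columns=["Id", "Expected"])
sub["Id"] = sub["Id"].astype(int)
sub.to_csv("submission.csv", index=False)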
In [97]:
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    #est = (hour['Id'], random.random())
    est = random.random()
    return est
In [98]:
def marshall_palmer(ref, minutes_past):
    #print("Estimating rainfall from {0} observations".format(len(minutes_past)))
    # how long is each observation valid?
    valid_time = np.zeros_like(minutes_past)
    valid_time[0] = minutes_past.iloc[0]
    for n in xrange(1, len(minutes_past)):
        valid_time[n] = minutes_past.iloc[n] - minutes_past.iloc[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    # sum up rainrate * validtime
    sum = 0
    for dbz, hours in zip(ref, valid_time):
        # See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            sum = sum + mmperhr * hours
    return sum

def simplesum(ref, hour):
    hour.sum()

# each unique Id is an hour of data at some gauge
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour['Ref'], hour['minutes_past'])
    return est
In [122]:
estimates = test.groupby(test.index).apply(myfunc)  # group the test set by its own index (train.index has a different length)
estimates.head(20)
In [123]:
estimates = train.groupby(train.index).apply(myfunc)
estimates.head(20)
Out[123]:
In [100]:
train["Expected"].head(20)
Out[100]:
In [102]:
print(features_trans[0])
print(etreg.predict(features_trans[0]))
In [16]:
def marshall_palmer(data):
    res = []
    for n in data:
        res.append(etreg.predict(n)[0])
    return np.array(res).mean()

def simplesum(ref, hour):
    hour.sum()

def myfunc(hour):
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour[train.columns])
    return est
In [302]:
# note: train_clean only appears in a commented-out line above (In [20]); it must be defined for this cell to run
estimates = train_clean.groupby(train_clean.index).apply(myfunc)
estimates.head(20)
In [ ]:
In [ ]:
RNN
In [134]:
import pandas as pd
from random import random
flow = (list(range(1,10,1)) + list(range(10,1,-1)))*1000
pdata = pd.DataFrame({"a":flow, "b":flow})
pdata.b = pdata.b.shift(9)
data = pdata.iloc[10:] * random() # some noise
In [135]:
#columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
# u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
# u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
# u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
# u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
# u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
# u'Kdp_5x5_50th', u'Kdp_5x5_90th']
columns = [u'radardist_km', u'Ref', u'Ref_5x5_10th']
nb_features = len(columns)
data = train[list(columns)]
data.head(10)
Out[135]:
In [136]:
data.iloc[0].as_matrix()
Out[136]:
In [137]:
train.head(5)
Out[137]:
In [138]:
train.loc[11]
Out[138]:
In [139]:
train.loc[11][:1]["Expected"].as_matrix
Out[139]:
In [140]:
#train.index.unique()
In [141]:
def _load_data(data, n_prev=100):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in range(len(data) - n_prev):
        docX.append(data.iloc[i:i+n_prev].as_matrix())
        docY.append(data.iloc[i+n_prev].as_matrix())
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = round(len(df) * (1 - test_size))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)

(X_train, y_train), (X_test, y_test) = train_test_split(data)
In [142]:
np.shape(X_train)
Out[142]:
In [144]:
t = np.array([2,1])
t.shape = (1,2)
t.tolist()[0]
Out[144]:
In [145]:
np.shape(t)
Out[145]:
In [146]:
X_train[:2,:2]
Out[146]:
In [44]:
train.index.unique()
Out[44]:
In [148]:
max_padding = 20
In [149]:
%%time
docX, docY = [], []
for i in train.index.unique():
    if isinstance(train.loc[i], pd.core.series.Series):
        m = [data.loc[i].as_matrix()]
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')  # pre-padding
        docX.append(pad)
        docY.append(float(train.loc[i]["Expected"]))
    else:
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
        docX.append(pad)
        docY.append(float(train.loc[i][:1]["Expected"]))
        #docY.append(train.loc[i][:1]["Expected"].as_matrix)
XX = np.array(docX)
yy = np.array(docY)
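The LSTM below expects fixed-length input, so each ID's observations are pre-padded with rows of zeros up to max_padding. A small illustration of that padding step:
In [ ]:
# Toy example of the pre-padding used above: a 2-row sequence becomes a (max_padding, 3) matrix
toy = np.array([[1., 2., 3.],
                [4., 5., 6.]])
padded = np.pad(toy, ((max_padding - len(toy), 0), (0, 0)), 'constant')
print(padded.shape)  # (20, 3)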
In [151]:
np.shape(XX)
Out[151]:
In [154]:
XX[0].mean()
Out[154]:
In [36]:
#from keras.preprocessing import sequence
#sequence.pad_sequences(X_train, maxlen=maxlen)
In [37]:
def _load_data(data):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in data.index.unique():
        #np.pad(tmp, ((0, max_padding - len(tmp)), (0, 0)), 'constant')
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((0, max_padding - len(m)), (0, 0)), 'constant')
        docX.append(pad)
        if isinstance(train.loc[i], pd.core.series.Series):
            docY.append(float(train.loc[i]["Expected"]))
        else:
            docY.append(float(train.loc[i][:1]["Expected"]))
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = round(len(df) * (1 - test_size))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)

(X_train, y_train), (X_test, y_test) = train_test_split(train)
In [38]:
len(X_train[0])
Out[38]:
In [39]:
train.head()
Out[39]:
In [40]:
X_train[0][:10]
Out[40]:
In [41]:
yt = []
for i in y_train:
    yt.append([i[0]])
In [42]:
yt[0]
Out[42]:
In [439]:
X_train.shape
Out[439]:
In [450]:
len(fea[0])
Out[450]:
In [449]:
len(X_train[0][0])
Out[449]:
In [442]:
f = np.array(fea)
In [443]:
f.shape  # shape is an attribute, not a method
In [428]:
#(X_train, y_train), (X_test, y_test) = train_test_split(data) # retrieve data
# and now train the model
# batch_size should be appropriate to your memory size
# number of epochs should be higher for real world problems
model.fit(X_train, yt, batch_size=450, nb_epoch=2, validation_split=0.05)
Out[428]:
In [37]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
In [38]:
%%time
input_dim = nb_features
out_dim = 1
hidden_dim = 200
model = Sequential()
#Embedding(input_dim, hidden_dim, mask_zero=True)
#model.add(LSTM(hidden_dim, hidden_dim, return_sequences=False))
model.add(LSTM(input_dim, hidden_dim, return_sequences=False))  # Keras 0.x-style (input_dim, output_dim) layer constructor
model.add(Dropout(0.5))
model.add(Dense(hidden_dim, out_dim))
model.add(Activation("linear"))
model.compile(loss="mean_squared_error", optimizer="rmsprop")
In [39]:
model.fit(XX, yy, batch_size=10, nb_epoch=10, validation_split=0.1)
Out[39]:
In [131]:
test = random.randint(0,len(XX))
print(model.predict(XX[test:test+1])[0][0])
print(yy[test])