In [1]:
# from __future__ import exam_success
from __future__ import absolute_import
from __future__ import print_function
%matplotlib inline
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd
import scipy.stats as stats
# Sk cheats
from sklearn.cross_validation import cross_val_score # cross val
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import Imputer # get rid of nan
Load the reduced training set (100,000 rows).
In [2]:
%%time
#filename = "data/reduced_train_100000.csv"
#filename = "data/reduced_test_5000.csv"
filename = "data/reduced_train_100000.csv"
raw = pd.read_csv(filename)
raw = raw.set_index('Id')
#train = train.dropna()
In [3]:
# Fraction of non-null values per column, sorted from most to least complete
l = float(len(raw["minutes_past"]))
comp = []
for i in raw.columns:
    #print("%.03f, %s" % (1 - raw[i].isnull().sum() / l, i))
    comp.append([1 - raw[i].isnull().sum() / l, i])
comp.sort(key=lambda x: x[0], reverse=True)
comp
Out[3]:
In [4]:
raw = raw.dropna()
Check for aberrant values in the labels
In [5]:
raw.head()
Out[5]:
In [6]:
raw["Expected"].describe()
Out[6]:
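Beyond describe(), a quick log-scale histogram of the labels makes aberrant values easy to spot. A minimal sketch using the matplotlib import above (the bin count is arbitrary):
In [ ]:
# Distribution of the gauge readings; extreme values stand out on a log scale
plt.hist(raw["Expected"].values, bins=100, log=True)
plt.xlabel("Expected (mm)")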
Get rid of NaN values for now
In [7]:
#train_clean = train[[not i for i in np.isnan(train["Ref_5x5_10th"])]]
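The same filter can be written more directly with pandas' notnull; a sketch equivalent to the commented list comprehension (the dropna() above already removes these rows, so this is only illustrative):
In [ ]:
# Keep only rows where Ref_5x5_10th is present
train_clean = train[train["Ref_5x5_10th"].notnull()]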
The forums indicate that rainfall higher than 1 m (1000 mm) is probably an error, which is quite understandable. We filter those rows out.
In [8]:
raw = raw[raw['Expected'] < 1000]
In [9]:
raw['Expected'].describe()
Out[9]:
In [10]:
split = 0.2
train = raw.head(int(len(raw) * (1 - split)))  # first 80%
test = raw.tail(int(len(raw) * split))         # last 20%
In [11]:
#columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
# u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
# u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
# u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
# u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
# u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
# u'Kdp_5x5_50th', u'Kdp_5x5_90th']
#columns = [u'radardist_km', u'Ref', u'Ref_5x5_10th']
columns = [ u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
nb_features = len(columns)
data = raw[list(columns)]
data.head(5)
Out[11]:
In [12]:
data.head(20)
Out[12]:
In [13]:
%%time
#max_padding = 20
docX, docY = [], []
for i in raw.index.unique():
    if isinstance(raw.loc[i], pd.core.series.Series):
        # single observation for this Id; use the feature columns only, as in the multi-row branch
        m = [data.loc[i].as_matrix()]
        #pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')  # pre-padding
        docX.append(m)
        docY.append(float(raw.loc[i]["Expected"]))
    else:
        m = data.loc[i].as_matrix()
        #pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
        docX.append(m)
        docY.append(float(raw.loc[i][:1]["Expected"]))
        #docY.append(train.loc[i][:1]["Expected"].as_matrix)
X = np.array(docX)
y = np.array(docY)
In [20]:
np.shape(X)
Out[20]:
In [17]:
XX = [np.array(t).mean(0) for t in X]
In [19]:
np.shape(XX)
Out[19]:
In [243]:
XX[0]
Out[243]:
In [213]:
global_means = np.nanmean(data,0)
#global_means = data.mean(0).values
In [214]:
a = [
    [1, 2, np.nan],
    [3, 4, np.nan],
    [2, np.nan, np.nan],
]
In [215]:
n = np.nanmean(a,0)
In [216]:
[np.isnan(i) for i in n]
Out[216]:
In [217]:
n
Out[217]:
In [218]:
np.count_nonzero(~np.isnan(X[0])) / float(X[0].size)
Out[218]:
In [219]:
# Fraction of non-NaN entries in each sample
t = []
for i in X:
    t.append(np.count_nonzero(~np.isnan(i)) / float(i.size))
In [220]:
pd.DataFrame(np.array(t)).describe()
Out[220]:
In [231]:
# Per-sample column means; fall back to the global column mean where a column is all NaN
XX = []
for i in X:
    nm = np.nanmean(i, 0)
    for idx, j in enumerate(nm):
        if np.isnan(j):
            nm[idx] = global_means[idx]
    XX.append(np.array(nm))
In [226]:
XX = [np.array(t).mean(0) for t in X]
In [227]:
split = 0.2
ps = int(len(XX) * (1-split))
X_train = XX[:ps]
y_train = y[:ps]
X_test = XX[ps:]
y_test = y[ps:]
In [228]:
etreg = ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
In [229]:
y_train[0]
Out[229]:
In [238]:
#%%time
#etreg = etreg.fit(X_train,y_train)
etreg = etreg.fit(XX,y)
In [74]:
%%time
et_score = cross_val_score(etreg, XX, y, cv=5)
print("Features: %s\nScore: %s\tMean: %.03f"%(columns, et_score,et_score.mean()))
In [155]:
pred = etreg.predict(X_test)
In [156]:
#pred = len(XX)
# Fall back to a small constant when fewer than 70% of the values are present
for idx, i in enumerate(X_test):
    if (np.count_nonzero(~np.isnan(i)) / float(i.size)) < 0.7:
        pred[idx] = 0.2
In [157]:
pred[1]
Out[157]:
In [158]:
err = (pred-y_test)**2
err.sum()/len(err)
Out[158]:
In [129]:
r = random.randrange(len(pred))
print(r)
print(pred[r])
print(y_test[r])
In [96]:
def marshall_palmer(ref, minutes_past):
    #print("Estimating rainfall from {0} observations".format(len(minutes_past)))
    # how long is each observation valid?
    valid_time = np.zeros_like(minutes_past)
    valid_time[0] = minutes_past.iloc[0]
    for n in xrange(1, len(minutes_past)):
        valid_time[n] = minutes_past.iloc[n] - minutes_past.iloc[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    # sum up rainrate * validtime
    total = 0
    for dbz, hours in zip(ref, valid_time):
        # Marshall-Palmer Z-R relation: Z = 200 * R^1.6 with Z = 10^(dBZ/10)
        # See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            total = total + mmperhr * hours
    return total

def simplesum(ref, hour):
    return hour.sum()

# each unique Id is an hour of data at some gauge
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour['Ref'], hour['minutes_past'])
    return est
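As a sanity check on the Z-R relation used above (Z = 200 * R^1.6 with Z = 10^(dBZ/10)), a single 30 dBZ observation corresponds to roughly 2.7 mm/h:
In [ ]:
# Marshall-Palmer: R = (10^(dBZ/10) / 200) ** (1/1.6)
dbz = 30.0
print(pow(pow(10, dbz / 10) / 200, 0.625))  # ~2.73 mm/h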
In [99]:
estimates = raw.groupby(raw.index).apply(myfunc)
estimates.head(20)
Out[99]:
In [127]:
err = (estimates-(np.hstack((y_train,y_test))))**2
err.sum()/len(err)
Out[127]:
Memento (mauri)
In [66]:
etreg = ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
In [67]:
"""
columns = train_clean.columns
columns = ["minutes_past","radardist_km","Ref","Ref_5x5_10th", "Ref_5x5_50th"]
columns = [u'Id', u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th', u'Expected']
"""
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
labels = train["Expected"].values
features = train[list(columns)].values
In [68]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
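Imputer replaces each NaN with the column mean learned during fit. A tiny sketch on a toy matrix (hypothetical values):
In [ ]:
# Column means are 2.0 and 4.0, so the NaN in the second column becomes 4.0
toy = np.array([[1.0, np.nan], [3.0, 4.0]])
print(Imputer(missing_values='NaN', strategy='mean', axis=0).fit_transform(toy))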
In [69]:
len(features_trans)
Out[69]:
In [70]:
split = 0.2
ps = int(len(features_trans) * split)
ftrain = features_trans[:ps]
ltrain = labels[:ps]
ftest = features_trans[ps:]
ltest = labels[ps:]
In [71]:
%%time
etreg.fit(ftrain,ltrain)
Out[71]:
In [72]:
def scorer(estimator, X, y):
    return (estimator.predict(X[0]) - y) ** 2
In [57]:
%%time
et_score = cross_val_score(etreg, features_trans, labels, cv=3)
print("Features: %s\nScore: %s\tMean: %.03f"%(columns, et_score,et_score.mean()))
In [73]:
r = random.randrange(len(ltrain))
print(r)
print(etreg.predict(ftrain[r]))
print(ltrain[r])
In [94]:
r = random.randrange(len(ltest))
print(r)
print(etreg.predict(ftest[r]))
print(ltest[r])
In [95]:
err = (etreg.predict(ftest)-ltest)**2
In [96]:
err.sum()/len(err)
Out[96]:
Submit
In [154]:
filename = "data/reduced_test_5000.csv"
test = pd.read_csv(filename)
In [164]:
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
features = test[list(columns)].values
In [165]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
In [166]:
fall = test[test.columns].values
In [177]:
fall[20]
Out[177]:
In [173]:
features_trans[0]
Out[173]:
In [188]:
# Accumulate predictions over the consecutive rows whose Id is 1
i = 1
pred = 0
while fall[i][0] == 1:
    #print(fall[i])
    pred += etreg.predict(features_trans[i])[0]
    #print(etreg.predict(features_trans[i])[0])
    i += 1
print(i)
In [192]:
fall[-1][0]
Out[192]:
In [202]:
%%time
# Average the per-row predictions for each Id (first 10000 rows only)
res = []
i = 0
while i < len(fall) and i < 10000:
    pred = 0
    lenn = 0
    curr = fall[i][0]
    while i < len(fall) and fall[i][0] == curr:
        #print(fall[i])
        pred += etreg.predict(features_trans[i])[0]
        #print(etreg.predict(features_trans[i])[0])
        i += 1
        lenn += 1
    res.append((curr, pred / lenn))
    #print(i)
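The same per-Id averaging can be done in one vectorized pass over all rows (the loop above stops at 10,000). A sketch, assuming predicting the whole features_trans array at once fits in memory and that test still carries its Id column:
In [ ]:
# Predict every row, then average the predictions per Id
preds = pd.Series(etreg.predict(features_trans), index=test['Id'])
res_vec = preds.groupby(level=0).mean()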
In [199]:
len(res)
Out[199]:
In [203]:
res[:10]
Out[203]:
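To turn res into an actual submission, the (Id, Expected) pairs can be written to CSV; a sketch, with 'submission.csv' as a placeholder filename:
In [ ]:
# Kaggle expects two columns: Id and Expected
sub = pd.DataFrame(res, columns=['Id', 'Expected'])
sub['Id'] = sub['Id'].astype(int)
sub.to_csv('submission.csv', index=False)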
In [97]:
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    #est = (hour['Id'], random.random())
    est = random.random()
    return est
In [98]:
def marshall_palmer(ref, minutes_past):
    #print("Estimating rainfall from {0} observations".format(len(minutes_past)))
    # how long is each observation valid?
    valid_time = np.zeros_like(minutes_past)
    valid_time[0] = minutes_past.iloc[0]
    for n in xrange(1, len(minutes_past)):
        valid_time[n] = minutes_past.iloc[n] - minutes_past.iloc[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    # sum up rainrate * validtime
    total = 0
    for dbz, hours in zip(ref, valid_time):
        # Marshall-Palmer Z-R relation: Z = 200 * R^1.6 with Z = 10^(dBZ/10)
        # See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            total = total + mmperhr * hours
    return total

def simplesum(ref, hour):
    return hour.sum()

# each unique Id is an hour of data at some gauge
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour['Ref'], hour['minutes_past'])
    return est
In [122]:
estimates = test.groupby(test.index).apply(myfunc)
estimates.head(20)
In [123]:
estimates = train.groupby(train.index).apply(myfunc)
estimates.head(20)
Out[123]:
In [100]:
train["Expected"].head(20)
Out[100]:
In [102]:
print(features_trans[0])
print(etreg.predict(features_trans[0]))
In [16]:
def marshall_palmer(data):
    # Predict each observation with the trained forest and average over the hour
    res = []
    for n in data:
        res.append(etreg.predict(n)[0])
    return np.array(res).mean()

def simplesum(ref, hour):
    return hour.sum()

def myfunc(hour):
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour[train.columns])
    return est
In [302]:
estimates = train_clean.groupby(train_clean.index).apply(myfunc)
estimates.head(20)
RNN: pad each Id's sequence of radar observations to a fixed length and train an LSTM regressor on it.
In [134]:
import pandas as pd
from random import random
flow = (list(range(1,10,1)) + list(range(10,1,-1)))*1000
pdata = pd.DataFrame({"a":flow, "b":flow})
pdata.b = pdata.b.shift(9)
data = pdata.iloc[10:] * random() # some noise
In [135]:
#columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
# u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
# u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
# u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
# u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
# u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
# u'Kdp_5x5_50th', u'Kdp_5x5_90th']
columns = [u'radardist_km', u'Ref', u'Ref_5x5_10th']
nb_features = len(columns)
data = train[list(columns)]
data.head(10)
Out[135]:
In [136]:
data.iloc[0].as_matrix()
Out[136]:
In [137]:
train.head(5)
Out[137]:
In [138]:
train.loc[11]
Out[138]:
In [139]:
train.loc[11][:1]["Expected"].as_matrix
Out[139]:
In [140]:
#train.index.unique()
In [141]:
def _load_data(data, n_prev=100):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in range(len(data) - n_prev):
        docX.append(data.iloc[i:i+n_prev].as_matrix())
        docY.append(data.iloc[i+n_prev].as_matrix())
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = int(round(len(df) * (1 - test_size)))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)
(X_train, y_train), (X_test, y_test) = train_test_split(data)
In [142]:
np.shape(X_train)
Out[142]:
In [144]:
t = np.array([2,1])
t.shape = (1,2)
t.tolist()[0]
Out[144]:
In [145]:
np.shape(t)
Out[145]:
In [146]:
X_train[:2,:2]
Out[146]:
In [44]:
train.index.unique()
Out[44]:
In [148]:
max_padding = 20
In [149]:
%%time
docX, docY = [], []
for i in train.index.unique():
    if isinstance(train.loc[i], pd.core.series.Series):
        m = [data.loc[i].as_matrix()]
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')  # pre-padding
        docX.append(pad)
        docY.append(float(train.loc[i]["Expected"]))
    else:
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
        docX.append(pad)
        docY.append(float(train.loc[i][:1]["Expected"]))
        #docY.append(train.loc[i][:1]["Expected"].as_matrix)
XX = np.array(docX)
yy = np.array(docY)
In [151]:
np.shape(XX)
Out[151]:
In [154]:
XX[0].mean()
Out[154]:
In [36]:
#from keras.preprocessing import sequence
#sequence.pad_sequences(X_train, maxlen=maxlen)
In [37]:
def _load_data(data):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in data.index.unique():
        #np.pad(tmp, ((0, max_padding - len(tmp)), (0, 0)), 'constant')
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((0, max_padding - len(m)), (0, 0)), 'constant')
        docX.append(pad)
        if isinstance(train.loc[i], pd.core.series.Series):
            docY.append(float(train.loc[i]["Expected"]))
        else:
            docY.append(float(train.loc[i][:1]["Expected"]))
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = int(round(len(df) * (1 - test_size)))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)
(X_train, y_train), (X_test, y_test) = train_test_split(train)
In [38]:
len(X_train[0])
Out[38]:
In [39]:
train.head()
Out[39]:
In [40]:
X_train[0][:10]
Out[40]:
In [41]:
yt = []
for i in y_train:
    yt.append([i[0]])
In [42]:
yt[0]
Out[42]:
In [439]:
X_train.shape
Out[439]:
In [450]:
len(fea[0])
Out[450]:
In [449]:
len(X_train[0][0])
Out[449]:
In [442]:
f = np.array(fea)
In [443]:
f.shape
In [428]:
#(X_train, y_train), (X_test, y_test) = train_test_split(data) # retrieve data
# and now train the model
# batch_size should be appropriate to your memory size
# number of epochs should be higher for real world problems
model.fit(X_train, yt, batch_size=450, nb_epoch=2, validation_split=0.05)
Out[428]:
In [37]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
In [38]:
%%time
input_dim = nb_features
out_dim = 1
hidden_dim = 200
model = Sequential()
#Embedding(input_dim, hidden_dim, mask_zero=True)
#model.add(LSTM(hidden_dim, hidden_dim, return_sequences=False))
model.add(LSTM(input_dim, hidden_dim, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(hidden_dim, out_dim))
model.add(Activation("linear"))
model.compile(loss="mean_squared_error", optimizer="rmsprop")
In [39]:
model.fit(XX, yy, batch_size=10, nb_epoch=10, validation_split=0.1)
Out[39]:
In [131]:
r = random.randrange(len(XX))
print(model.predict(XX[r:r+1])[0][0])
print(yy[r])