In [1]:
# from __future__ import exam_success
from __future__ import absolute_import
from __future__ import print_function
%matplotlib inline
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd
# Sk cheats
from sklearn.cross_validation import cross_val_score # cross val
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import Imputer # get rid of nan
Load the training data, reduced to 10,000 rows
In [2]:
filename = "data/reduced_train_10000.csv"
train = pd.read_csv(filename)
train = train.set_index('Id')
train = train.dropna()
In [3]:
train.head()
Out[3]:
In [4]:
train["Expected"].describe()
Out[4]:
Get rid of NaN values for now (the dropna() call above already handles this, so the column mask below stays commented out)
In [5]:
#train_clean = train[[not i for i in np.isnan(train["Ref_5x5_10th"])]]
The forums indicate that rainfall above 1 m (1000 mm) is almost certainly a measurement error, which is quite understandable. We filter those rows out.
In [6]:
train = train[train['Expected'] < 1000]
In [7]:
train['Expected'].describe()
Out[7]:
Memento (mauri)
In [8]:
etreg = ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
In [9]:
"""
columns = train_clean.columns
columns = ["minutes_past","radardist_km","Ref","Ref_5x5_10th", "Ref_5x5_50th"]
columns = [u'Id', u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th', u'Expected']
"""
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
labels = train["Expected"].values
features = train[list(columns)].values
In [10]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
In [10]:
ftrain = features_trans[:3000]
ltrain = labels[:3000]
ftest = features_trans[3000:]
ltest = labels[3000:]
In [139]:
%%time
etreg.fit(ftrain,ltrain)
Out[139]:
In [89]:
def scorer(estimator, X, y):
    # custom callable scorer with the (estimator, X, y) signature: mean squared error
    return ((estimator.predict(X) - y) ** 2).mean()
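The scorer above is never actually handed to cross_val_score below, so the next cell falls back on the regressor's default R² score. A minimal sketch of wiring it in, assuming the callable scorer(estimator, X, y) interface (et_mse is just a throwaway name):
In [ ]:
et_mse = cross_val_score(etreg, features_trans, labels, cv=5, scoring=scorer)
print("Per-fold MSE: %s\tMean: %.03f" % (et_mse, et_mse.mean()))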
In [140]:
%%time
et_score = cross_val_score(etreg, features_trans, labels, cv=5)
print("Features: %s\nScore: %s\tMean: %.03f"%(columns, et_score,et_score.mean()))
In [141]:
r = random.randrange(len(ltrain))
print(r)
print(etreg.predict(ftrain[r]))
print(ltrain[r])
In [153]:
r = random.randrange(len(ltest))
print(r)
print(etreg.predict(ftest[r]))
print(ltest[r])
In [143]:
err = (etreg.predict(ftest)-ltest)**2
In [144]:
err.sum()/len(err)
Out[144]:
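As far as I recall, the leaderboard for this competition is scored on mean absolute error rather than squared error, so it is worth checking that metric on the same held-out slice (the metric is an assumption, not something stated in this notebook):
In [ ]:
# mean absolute error on the held-out rows
print(np.abs(etreg.predict(ftest) - ltest).mean())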
Submit: predict on the reduced test set and average the per-record predictions for each Id
In [154]:
filename = "data/reduced_test_5000.csv"
test = pd.read_csv(filename)
In [164]:
columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
u'Kdp_5x5_50th', u'Kdp_5x5_90th']
features = test[list(columns)].values
In [165]:
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features)
features_trans = imp.transform(features)
In [166]:
fall = test[test.columns].values
In [177]:
fall[20]
Out[177]:
In [173]:
features_trans[0]
Out[173]:
In [188]:
i = 1
pred = 0
while fall[i][0] == 1:
    #print(fall[i])
    pred += etreg.predict(features_trans[i])[0]
    #print(etreg.predict(features_trans[i])[0])
    i += 1
print(i)
In [192]:
fall[-1][0]
Out[192]:
In [202]:
%%time
res = []
i = 0
# walk the test rows, averaging the per-record predictions over each hour (Id)
while i < len(fall) and i < 10000:
    pred = 0
    lenn = 0
    curr = fall[i][0]
    while i < len(fall) and fall[i][0] == curr:
        #print(fall[i])
        pred += etreg.predict(features_trans[i])[0]
        #print(etreg.predict(features_trans[i])[0])
        i += 1
        lenn += 1
    res.append((curr, pred/lenn))
    #i+=1
    #print(i)
In [199]:
len(res)
Out[199]:
In [203]:
res[:10]
Out[203]:
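res now holds one (Id, averaged prediction) pair per hour. A minimal sketch of turning it into a submission file, assuming the usual two-column Id,Expected format (the header and output path are assumptions and should be checked against the sample submission file):
In [ ]:
# illustrative only: column order and output path are assumptions
sub = pd.DataFrame(res, columns=["Id", "Expected"])
sub["Id"] = sub["Id"].astype(int)
sub.to_csv("data/submission_etreg.csv", index=False)
sub.head()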
In [11]:
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    #est = (hour['Id'],random.random())
    est = random.random()
    return est
In [12]:
def marshall_palmer(ref, minutes_past):
    #print("Estimating rainfall from {0} observations".format(len(minutes_past)))
    # how long is each observation valid?
    valid_time = np.zeros_like(minutes_past)
    valid_time[0] = minutes_past.iloc[0]
    for n in xrange(1, len(minutes_past)):
        valid_time[n] = minutes_past.iloc[n] - minutes_past.iloc[n-1]
    valid_time[-1] = valid_time[-1] + 60 - np.sum(valid_time)
    valid_time = valid_time / 60.0
    # sum up rainrate * validtime
    sum = 0
    for dbz, hours in zip(ref, valid_time):
        # See: https://en.wikipedia.org/wiki/DBZ_(meteorology)
        if np.isfinite(dbz):
            mmperhr = pow(pow(10, dbz/10)/200, 0.625)
            sum = sum + mmperhr * hours
    return sum

def simplesum(ref, hour):
    hour.sum()

# each unique Id is an hour of data at some gauge
def myfunc(hour):
    #rowid = hour['Id'].iloc[0]
    # sort hour by minutes_past
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour['Ref'], hour['minutes_past'])
    return est
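The line pow(pow(10, dbz/10)/200, 0.625) is the Marshall-Palmer Z-R relation Z = 200 * R^1.6 solved for the rain rate, R = (10^(dBZ/10) / 200)^(1/1.6), in mm/hr. A quick sanity check on a single, arbitrarily chosen reflectivity value:
In [ ]:
dbz = 30.0                     # arbitrary sample reflectivity in dBZ
z = 10 ** (dbz / 10)           # linear reflectivity factor
rate = (z / 200) ** 0.625      # Marshall-Palmer: Z = 200 * R**1.6
print(rate)                    # roughly 2.7 mm/hr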
In [13]:
estimates = train.groupby(train.index).apply(myfunc)
estimates.head(20)
Out[13]:
In [14]:
train["Expected"].head(20)
Out[14]:
In [15]:
res = []
i = 0
while i < len(fall):
    pred = 0
    lenn = 0
    curr = fall[i][0]
    while i < len(fall) and fall[i][0] == curr:
        #print(fall[i])
        pred += etreg.predict(features_trans[i])[0]
        #print(etreg.predict(features_trans[i])[0])
        i += 1
        lenn += 1
    res.append((curr, pred/lenn))
In [178]:
etreg.predict(features_trans[0])
Out[178]:
In [16]:
def marshall_palmer(data):
    # not the real Marshall-Palmer any more: average the tree predictions over the hour
    res = []
    for n in data.as_matrix():
        res.append(etreg.predict(n)[0])
    return np.array(res).mean()

def simplesum(ref, hour):
    hour.sum()

def myfunc(hour):
    hour = hour.sort('minutes_past', ascending=True)
    est = marshall_palmer(hour[list(columns)])
    return est
In [302]:
estimates = train.groupby(train.index).apply(myfunc)
estimates.head(20)
In [ ]:
In [ ]:
RNN: treat each hour of radar records as a padded sequence and regress the hourly rain total with an LSTM
In [11]:
import pandas as pd
from random import random
flow = (list(range(1,10,1)) + list(range(10,1,-1)))*1000
pdata = pd.DataFrame({"a":flow, "b":flow})
pdata.b = pdata.b.shift(9)
data = pdata.iloc[10:] * random() # some noise
In [12]:
#columns = [u'minutes_past', u'radardist_km', u'Ref', u'Ref_5x5_10th',
# u'Ref_5x5_50th', u'Ref_5x5_90th', u'RefComposite',
# u'RefComposite_5x5_10th', u'RefComposite_5x5_50th',
# u'RefComposite_5x5_90th', u'RhoHV', u'RhoHV_5x5_10th',
# u'RhoHV_5x5_50th', u'RhoHV_5x5_90th', u'Zdr', u'Zdr_5x5_10th',
# u'Zdr_5x5_50th', u'Zdr_5x5_90th', u'Kdp', u'Kdp_5x5_10th',
# u'Kdp_5x5_50th', u'Kdp_5x5_90th']
columns = [u'radardist_km', u'Ref', u'Ref_5x5_10th']
nb_features = len(columns)
data = train[list(columns)]
data.head(10)
Out[12]:
In [13]:
data.iloc[0].as_matrix()
Out[13]:
In [14]:
train.head(5)
Out[14]:
In [15]:
train.loc[11]
Out[15]:
In [16]:
train.loc[11][:1]["Expected"].as_matrix()
Out[16]:
In [17]:
#train.index.unique()
In [18]:
def _load_data(data, n_prev = 100):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in range(len(data)-n_prev):
        docX.append(data.iloc[i:i+n_prev].as_matrix())
        docY.append(data.iloc[i+n_prev].as_matrix())
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = round(len(df) * (1 - test_size))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)

(X_train, y_train), (X_test, y_test) = train_test_split(data)
In [19]:
np.shape(X_train)
Out[19]:
In [20]:
t = np.array([2,1])
t.shape = (1,2)
t.tolist()[0]
Out[20]:
In [21]:
np.shape(t)
Out[21]:
In [22]:
X_train[:2,:2]
Out[22]:
In [23]:
XX[:2,:2]
In [29]:
XX[:2][:2]
In [30]:
np.shape(XX)
In [31]:
for i in XX:
print(np.shape(i))
In [24]:
np.shape(XX[0])
In [28]:
z = np.zeros([297,9,23])
In [29]:
np.shape(z)
Out[29]:
In [791]:
np.shape(np.reshape(XX,(297,1)))
Out[791]:
In [716]:
tl = train.loc[2][:1]["Expected"]
In [718]:
tl.as_blocks()
Out[718]:
In [719]:
tl.as_matrix()
Out[719]:
In [777]:
data.iloc[2:4].as_matrix()
Out[777]:
In [776]:
train.loc[2].as_matrix()
Out[776]:
In [46]:
m = data.loc[10].as_matrix()
pad = np.pad(m, ((0, max_padding -len(m) ),(0,0)), 'constant')
In [47]:
pad
Out[47]:
In [44]:
train.index.unique()
Out[44]:
In [25]:
max_padding = 20
In [26]:
%%time
docX, docY = [], []
for i in train.index.unique():
    if isinstance(train.loc[i], pd.core.series.Series):
        # a single-record hour comes back as a Series, so wrap it to keep 2-D shape
        m = [data.loc[i].as_matrix()]
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')  # pre-padding
        docX.append(pad)
        docY.append(float(train.loc[i]["Expected"]))
    else:
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
        docX.append(pad)
        docY.append(float(train.loc[i][:1]["Expected"]))
        #docY.append(train.loc[i][:1]["Expected"].as_matrix)
XX = np.array(docX)
yy = np.array(docY)
In [27]:
np.shape(XX)
Out[27]:
In [28]:
#from keras.preprocessing import sequence
#sequence.pad_sequences(X_train, maxlen=maxlen)
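Keras ships a padding helper that could replace the manual np.pad loop above; whether it accepts per-timestep feature vectors rather than plain integer sequences depends on the Keras version, so treat this as a sketch only (seqs and XX_alt are throwaway names):
In [ ]:
# assumption: this Keras version's pad_sequences handles 2-D (timesteps x features) sequences
from keras.preprocessing import sequence
seqs = [np.atleast_2d(data.loc[i].as_matrix()) for i in train.index.unique()]
XX_alt = sequence.pad_sequences(seqs, maxlen=max_padding, dtype='float32', padding='pre')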
In [29]:
def _load_data(data):
    """
    data should be pd.DataFrame()
    """
    docX, docY = [], []
    for i in data.index.unique():
        #np.pad(tmp, ((0, max_padding -len(tmp) ),(0,0)), 'constant')
        m = data.loc[i].as_matrix()
        pad = np.pad(m, ((0, max_padding - len(m)), (0, 0)), 'constant')
        docX.append(pad)
        if isinstance(train.loc[i], pd.core.series.Series):
            docY.append(float(train.loc[i]["Expected"]))
        else:
            docY.append(float(train.loc[i][:1]["Expected"]))
    alsX = np.array(docX)
    alsY = np.array(docY)
    return alsX, alsY

def train_test_split(df, test_size=0.1):
    ntrn = round(len(df) * (1 - test_size))
    X_train, y_train = _load_data(df.iloc[0:ntrn])
    X_test, y_test = _load_data(df.iloc[ntrn:])
    return (X_train, y_train), (X_test, y_test)

(X_train, y_train), (X_test, y_test) = train_test_split(train)
In [30]:
len(X_train[0])
Out[30]:
In [31]:
train.head()
Out[31]:
In [32]:
X_train[0][:10]
Out[32]:
In [33]:
yt = []
for i in y_train:
    yt.append([i[0]])
In [34]:
yt[0]
Out[34]:
In [35]:
X_train.shape
Out[35]:
In [36]:
len(fea[0])
In [37]:
len(X_train[0][0])
Out[37]:
In [38]:
f = np.array(fea)
In [443]:
f.shape
In [39]:
XX[0]
Out[39]:
In [428]:
#(X_train, y_train), (X_test, y_test) = train_test_split(data) # retrieve data
# and now train the model
# batch_size should be appropriate to your memory size
# number of epochs should be higher for real world problems
model.fit(X_train, yt, batch_size=450, nb_epoch=2, validation_split=0.05)
Out[428]:
In [43]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
In [44]:
%%time
input_dim = nb_features
out_dim = 1
hidden_dim = 200
model = Sequential()
#Embedding(input_dim, hidden_dim, mask_zero=True)
#model.add(LSTM(hidden_dim, hidden_dim, return_sequences=False))
model.add(LSTM(input_dim, hidden_dim, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(hidden_dim, out_dim))
model.add(Activation("linear"))
model.compile(loss="mean_squared_error", optimizer="rmsprop")
In [45]:
model.fit(XX, yy, batch_size=10, nb_epoch=10, validation_split=0.1)
Out[45]:
In [46]:
idx = np.random.randint(len(XX))  # pick a random hour; the name 'random' was shadowed by 'from random import random' above
print(model.predict(XX[idx:idx+1])[0][0])
print(yy[idx])
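To turn the LSTM into a submission, the test hours would need the same per-Id padding as the training data. A rough sketch, reusing the three RNN feature columns; the NaN handling, variable names, and output path are assumptions:
In [ ]:
# illustrative only: pad each test hour, predict one value per Id, write Id/Expected rows
test_data = test.set_index('Id')[list(columns)]
docX_test, ids = [], []
for i in test_data.index.unique():
    m = np.atleast_2d(test_data.loc[i].as_matrix())
    m = m[:max_padding]                                   # truncate hours longer than the padding window
    pad = np.pad(m, ((max_padding - len(m), 0), (0, 0)), 'constant')
    docX_test.append(pad)
    ids.append(i)
XX_test = np.nan_to_num(np.array(docX_test))              # the network cannot ingest NaNs
preds = model.predict(XX_test)[:, 0]
sub = pd.DataFrame({"Id": ids, "Expected": preds})[["Id", "Expected"]]
sub.to_csv("data/submission_lstm.csv", index=False)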
In [ ]: