In [1]:
import h2o
import time, os
In [5]:
%matplotlib inline
#IMPORT ALL THE THINGS
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
In [3]:
h2o.init(max_mem_size=20)  # 20 GB max heap; uses all cores by default
h2o.remove_all()
In [6]:
xy_tr = h2o.import_file(path = os.path.realpath("../daielee/xy_tr.csv"))
x_test = h2o.import_file(path = os.path.realpath("../daielee/x_test.csv"))
In [7]:
xy_tr_df = xy_tr.as_data_frame(use_pandas=True)
x_test_df = x_test.as_data_frame(use_pandas=True)
In [8]:
print(xy_tr_df.shape, x_test_df.shape)
(truncated H2O output: MSE 0.02258, reported on the training data)
In [10]:
X = xy_tr.col_names[0:57]   # first 57 columns are predictors
y = xy_tr.col_names[57]     # column 58 is the response
dl_model = H2ODeepLearningEstimator(epochs=1000)
dl_model.train(x=X, y=y, training_frame=xy_tr)
In [11]:
dl_model.summary
Out[11]:
In [14]:
sh = dl_model.score_history()
sh = pd.DataFrame(sh)
print(sh.columns)
In [16]:
sh.plot(x='epochs',y = ['training_deviance', 'training_mae'])
Out[16]:
In [20]:
dl_model.default_params
Out[20]:
In [38]:
dl_model.model_performance(test_data=xy_tr)  # scored on the training frame itself, so these metrics are optimistic
Out[38]:
In [39]:
pd.DataFrame(dl_model.varimp())
Out[39]:
In [40]:
y_test = dl_model.predict(test_data=x_test)
In [41]:
print(y_test.shape)
In [103]:
nuron_cnts = [40, 80, 160]
layer_cnts = [1, 2, 3, 4, 5]
acts = ["Tanh", "Maxout", "Rectifier", "RectifierWithDropout"]
models_list = []
m_names_list = []
i = 0
# 3 neuron counts * 5 layer counts * 4 activations = 60 models
for act in acts:
    for layer_cnt in layer_cnts:
        for nuron_cnt in nuron_cnts:
            m_names_list.append("N:" + str(nuron_cnt) + " L:" + str(layer_cnt) + " A:" + act)
            print(m_names_list[i])
            models_list.append(H2ODeepLearningEstimator(
                model_id=m_names_list[i],
                hidden=[nuron_cnt] * layer_cnt,  # more hidden layers -> more complex interactions
                activation=act,
                epochs=10,                       # short run for the coarse sweep
                score_validation_samples=10000,
                overwrite_with_best_model=True,
                adaptive_rate=True,
                l1=0.00001,                      # light L1/L2 regularization
                l2=0.00001,
                max_w2=10.0                      # helps stability for Rectifier
            ))
            models_list[i].train(x=X, y=y, training_frame=xy_tr,
                                 validation_frame=xy_tr)  # validating on the training frame for this coarse sweep
            i += 1
In [104]:
for i in range(len(models_list)):   # the 60 models trained above
    try:
        sh = pd.DataFrame(models_list[i].score_history())
        perform = sh['validation_deviance'].tolist()[-1]
        print(models_list[i].model_id, end=" ")
        print(perform)
    except Exception:
        pass   # skip models whose scoring history lacks validation_deviance
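The triple loop above is a hand-rolled grid search. H2O ships a grid-search API that runs the same sweep and ranks the results in one call; a minimal sketch under the same hyperparameters (the sort key and short epoch budget here are assumptions, not tuned choices):
In [ ]:
from h2o.grid.grid_search import H2OGridSearch

# Same 60-model sweep as the loop above, expressed as an H2O grid.
hyper_params = {
    "hidden": [[n] * l for n in [40, 80, 160] for l in [1, 2, 3, 4, 5]],
    "activation": ["Tanh", "Maxout", "Rectifier", "RectifierWithDropout"],
}
grid = H2OGridSearch(
    model=H2ODeepLearningEstimator(epochs=10, l1=1e-5, l2=1e-5, max_w2=10.0),
    hyper_params=hyper_params,
)
grid.train(x=X, y=y, training_frame=xy_tr, validation_frame=xy_tr)
# Rank all 60 models by validation error instead of printing deviances by hand.
print(grid.get_grid(sort_by="mse", decreasing=False))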
Hold out 20% for final testing.
We train on one subset and use the others to check the model's validity, making sure it predicts accurately on data it has never seen.
Use RectifierWithDropout from here on.
In [181]:
train, valid, test = xy_tr.split_frame([0.6, 0.2], seed=1234)  # 60% train / 20% valid / 20% test
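A quick sanity check that the split came out roughly 60/20/20:
In [ ]:
# Row counts of the three splits (proportions are approximate; split_frame samples randomly).
print(train.nrow, valid.nrow, test.nrow)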
In [112]:
nuron_cnts = [40, 80, 160]
layer_cnts = [1, 2, 3, 4, 5]
acts = ["RectifierWithDropout"]  # "Tanh", "Maxout", "Rectifier" dropped after the sweep above
models_list = []
m_names_list = []
time_tkn_wall = []
time_tkn_clk = []
i = 0
# 3 neuron counts * 5 layer counts * 1 activation = 15 models
for act in acts:
    for layer_cnt in layer_cnts:
        for nuron_cnt in nuron_cnts:
            m_names_list.append("N: " + str(nuron_cnt) + " L: " + str(layer_cnt) + " A: " + act)
            print(m_names_list[i])
            models_list.append(H2ODeepLearningEstimator(
                model_id=m_names_list[i],
                hidden=[nuron_cnt] * layer_cnt,  # more hidden layers -> more complex interactions
                activation=act,
                epochs=10,                       # short run for the coarse sweep
                score_validation_samples=10000,
                overwrite_with_best_model=True,
                adaptive_rate=True,
                l1=0.00001,                      # light L1/L2 regularization
                l2=0.00001,
                max_w2=10.0                      # helps stability for Rectifier
            ))
            str_time_clk = time.clock()   # CPU time; deprecated, removed in Python 3.8
            str_time_wall = time.time()   # wall-clock time
            models_list[i].train(x=X, y=y, training_frame=train,
                                 validation_frame=valid)
            time_tkn_clk.append(time.clock() - str_time_clk)
            time_tkn_wall.append(time.time() - str_time_wall)
            i += 1
time.time() shows that roughly one second of wall-clock time passed per model, while time.clock() shows almost no CPU time spent in the current Python process (under a microsecond here): the training itself runs inside the H2O server, so Python mostly waits. time.clock() has much higher resolution than time.time(), but it was deprecated in Python 3.3 and removed in 3.8; time.process_time() and time.perf_counter() are the replacements.
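A minimal sketch of the distinction, using the modern timers (time.perf_counter for wall time, time.process_time for CPU time):
In [ ]:
import time

wall0 = time.perf_counter()   # wall-clock timer
cpu0 = time.process_time()    # CPU time of this process only
time.sleep(1)                 # sleeping consumes wall time but almost no CPU
print("wall:", time.perf_counter() - wall0)   # ~1.0
print("cpu: ", time.process_time() - cpu0)    # ~0.0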
In [115]:
for i in range(len(models_list)):
    try:
        sh = pd.DataFrame(models_list[i].score_history())
        perform = sh['validation_deviance'].tolist()[-1]
        print(models_list[i].model_id, end=" ")
        print(" clk " + str(time_tkn_clk[i]) + " wall " + str(time_tkn_wall[i]), end=" ")
        print(perform)
    except Exception:
        pass   # skip models whose scoring history lacks validation_deviance
In [172]:
nuron_cnts = [30, 40, 50]
layer_cnts = [4, 5]
acts = ["RectifierWithDropout"]  # "Tanh", "Maxout", "Rectifier" dropped after the sweep above
dout = 0.5
models_list = []
m_names_list = []
time_tkn_wall = []
time_tkn_clk = []
i = 0
# 3 neuron counts * 2 layer counts * 1 activation = 6 models
for act in acts:
    for layer_cnt in layer_cnts:
        for nuron_cnt in nuron_cnts:
            m_names_list.append("N: " + str(nuron_cnt) + " L: " + str(layer_cnt) + " A: " + act)
            print(m_names_list[i])
            models_list.append(H2ODeepLearningEstimator(
                model_id=m_names_list[i],
                hidden=[nuron_cnt] * layer_cnt,        # more hidden layers -> more complex interactions
                hidden_dropout_ratios=[dout] * layer_cnt,
                activation=act,
                epochs=500,                            # longer run for the refined sweep
                train_samples_per_iteration=300,
                score_validation_samples=10000,
                loss="absolute",                       # optimize mean absolute error directly
                overwrite_with_best_model=True,
                adaptive_rate=True,
                l1=0.00001,                            # light L1/L2 regularization
                l2=0.0001,
                max_w2=10.0,                           # helps stability for Rectifier
                variable_importances=True
            ))
            str_time_clk = time.clock()   # CPU time; deprecated, removed in Python 3.8
            str_time_wall = time.time()   # wall-clock time
            models_list[i].train(x=X, y=y, training_frame=train,
                                 validation_frame=valid)
            time_tkn_clk.append(time.clock() - str_time_clk)
            time_tkn_wall.append(time.time() - str_time_wall)
            i += 1
In [142]:
dl_pref = dl_model.model_performance(test_data=test)  # note: dl_model was trained on all of xy_tr, so the test split overlaps its training data
In [ ]:
# dl_model.mean  # not an H2O model attribute; metrics come from dl_pref, e.g. dl_pref.mse() or dl_pref.mae()
In [155]:
dl_pref.mae()
Out[155]:
In [163]:
train.shape
models_list[0].model_id
Out[163]:
In [173]:
for i in range(len(models_list)):
    try:
        sh = pd.DataFrame(models_list[i].score_history())
        sh.plot(x='epochs', y=['training_mae', 'validation_mae'])
        tr_perform = sh['training_mae'].tolist()[-1]
        val_perform = sh['validation_mae'].tolist()[-1]
        ts_perform = models_list[i].model_performance(test_data=test).mae()
        print(models_list[i].model_id, end=" ")
        print("clk(s) " + str(round(time_tkn_clk[i], 2)) + "\twall(min) " + str(round(time_tkn_wall[i] / 60, 2)), end="\t")
        print("tr " + str(round(tr_perform, 6)) + "\tval " + str(round(val_perform, 6)) + "\tts " + str(round(ts_perform, 6)))
    except Exception:
        pass   # skip models that failed to train or lack MAE history
In [174]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt
In [175]:
np.random.seed(17)
random.seed(17)
train = pd.read_csv("../input/train_2016_v2.csv", parse_dates=["transactiondate"])
properties = pd.read_csv("../input/properties_2016.csv")
submission = pd.read_csv("../input/sample_submission.csv")
print(len(train),len(properties),len(submission))
In [176]:
def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter  # overwrite the date column with its quarter
    df = df.fillna(-1.0)
    return df

def MAE(y, ypred):
    # logerror = log(Zestimate) - log(SalePrice)
    return np.sum([abs(y[i] - ypred[i]) for i in range(len(y))]) / len(y)
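The list comprehension in MAE runs at Python speed; a vectorized drop-in equivalent (the name MAE_vec is ours):
In [ ]:
def MAE_vec(y, ypred):
    # Same mean absolute error as MAE above, computed in one vectorized pass.
    return np.mean(np.abs(np.asarray(y) - np.asarray(ypred)))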
In [177]:
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = []  # free memory
exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == 'O'] + ['logerror', 'parcelid']  # object columns plus target and id
col = [c for c in train.columns if c not in exc]
In [178]:
train = get_features(train[col])
test['transactiondate'] = '2016-01-01' #should use the most common training date
test = get_features(test[col])
In [179]:
reg = LinearRegression(n_jobs=-1)
reg.fit(train, y); print('fit...')
print(MAE(y, reg.predict(train)))
train = []; y = [] #memory
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']
In [187]:
pred0 = models_list[1].predict(test_data=x_test).as_data_frame(use_pandas=True)  # DL baseline predictions for the blend below
In [188]:
pred0.head(n=5)
Out[188]:
In [189]:
OLS_WEIGHT = 0.0856
print("\nPredicting with OLS and combining with the DL baseline predictions ...")
for i in range(len(test_dates)):
    test['transactiondate'] = test_dates[i]
    pred = OLS_WEIGHT * reg.predict(get_features(test)) + (1 - OLS_WEIGHT) * pred0.values[:, 0]
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict...', i)
print("\nCombined OLS/DL baseline predictions:")
print(submission.head())
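The blend is a plain convex combination, final = 0.0856 * OLS + 0.9144 * DL; a toy check with made-up values:
In [ ]:
# Illustrative numbers only, not model output.
ols_p, dl_p = 0.020, -0.010
print(OLS_WEIGHT * ols_p + (1 - OLS_WEIGHT) * dl_p)   # -0.007432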
In [190]:
from datetime import datetime
submission.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)
In [ ]:
# Scratch reference, arguments still to be filled in:
# h2o.model.regression.h2o_mean_absolute_error(y_actual=..., y_predicted=...)