Victor Kantor, xead.wl@gmail.com
In [1]:
import pandas as pd
from sklearn import model_selection, metrics
import numpy as np
# assuming tab-separated files, given the .tsv extension
train = pd.read_csv("train.tsv", sep='\t')
test = pd.read_csv("test.tsv", sep='\t')
sample_submission = pd.read_csv("sample_submission.tsv", sep='\t')
In [2]:
def score_func(y_true, y_pred):
    # symmetric MAPE (SMAPE), scaled to [0, 200]
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    return (np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred))).mean() * 200.
scorer = metrics.make_scorer(score_func=score_func, greater_is_better=False)
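A quick sanity check of the metric on toy values (illustrative, not part of the original run): this is SMAPE scaled to [0, 200], so a perfect prediction scores 0 and the worst case scores 200.
In [ ]:
# Illustrative check: perfect predictions give 0;
# predicting 50 for a true 100 gives 200 * (50/150) ≈ 66.7.
print(score_func([100., 200.], [100., 200.]))
print(score_func([100.], [50.]))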
In [61]:
frac = 1.  # fraction of training examples used for model fitting
# sample the train set if you don't want to deal with all examples
train = train.sample(frac=frac, random_state=42)
X = train.drop(['Num', 'y'], axis=1)
y = train['y']
print(len(X), len(y))
In [62]:
%%time
from sklearn.ensemble import GradientBoostingRegressor
import xgboost
#model = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=43)
#model = xgboost.XGBRegressor(n_estimators=100, max_depth=3)
# `best` is the tuned XGBRegressor built in the grid-search cells below;
# run those cells first (the In [] numbers show the cells ran out of order).
model = best
model.fit(X, y)
preds = model.predict(test.drop(['Num'], axis=1))
print(len(preds))
print(len(sample_submission))
In [63]:
sample_submission['y'] = preds
In [64]:
sample_submission.head(5)
Out[64]:
In [65]:
# A GBM regressor can produce negative predictions:
print(sample_submission[sample_submission['y'] < 0])
In [66]:
sample_submission['y'] = sample_submission['y'].clip(lower=0.0)
In [67]:
sample_submission.to_csv("baseline_submission.csv", sep=',', index=False)
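As a hedged sanity check (not in the original run), the saved file can be read back to confirm its shape and that no negative predictions remain:
In [ ]:
# Illustrative round-trip check of the submission file.
check = pd.read_csv("baseline_submission.csv")
print(check.shape, list(check.columns))
print((check['y'] >= 0).all())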
In [167]:
model_selection.cross_val_score(model, X, y, scoring=scorer)
Out[167]:
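Note that make_scorer with greater_is_better=False negates the metric, so cross-validation reports negative SMAPE values; flip the sign to read the error (a note on scikit-learn's convention, with an illustrative cell):
In [ ]:
# scikit-learn negates a loss built with greater_is_better=False,
# so negate the returned scores to recover SMAPE itself.
scores = model_selection.cross_val_score(model, X, y, scoring=scorer)
print(-scores.mean(), scores.std())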
In [160]:
def find(params):
    # helper for a manual search over (n_estimators, max_depth),
    # scored with the competition metric
    model = xgboost.XGBRegressor(n_estimators=params[0], max_depth=params[1])
    return model_selection.cross_val_score(model, X, y, scoring=scorer).mean()
In [54]:
# stage 1: number of trees at a fixed learning rate
params = {
    'n_estimators': [112, 114, 116],
    'learning_rate': [0.1]
}
# stage 2: tree depth and min_child_weight with n_estimators fixed
params2 = {
    'n_estimators': [115],
    'learning_rate': [0.1],
    'max_depth': [2, 6, 10, 14, 18],
    'min_child_weight': [1, 3, 5]
}
# stage 3: gamma around the best depth/child-weight combination
params3 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
# stage 4: row and column subsampling with the earlier winners fixed
params4 = {
    'gamma': [0.3],
    'n_estimators': [115],
    'learning_rate': [0.1],
    'max_depth': [18],
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
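The four grids encode a staged search: each stage fixes the winners of the previous one and explores new axes. A hedged sketch (not from the original notebook) of chaining the stages automatically:
In [ ]:
# Illustrative sketch: run the staged grids in order, carrying each
# stage's best_params_ into the next stage as fixed values.
found = {}
for grid in [params, params2, params3, params4]:
    full_grid = {k: [v] for k, v in found.items()}
    full_grid.update(grid)  # explicit values in the stage override carried ones
    gs = model_selection.GridSearchCV(xgboost.XGBRegressor(), full_grid,
                                      scoring=scorer,
                                      cv=model_selection.TimeSeriesSplit())
    gs.fit(X, y)
    found.update(gs.best_params_)
print(found)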
In [55]:
# Newer scikit-learn dropped GridSearchCV's fit_params constructor argument,
# and newer xgboost takes eval_metric in the estimator constructor
# (the deprecated silent flag is removed as well).
a = model_selection.GridSearchCV(xgboost.XGBRegressor(eval_metric='mae'),
                                 params4, scoring=scorer,
                                 cv=model_selection.TimeSeriesSplit())
In [56]:
%%time
a.fit(X, y)
Out[56]:
In [57]:
a.best_params_
Out[57]:
In [45]:
# scratch: merge the tuned params with the gamma grid for a follow-up search
z = dict(a.best_params_, **params3)
In [58]:
best = xgboost.XGBRegressor(**a.best_params_)
In [59]:
best
Out[59]:
In [43]:
# scratch: combine the gamma and depth grids into one dict
z = dict(params3, **params2)
In [44]:
z
Out[44]: