In [1]:
import sys
sys.path.append("d:/Kaggle_ws/Bosch/src/")
import numpy as np
import pandas as pd
#import xgboost as xgb
from datetime import datetime
import random
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV # for optimizing some parameters
from sklearn.metrics import matthews_corrcoef
from include.dataset_fnames import train_numeric_fname, train_date_fname
In [2]:
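# Matthews correlation coefficient (MCC):
#
# MCC = \frac{TP \cdot TN - FP \cdot FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}
#
# where TP, TN, FP and FN are the true/false positive/negative counts.
# Available as sklearn.metrics.matthews_corrcoef(y_true, y_pred, sample_weight=None)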
In [3]:
# Load the first N_ROWS rows of the date and numeric training CSVs, join them
# column-wise into the feature matrix X, and read the label vector y.
N_ROWS = 250000  # one row cap shared by all three reads so X and y have the same length

# print(<single string>) produces identical output on Python 2 and Python 3,
# whereas the print statement is a SyntaxError on Python 3.
print("1 Reading CSV files")
t0 = datetime.now()
# Date-only variant (disabled):
# X = pd.read_csv(train_date_fname, index_col=0, dtype=np.float32).values
X = np.concatenate(
    [
        pd.read_csv(train_date_fname, index_col=0, dtype=np.float32, nrows=N_ROWS).values,
        pd.read_csv(train_numeric_fname, index_col=0, dtype=np.float32, nrows=N_ROWS).values,
    ],
    axis=1,
)
# NOTE(review): y is column 969 of the numeric file, and X above is built from
# *all* columns of that same file, so X presumably still contains the label
# column (target leakage). Confirm, and if so drop it from X and adjust any
# downstream feature-count settings accordingly.
y = pd.read_csv(
    train_numeric_fname,
    index_col=0,
    dtype=np.float32,
    usecols=[0, 969],
    nrows=N_ROWS,
).values.ravel()
t1 = datetime.now()
# "%s" formatting of the timedelta matches what the old print statement emitted.
print("loaded in %s" % (t1 - t0))
In [4]:
# Hyper-parameter candidates for the XGBoost grid search. Only the learning
# rate and the base score take several values; tree depth and the number of
# estimators are each pinned to a single candidate.
xgb_param_grid = dict(
    max_depth=[3],
    learning_rate=[0.1, 0.01],
    n_estimators=[1000],
    base_score=[0.001, 0.005, 0.01],
)
In [5]:
# Placeholder for the XGBoost grid search. The search itself is disabled
# (commented out below), so the timer only spans the "Removed..." message.
# print(<single string>) is used so the cell runs on both Python 2 and
# Python 3; the print statement is a SyntaxError on Python 3.
print("Running Gridsearch")
t0 = datetime.now()
print("Removed...")
# xgb_grid = GridSearchCV(XGBClassifier(objective='binary:logistic'), xgb_param_grid, n_jobs=1, cv=3)
# xgb_grid.fit(X, y)
t1 = datetime.now()
# "%s" formatting of the timedelta matches what the old print statement emitted.
print("Ran in %s" % (t1 - t0))
In [6]:
# print (xgb_grid.best_params_)
# print (xgb_grid.best_score_)
In [7]:
from include.GCForest import gcForest
In [8]:
# Build the gcForest model that is fitted on (X, y) in the next cell.
gcf = gcForest(
    # NOTE(review): 2125 presumably equals the column count of X
    # (X.shape[1]) - confirm before changing how X is assembled.
    shape_1X=2125,
    window=1,
    tolerance=0.0,
)
In [ ]:
# Train the gcForest instance on the feature matrix X and label vector y.
gcf.fit(X, y)
In [8]:
# Debugging aid: display the Python type of the feature matrix X.
type(X)
Out[8]:
In [ ]: