In [1]:
import sys
sys.path.append("d:/Kaggle_ws/Bosch/src/")

import numpy as np
import pandas as pd
#import xgboost as xgb
from datetime import datetime

import random
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV # for optimizing some parameters

from sklearn.metrics import matthews_corrcoef

from include.dataset_fnames import train_numeric_fname, train_date_fname

In [2]:
#
# Matthews correlation coefficient (the competition metric):
#
#   MCC = \frac{TP \cdot TN - FP \cdot FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}
#
# Available as sklearn.metrics.matthews_corrcoef(y_true, y_pred, sample_weight=None)

In [3]:
print "1 Reading CSV files"
t0 = datetime.now()

# X = pd.read_csv(train_date_fname, index_col=0, dtype=np.float32).values
X = np.concatenate([pd.read_csv(train_date_fname, index_col=0, dtype=np.float32, nrows=250000).values,
                    pd.read_csv(train_numeric_fname, index_col=0, dtype=np.float32, nrows=250000).values], 
                    axis=1)

y = pd.read_csv(train_numeric_fname, index_col=0, dtype=np.float32, usecols=[0,969], nrows=250000).values.ravel()

t1 = datetime.now()
print "loaded in", t1 - t0


1 Reading CSV files
loaded in 0:00:39.604000

In [4]:
# Hyper-parameter grid for the XGBoost classifier; several candidate values
# are left out to keep the search small.
xgb_param_grid = dict(
    max_depth=[3],                      # also considered: 5, 7
    learning_rate=[0.1, 0.01],
    n_estimators=[1000],                # also considered: 500
    base_score=[0.001, 0.005, 0.01],
)

In [5]:
# Grid search over xgb_param_grid -- currently DISABLED: the fit call is
# commented out below, so this cell only prints markers and times an empty
# section (hence "Ran in 0:00:00" in the captured output).
print "Running Gridsearch"
t0 = datetime.now()

print "Removed..."
# xgb_grid = GridSearchCV(XGBClassifier(objective='binary:logistic'), xgb_param_grid, n_jobs=1, cv=3)
# xgb_grid.fit(X, y)

t1 = datetime.now()
print "Ran in", t1 - t0


Running Gridsearch
Removed...
Ran in 0:00:00

In [6]:
# print (xgb_grid.best_params_)
# print (xgb_grid.best_score_)

In [7]:
from include.GCForest import gcForest

In [8]:
# Build the gcForest model. shape_1X must equal the number of feature columns
# in X (date + numeric = 2125 under the current load); derive it from X
# instead of hard-coding 2125 so the value stays correct if the loaded
# columns ever change.
gcf = gcForest(shape_1X=X.shape[1], window=1, tolerance=0.0)

In [ ]:
# Train the gcForest model on the loaded feature matrix and labels
# (captured output shows it begins multi-grained scanning: "Slicing
# Sequence... Training MGS Random Forests...").
gcf.fit(X,y)


Slicing Sequence...
Training MGS Random Forests...

In [8]:
# Sanity check on the feature matrix type (Out[8] confirms numpy.ndarray).
type(X)


Out[8]:
numpy.ndarray

In [ ]: