In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
dataset = pd.read_csv('train.csv')

In [3]:
dataset.head(3)


Out[3]:
id cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 ... cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13 cont14 loss
0 1 A B A B A A A A B ... 0.718367 0.335060 0.30260 0.67135 0.83510 0.569745 0.594646 0.822493 0.714843 2213.18
1 2 A B A A A A A A B ... 0.438917 0.436585 0.60087 0.35127 0.43919 0.338312 0.366307 0.611431 0.304496 1283.60
2 5 A B A A B A A A B ... 0.289648 0.315545 0.27320 0.26076 0.32446 0.381398 0.373424 0.195709 0.774425 3005.09

3 rows × 132 columns


In [6]:
cont_columns = []
cat_columns = []
for i in dataset.columns:
    if dataset[i].dtype == 'float' and i != 'loss':
        cont_columns.append(i)
    elif dataset[i].dtype == 'object':
        cat_columns.append(i)
print("number of continuous features: ", len(cont_columns))
print("number of categorical features: ", len(cat_columns))


number of continuous features:  14
number of categorical features:  116
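
As a cross-check, `select_dtypes` gives the same split in two lines (a sketch; it assumes the only float columns besides `loss` are the `cont*` features):

In [ ]:
cont_check = dataset.drop('loss', axis=1).select_dtypes(include=['float']).columns.tolist()
cat_check = dataset.select_dtypes(include=['object']).columns.tolist()
assert cont_check == cont_columns and cat_check == cat_columns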

In [7]:
dataset.describe(include=['object'])


Out[7]:
cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10 ... cat107 cat108 cat109 cat110 cat111 cat112 cat113 cat114 cat115 cat116
count 188318 188318 188318 188318 188318 188318 188318 188318 188318 188318 ... 188318 188318 188318 188318 188318 188318 188318 188318 188318 188318
unique 2 2 2 2 2 2 2 2 2 2 ... 20 11 84 131 16 51 61 19 23 326
top A A A A A A A A A A ... F B BI CL A E BM A K HK
freq 141550 106721 177993 128395 123737 131693 183744 177274 113122 160213 ... 47310 65512 152918 25305 128395 25148 26191 131693 43866 21061

4 rows × 116 columns
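
Most of the early categories are binary, while the later ones are high-cardinality; `cat116` tops out at 326 distinct levels, with `HK` alone covering 21,061 rows. A quick look at its dominant levels (an illustrative check, not part of the original analysis):

In [ ]:
dataset['cat116'].value_counts().head()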

*Test out Faron's XGBoost embedding method: fit one deep tree on the categorical features, take the leaf index each row lands in as its embedding, then one-hot encode those indices.*


In [9]:
import xgboost as xgb
from scipy.sparse import csr_matrix
from sklearn.model_selection import KFold
from collections import Counter

In [20]:
ID = 'id'
TARGET = 'loss'
DATA_DIR = ""

TRAIN_FILE = "{0}train.csv".format(DATA_DIR)
TEST_FILE = "{0}test.csv".format(DATA_DIR)

SEED = 0
NFOLDS = 5
NTHREADS = 4

xgb_params = {
    'seed': 0,
    'colsample_bytree': 1,
    'silent': 1,
    'subsample': 1.0,
    'learning_rate': 1.0,
    'objective': 'reg:linear',   # renamed 'reg:squarederror' in later XGBoost releases
    'max_depth': 100,            # one very deep tree => a fine-grained partition of the data
    'num_parallel_tree': 1,
    'min_child_weight': 250,     # but keep each leaf large enough to be statistically stable
    'eval_metric': 'mae',
    'nthread': NTHREADS,
    'nrounds': 1                 # a single round: the embedding is one tree's leaf indices
}

In [17]:
def get_data():
    train = pd.read_csv(TRAIN_FILE)
    test = pd.read_csv(TEST_FILE)

    y_train = train[TARGET].ravel()

    train.drop([ID, TARGET], axis=1, inplace=True)
    test.drop([ID], axis=1, inplace=True)

    ntrain = train.shape[0]
    train_test = pd.concat((train, test)).reset_index(drop=True)

    features = train.columns
    cats = [feat for feat in features if 'cat' in feat]

    train_test = train_test[cats]

    for feat in cats:
        # convert categorical to integers
        train_test[feat] = pd.factorize(train_test[feat], sort=True)[0]

    x_train = np.array(train_test.iloc[:ntrain, :])
    x_test = np.array(train_test.iloc[ntrain:, :])

    return x_train, y_train, x_test


def get_oof(clf, x_train, y_train, x_test):
    # out-of-fold predictions: each training row is predicted by a fold model
    # that never saw it, so the resulting feature carries no target leakage
    ntrain = x_train.shape[0]
    oof_train = np.zeros((ntrain,))

    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED).split(x_train)

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)

    # refit on the full training set for the test-set predictions
    clf.train(x_train, y_train)
    oof_test = clf.predict(x_test)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


def get_sparse_ohe(x_train, x_test, min_obs=10):
    ntrain = x_train.shape[0]

    train_test = np.concatenate((x_train, x_test)).reshape(-1, )

    # map each value to itself, or to nan if it occurs fewer than min_obs times
    val = dict((k, np.nan if v < min_obs else k) for k, v in dict(Counter(train_test)).items())
    # k: sorted distinct values, v: what each maps to; digitize finds each
    # element's position in k, so v[...] applies the mapping vectorised
    k, v = np.array(list(zip(*sorted(val.items()))))
    train_test = v[np.digitize(train_test, k, right=True)]

    # one-hot encode; dummy_na=False silently drops the infrequent (nan) bucket
    ohe = csr_matrix(pd.get_dummies(train_test, dummy_na=False, sparse=True))

    x_train_ohe = ohe[:ntrain, :]
    x_test_ohe = ohe[ntrain:, :]

    return x_train_ohe, x_test_ohe
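
The rare-value bucketing in `get_sparse_ohe` is terse, so here is a toy walkthrough with made-up values and `min_obs=2` (not part of the original script): anything seen fewer than `min_obs` times maps to nan, and `get_dummies(dummy_na=False)` then drops it from the encoding entirely.

In [ ]:
arr = np.array([3, 3, 3, 7, 5, 5])
val = dict((k, np.nan if v < 2 else k) for k, v in Counter(arr).items())
k, v = np.array(list(zip(*sorted(val.items()))))
v[np.digitize(arr, k, right=True)]   # -> [3., 3., 3., nan, 5., 5.]: the lone 7 is gone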

In [18]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        # 'nrounds' is not a native XGBoost parameter, so pop it before xgb.train
        self.nrounds = params.pop('nrounds', 1000)

    def train(self, x_train, y_train, x_valid=None, y_valid=None, sample_weights=None):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    # pred_leaf=True => predict the index of the leaf each sample lands in,
    # rather than a value; these indices are the learned embedding
    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x), pred_leaf=True).astype(int)

In [ ]:
x_train, y_train, x_test = get_data()

dtrain = xgb.DMatrix(x_train, label=y_train)

xg = XgbWrapper(seed=SEED, params=xgb_params)
xg_cat_embedding_train, xg_cat_embedding_test = get_oof(xg, x_train, y_train, x_test)

xg_cat_embedding_ohe_train, xg_cat_embedding_ohe_test = get_sparse_ohe(xg_cat_embedding_train, xg_cat_embedding_test)

print("OneHotEncoded XG-Embeddings: {},{}".format(xg_cat_embedding_ohe_train.shape, xg_cat_embedding_ohe_test.shape))

In [29]:
xg_cat_embedding_train.shape


Out[29]:
(188318, 1)
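
With `nrounds = 1` the embedding is a single column of leaf indices, one tree's worth, hence the `(188318, 1)` shape; the one-hot step then expands it into one indicator column per surviving leaf. As a minimal sketch of a second stage (assuming a plain ridge regression, which accepts scipy sparse input; this step is not in the original script):

In [ ]:
from sklearn.linear_model import Ridge

stage2 = Ridge(alpha=1.0)
stage2.fit(xg_cat_embedding_ohe_train, y_train)
stage2_preds = stage2.predict(xg_cat_embedding_ohe_test)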

In [23]:
df = pd.DataFrame({'A':['type1','type2','type2'],
                   'B':['type1','type2','type3'],
                   'C':['type1','type3','type3']})

In [26]:
df['A'] = pd.factorize(df['A'], sort=True)[0]

In [27]:
df


Out[27]:
A B C
0 0 type1 type1
1 1 type2 type3
2 1 type3 type3
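
The same `pd.factorize` call can be applied to every column at once; a one-line sketch of the idea used inside `get_data`:

In [ ]:
df_encoded = df.apply(lambda col: pd.factorize(col, sort=True)[0])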

In [ ]: