In [1]:
import os
import re
import pickle
import time
import datetime
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, csc_matrix
%matplotlib inline
# Custom modules
import const
import func
In [2]:
const.TRAIN_FILES
Out[2]:
In [3]:
const.TEST_FILES
Out[3]:
In [4]:
# Load
train_date = func.load_data_file(const.TRAIN_FILES[2])
test_date = func.load_data_file(const.TEST_FILES[2])
In [5]:
y = func.read_last_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0]+'.csv'))
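const and func are custom project modules that are not included in this notebook. Judging from how they are used here (a dict containing a DataFrame of Ids plus a CSR matrix of date features, and an Id-indexed Response series), minimal hypothetical stand-ins could look like the sketch below; the file locations, pickle layout and exact behaviour are assumptions, not the project's actual implementation.
# Hypothetical stand-ins for the custom helpers, inferred from their usage in this notebook
def load_data_file(name):
    # Assumption: each date file was pre-processed into a pickle holding
    # {'data': {'ids': DataFrame with an 'Id' column, 'features': scipy CSR matrix}}
    with open(os.path.join(const.DATA_PATH, name + '.pkl'), 'rb') as f:
        return pickle.load(f)

def read_last_column(path):
    # Assumption: reads only Id and the last column (Response) of the train numeric csv
    last_col = pd.read_csv(path, nrows=1).columns[-1]
    return pd.read_csv(path, usecols=['Id', last_col], index_col='Id')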
In [6]:
y.head(3)
Out[6]:
In [7]:
# Value of the first stored entry per row of the sparse date matrix, used as the start time;
# rows without any date entry are masked with NaN.
tmin_tr = train_date['data']['features'].data[train_date['data']['features'].indptr[:-1]]
tmin_tr[np.diff(train_date['data']['features'].indptr) == 0] = np.nan
tmin_te = test_date['data']['features'].data[test_date['data']['features'].indptr[:-1]]
tmin_te[np.diff(test_date['data']['features'].indptr) == 0] = np.nan
# Column index of that first entry, used as the start station
# (the integer indices array cannot hold NaN, hence the masking is left commented out).
nmin_tr = train_date['data']['features'].indices[train_date['data']['features'].indptr[:-1]]
#nmin_tr[np.diff(train_date['data']['features'].indptr)==0]=np.nan
nmin_te = test_date['data']['features'].indices[test_date['data']['features'].indptr[:-1]]
#nmin_te[np.diff(test_date['data']['features'].indptr)==0]=np.nan
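For a CSR matrix, data[indptr[i]] is the first stored value of row i and indices[indptr[i]] is its column; a row with no stored entries (np.diff(indptr) == 0) would otherwise pick up the next row's first value, which is what the NaN masking above guards against. A minimal illustration:
# Toy CSR matrix: row 1 has no stored entries at all
m = csr_matrix(np.array([[0., 5., 7.],
                         [0., 0., 0.],
                         [3., 0., 9.]]))
first_val = m.data[m.indptr[:-1]].astype(float)   # [5., 3., 3.] before masking
first_col = m.indices[m.indptr[:-1]]              # [1, 0, 0]
first_val[np.diff(m.indptr) == 0] = np.nan        # row 1 becomes NaN -> [5., nan, 3.]
# first_col[1] is meaningless for the empty row and is left as-is (int arrays cannot hold NaN)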
In [8]:
ids = pd.concat([train_date['data']['ids'], test_date['data']['ids']])
ids['start_time'] = np.concatenate([tmin_tr, tmin_te])
ids['start_time'] = ids['start_time'].astype(float).round(2)
ids['start_station'] = np.concatenate([nmin_tr, nmin_te])
#ids['start_station'] = ids['start_station'].astype(float).round(2)
ids = ids.merge(y.reset_index(), how='left')  # attach Response via the shared Id column; test rows get NaN
In [9]:
ntrain = train_date['data']['ids'].shape[0]
train_test = ids.reset_index(drop=True).reset_index(drop=False)  # keep the original row order in an 'index' column
In [10]:
# Id-gap features in the original (Id-sorted) order
train_test['f1'] = train_test['Id'].diff().fillna(0).astype(int)              # gap to the previous Id
train_test['f2'] = train_test['Id'].iloc[::-1].diff().fillna(0).astype(int)   # gap to the next Id
In [11]:
# Same Id gaps after sorting by start station and start time (the "leak" ordering); see the toy example below
train_test = train_test.sort_values(by=['start_station', 'start_time', 'Id'], ascending=True)
train_test['f3'] = train_test['Id'].diff().fillna(0).astype(int)
train_test['f4'] = train_test['Id'].iloc[::-1].diff().fillna(0).astype(int)
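In miniature, the four features are just differences of consecutive Ids, first in the original Id order (f1/f2) and then in the start_station/start_time ordering (f3/f4); the reversed diff assigns each row its gap to the next Id because pandas aligns on the index. A toy example with made-up values:
toy = pd.DataFrame({'Id': [4, 7, 9, 10],
                    'start_time': [82.2, 82.2, 87.3, 82.2],
                    'start_station': [0, 0, 1, 0]})
toy['f1'] = toy['Id'].diff().fillna(0).astype(int)             # [0, 3, 2, 1]
toy['f2'] = toy['Id'].iloc[::-1].diff().fillna(0).astype(int)  # [-3, -2, -1, 0]
toy = toy.sort_values(['start_station', 'start_time', 'Id'])   # Id order becomes 4, 7, 10, 9
toy['f3'] = toy['Id'].diff().fillna(0).astype(int)             # [0, 3, 3, -1]
toy['f4'] = toy['Id'].iloc[::-1].diff().fillna(0).astype(int)  # [-3, -3, 1, 0]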
In [14]:
train_test.head()
Out[14]:
In [17]:
train_test['f4'][train_test.Response==1].value_counts()
Out[17]:
In [15]:
train_test.to_csv(os.path.join(const.DATA_PATH, 'feat_set_leaks.csv'), index=False)
In [12]:
print(train_test[(train_test.Response==1) & (train_test.f4==-1)].Id.count())
print(train_test[(train_test.Response==0) & (train_test.f4==-1)].Id.count())
In [13]:
train_test = train_test.sort_values(by=['index']).drop(['index'], axis=1)  # restore the original row order
train = train_test.iloc[:ntrain, :]
In [14]:
# Histogram range; earlier alternatives kept for reference
#r = [-15, 1]
#r = [0, 15]
#r = [0, 1e4]
r = [-1e4, 0]
f, (ax1, ax2, ax3) = plt.subplots(1,3,figsize=(16,6))
train_test[train_test.Response>=0].f4.hist(bins=100, ax=ax1)
train_test[train_test.Response==0].f4.hist(bins=100, ax=ax2, range=r)
train_test[train_test.Response==1].f4.hist(bins=100, ax=ax3, range=r)
ax1.set_title('Histogram f4 R=0 + R=1')
ax2.set_title('Histogram f4 R=0')
ax3.set_title('Histogram f4 R=1')
Out[14]:
In [15]:
train_test[['f1','f2','f3','f4']].corr()
Out[15]:
In [16]:
list(train.columns)
Out[16]:
In [28]:
features = np.setdiff1d(list(train.columns), ['Response', 'Id','start_station'])
In [29]:
features
Out[29]:
In [30]:
y = train.Response.ravel()
X_train = np.array(train[features])
print('train: {0}'.format(train.shape))
prior = np.sum(y) / (1.*len(y))
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 2,
    'eval_metric': 'auc',
    'base_score': prior
}
dtrain = xgb.DMatrix(X_train, label=y)
res = xgb.cv(xgb_params, dtrain, num_boost_round=10, nfold=4, seed=0, stratified=True,
             early_stopping_rounds=1, verbose_eval=1, show_stdv=True)
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]
print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))
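Selecting the score by column name is a bit more robust than positional iloc, since the column order of the DataFrame returned by xgb.cv has changed between xgboost versions; this assumes the usual 'test-auc-mean'/'test-auc-std' naming that goes with eval_metric='auc'.
# Version-robust variant of the two lines above (column names assumed, not positions)
cv_mean = res['test-auc-mean'].iloc[-1]
cv_std = res['test-auc-std'].iloc[-1]
print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))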
In [20]:
# Without start station: 0.89008475
# With start station: 0.890845
# Without f1: 0.88678875
# Without f2: 0.88699775
# Without f3: 0.88875625
# Without f4: 0.65140425
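Those numbers came from dropping one feature at a time and re-running the CV cell above by hand; a small leave-one-out loop over the same xgb_params and training data makes the comparison reproducible. A sketch only; exact scores depend on the xgboost version and seed, and it reads the result positionally as in the cell above.
# Hypothetical leave-one-feature-out loop reusing xgb_params, train, features and y from above
for dropped in ['f1', 'f2', 'f3', 'f4', 'start_time']:
    feats = [f for f in features if f != dropped]
    d = xgb.DMatrix(np.array(train[feats]), label=y)
    r = xgb.cv(xgb_params, d, num_boost_round=10, nfold=4, seed=0, stratified=True,
               early_stopping_rounds=1, verbose_eval=False)
    print('without {0}: {1:.6f}'.format(dropped, r.iloc[-1, 0]))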
In [65]:
def callback_test(x):
    # x is the callback environment xgboost passes on every boosting round
    print(x[0].get_score())
    with open(os.path.join(const.DATA_PATH, 'callback_fi_{}.log'.format(x[2])), 'w') as f:
        f.write(str(x[0].get_score()))
    with open(os.path.join(const.DATA_PATH, 'callback__eval_{}.log'.format(x[2])), 'w') as f:
        f.write(str(x[6]))
    print()
    print('Joost')
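In the xgboost releases this callback style belongs to (before 1.3, where callbacks receive a CallbackEnv namedtuple), the positional indices above can also be written with named fields, which reads more clearly; a sketch with the same assumed behaviour:
def callback_test_named(env):
    # env is xgboost.callback.CallbackEnv in pre-1.3 xgboost:
    # env.model, env.iteration and env.evaluation_result_list replace env[0], env[2] and env[6]
    print(env.model.get_score())
    with open(os.path.join(const.DATA_PATH, 'callback_fi_{}.log'.format(env.iteration)), 'w') as f:
        f.write(str(env.model.get_score()))
    with open(os.path.join(const.DATA_PATH, 'callback__eval_{}.log'.format(env.iteration)), 'w') as f:
        f.write(str(env.evaluation_result_list))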
In [66]:
y = train.Response.ravel()
X_train = np.array(train[features])
print('train: {0}'.format(train.shape))
prior = np.sum(y) / (1.*len(y))
xgb_params = {
    'seed': 123,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 2,
    'eval_metric': 'auc',
    'base_score': prior,
    # Not xgboost parameters; only read back below when calling xgb.train
    'num_round': 10,
    'early_stopping': 20
}
dtrain = xgb.DMatrix(X_train, label=y)
watchlist = [(dtrain,'train')]
eval_result = {}
bst = xgb.train(xgb_params,
                dtrain,
                num_boost_round=xgb_params['num_round'],
                evals=watchlist,
                evals_result=eval_result,
                early_stopping_rounds=xgb_params['early_stopping'],
                verbose_eval=1,
                callbacks=[callback_test])
In [32]:
eval_result
Out[32]:
In [ ]: