Base de treino balanceada

Considerei apenas as amostras com Hazard < 26 e mantive os demais parâmetros.

Will train until val error hasn't decreased in 51 rounds.

Stopping. Best iteration:

[1114] train-poisson-nloglik:2.040657 val-poisson-nloglik:2.755183

Kaggle score: 0.366962

Meu melhor Kaggle score: 0.3850


In [93]:
# Setup
import pandas as pd
import numpy as np 
from sklearn import preprocessing
import xgboost as xgb
import random
import matplotlib.pyplot as plt

In [94]:
# Helper definitions (metrics and sampling utilities)

print('Defining functions (shameless stolen from the script pages)')

## Gini: adapted from jpopham91's script
def Gini1(y_true, y_pred):
    """Return the normalized Gini coefficient of ``y_pred`` against ``y_true``.

    The true values are ordered once by themselves and once by the
    predictions (largest first). Each ordering yields a Lorenz curve, and the
    summed distance of each curve from the diagonal gives a Gini value.
    The result is the Gini of the prediction ordering divided by the Gini of
    the perfect ordering.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Observed target values.
    y_pred : array-like, shape (n_samples,)
        Predicted scores; only their ordering matters.

    Returns
    -------
    float
        ``G_pred / G_true``; 1.0 when the predictions rank the samples
        exactly like the true values.

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.
    """
    # Force float arithmetic: with integer inputs (and with Python 2's integer
    # "/") the divisions below would otherwise be floor divisions.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # sort the true values by each column (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    # 1.0 / n_samples: the diagonal must start at 1/n, not at 0
    L_ones = np.linspace(1.0 / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred / G_true

# Source script: justfor, who in turn took it from the source below
# Source of good version: https://www.kaggle.com/c/ClaimPredictionChallenge/forums/t/703/code-to-calculate-normalizedgini
def Gini2_aux(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the un-normalized Gini value of ``pred`` against ``actual``.

    Rows are ordered by descending prediction; ties are broken by original
    position so the result is deterministic. The value is the sum of the
    cumulative share of ``actual`` minus ``(n + 1) / 2``, divided by ``n``.

    Parameters
    ----------
    actual : sequence of numbers
        Observed target values.
    pred : sequence of numbers
        Predicted scores, same length as ``actual``.
    cmpcol, sortcol : int
        Unused; kept only so existing call sites keep working.

    Returns
    -------
    float
        The un-normalized Gini value.

    Raises
    ------
    AssertionError
        If ``actual`` and ``pred`` differ in length.
    """
    assert len(actual) == len(pred)
    n = len(actual)
    # Columns: actual value, prediction, original row position.
    # Built-in ``float`` replaces the ``np.float`` alias, which recent NumPy
    # releases no longer provide.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # Primary key: prediction descending; secondary key: original position.
    table = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses
    gini_sum -= (n + 1) / 2.
    return gini_sum / n
 
def Gini2(y_true, y_pred):
    """Normalized Gini: Gini of the predictions divided by the Gini of a perfect ranking."""
    model_gini = Gini2_aux(y_true, y_pred)
    # Scoring the truth against itself gives the best value attainable.
    perfect_gini = Gini2_aux(y_true, y_true)
    return model_gini / perfect_gini

def ReBalanceTrainData(train_pre, sample_size, hazard_limit=None):
    """Build a class-balanced frame by sampling each Hazard level with replacement.

    For Hazard level 1 and for every level in ``range(2, hazard_limit)``,
    ``sample_size`` row labels are drawn with replacement (global NumPy RNG)
    from the rows having that Hazard value. The sampled rows are stacked in
    level order.

    Parameters
    ----------
    train_pre : pandas.DataFrame
        Frame with a ``Hazard`` column.
    sample_size : int
        Number of rows drawn per Hazard level.
    hazard_limit : int, optional
        Exclusive upper bound of the Hazard levels to sample. When omitted,
        the notebook-level ``hazard_thr`` variable is used, as the original
        implementation did implicitly.

    Returns
    -------
    pandas.DataFrame
        The sampled rows; an empty frame with the same columns when no
        requested level has any rows.
    """
    if hazard_limit is None:
        # Keep the historical behaviour of reading the notebook global.
        hazard_limit = hazard_thr

    # Level 1 is always sampled, exactly as before; the rest follow the limit.
    hazard_levels = [1] + list(range(2, hazard_limit))

    sampled_parts = []
    for h in hazard_levels:
        print("Hazard: " + str(h))
        train_aux = train_pre[train_pre.Hazard == h]
        if len(train_aux) == 0:
            # np.random.choice cannot draw from an empty population.
            continue
        rows_sampled = np.random.choice(train_aux.index, sample_size)
        # .loc is the label-based replacement for the removed .ix indexer.
        sampled_parts.append(train_pre.loc[rows_sampled])

    if not sampled_parts:
        return train_pre.iloc[0:0]
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(sampled_parts)

def SampleData(train_balanced_pre,train_size,val_size,test_size):
    """Split a frame into train, validation and test samples by row label.

    The validation rows are drawn first (without replacement), then the
    training rows from what is left (without replacement). The test rows are
    drawn from the remainder WITH replacement, so ``test_size`` may exceed the
    number of remaining rows and the test set may contain repeated rows.

    The global NumPy RNG is re-seeded (3, 7, 5) before each draw, so the split
    is reproducible and the global random state is changed as a side effect.

    Parameters
    ----------
    train_balanced_pre : pandas.DataFrame
        Source frame; it is copied, not modified. Row labels are used to
        select and drop rows (assumes they are unique -- TODO confirm).
    train_size, val_size, test_size : int
        Number of rows in each returned frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data, test_data)``.
    """
    print('Making a copy')
    train_copy = train_balanced_pre.copy()

    print('Sampling validation set')
    np.random.seed(3)
    val_rows_samples = np.random.choice(train_copy.index, val_size, replace=False)
    # .loc is the label-based replacement for the removed .ix indexer.
    val_data = train_copy.loc[val_rows_samples].copy()

    print('Sampling train set')
    train_test_data = train_copy.drop(val_rows_samples).copy()
    print('train_test_data-size: ' + str(train_test_data.shape))

    np.random.seed(7)
    train_rows_samples = np.random.choice(train_test_data.index, train_size, replace=False)
    train_data = train_test_data.loc[train_rows_samples].copy()

    print('Sampling test set')
    test_data = train_test_data.drop(train_rows_samples).copy()

    np.random.seed(5)
    # Drawn with replacement: the leftover pool can be smaller than test_size.
    test_rows_samples = np.random.choice(test_data.index, test_size, replace=True)
    test_data = test_data.loc[test_rows_samples].copy()

    return train_data, val_data, test_data


Defining functions (shameless stolen from the script pages)

In [183]:
#load train and test 
train  = pd.read_csv('../data/raw/train.csv', index_col=0)
test   = pd.read_csv('../data/raw/test.csv', index_col=0)

print train.shape
## XXX Parece BUG head() nao mostra a ultimas colunas: 10,20,30
## Entao eu tive que duplicar para imprimir todas as colunas
print train.iloc[:,0:10].head()
print train.iloc[:,10:20].head()
print train.iloc[:,20:30].head()
print train.iloc[:,30:33].head()
print "=========================="

labels = train.Hazard
%matplotlib inline
labels.hist(bins=69)


(50999, 33)
    Hazard  T1_V1  T1_V2  T1_V3 T1_V4 T1_V5 T1_V6 T1_V7 T1_V8 T1_V9
Id                                                                 
1        1     15      3      2     N     B     N     B     B     D
2        4     16     14      5     H     B     N     B     B     C
3        1     10     10      5     N     K     N     B     B     E
4        1     18     18      5     N     K     N     B     B     E
5        1     13     19      5     N     H     N     B     B     E
    T1_V10 T1_V11 T1_V12  T1_V13  T1_V14 T1_V15 T1_V16 T1_V17  T2_V1  T2_V2
Id                                                                         
1        7      B      B      15       1      A      B      N     36     11
2       12      B      B      10       3      A      B      Y     78     10
3       12      H      B      15       1      A      R      Y     71     21
4        3      H      B      15       1      A      R      N     71     13
5        7      H      B      10       1      A      J      N     75     10
   T2_V3  T2_V4 T2_V5  T2_V6  T2_V7  T2_V8  T2_V9  T2_V10 T2_V11 T2_V12
Id                                                                     
1      N     10     B      2     37      1     11       6      Y      N
2      Y     17     C      2     22      1     18       5      Y      Y
3      Y     13     C      6     37      2     14       6      Y      Y
4      N     15     A      2     25      1      1       6      Y      N
5      Y     11     B      1     22      1      2       7      N      N
   T2_V13  T2_V14  T2_V15
Id                       
1       E       2       2
2       E       2       1
3       E       6       1
4       C       2       6
5       E       1       1
==========================
Out[183]:
<matplotlib.axes.AxesSubplot at 0x7eff7fb0dbd0>

Preparing training data


In [184]:
## Work on a copy of the data so the raw `train` frame is left untouched
train_pre = train.copy()

# Rows with Hazard >= hazard_thr are filtered out below
hazard_thr = 26
sample_size = 3000 # sample size per class (only used by the disabled ReBalanceTrainData call)

print "Printing intial train dim:"
print train_pre.shape
print "Considering Hazard < " + str(hazard_thr) + " in train data"

# TODO: check whether the model is still able to produce hazards above hazard_thr
train_pre = train_pre[ train_pre['Hazard'] < hazard_thr]

# Re-balancing is currently disabled: the filtered frame is used as-is
#train_balanced_pre = ReBalanceTrainData(train_pre,sample_size)
train_balanced_pre = train_pre

# Split sizes. SampleData draws the test rows with replacement from whatever
# is left after the validation and train draws, so test_size may exceed the
# number of remaining rows.
train_size = 37000
val_size = 13000
test_size = 30000
train_data, val_data, test_data = SampleData(train_balanced_pre,train_size,val_size,test_size)

# Earlier positional split, kept for reference
#offset = 5000
#train_data = train_balanced_pre.iloc[0:offset,:].copy()
#val_data = train_balanced_pre.iloc[offset:50999,:].copy()

#np.random.seed(5)
#test_rows_samples = np.random.choice(val_data.index,1500, replace=True)
#test_data = val_data.ix[test_rows_samples].copy()

# Keep the targets separately and dump each split (Hazard column included) to CSV
val_labels = val_data['Hazard'].copy()
val_data.to_csv('val_data.csv')

test_labels = test_data['Hazard'].copy()
test_data.to_csv('test_data.csv')

train_labels = train_data['Hazard'].copy()
train_data.to_csv('train_data.csv')

print 'train-size: ' + str(train_data.shape)
print 'test-size: ' + str(test_data.shape)
print 'val-size: ' + str(val_data.shape)


Printing intial train dim:
(50999, 33)
Considering Hazard < 26 in train data
Making a copy
Sampling validation set
Sampling train set
train_test_data-size: (37865, 33)
Sampling test set
train-size: (37000, 33)
test-size: (30000, 33)
val-size: (13000, 33)

In [185]:
# For each split: write the Hazard value counts to a CSV file and draw a
# histogram of the labels. Every figure uses the same x-range (0..70) and
# hazard_thr-1 bins.
print 'Summary of random data slected'
print 'train data:'
train_labels.value_counts().to_csv('train_labels_describe.csv')
plt.xlim(0,70);train_labels.hist(bins=(hazard_thr-1))

# NOTE(review): unlike the train stanza, val/test sort the labels with
# Series.order() before counting. order() is a legacy pandas method that
# newer releases no longer provide -- confirm against the installed version.
print 'val data: '
val_labels.order().value_counts().to_csv('val_labels_describe.csv')
plt.figure();plt.xlim(0,70);val_labels.hist(bins=(hazard_thr-1))

print 'test data: '
test_labels.order().value_counts().to_csv('test_labels_describe.csv')
plt.figure();plt.xlim(0,70);test_labels.hist( bins=(hazard_thr-1))


Summary of random data slected
train data:
val data: 
test data: 
Out[185]:
<matplotlib.axes.AxesSubplot at 0x7eff7fa1e250>

In [186]:
submit_pre = test.copy()

print("Droping columns")

# Feature columns removed from every data set
unused_features = ['T2_V10', 'T2_V7', 'T1_V13', 'T1_V10']

# The three labelled splits also lose the target column; the targets were
# already saved in train_labels / val_labels / test_labels.
for frame in (train_data, val_data, test_data):
    frame.drop(['Hazard'] + unused_features, axis=1, inplace=True)

# The submission frame is only stripped of the unused features.
submit_pre.drop(unused_features, axis=1, inplace=True)

# Remember column names and submission ids before the frames become arrays
columns = train_data.columns
submit_ind = submit_pre.index

print("Converting to numpy array")

train_data, train_labels = np.array(train_data), np.array(train_labels)
val_data, val_labels = np.array(val_data), np.array(val_labels)
test_data, test_labels = np.array(test_data), np.array(test_labels)

submit_pre = np.array(submit_pre)


Droping columns
Converting to numpy array

In [187]:
print("Converting string columns to numerical levels (train_balanced_pre)")
# Label-encode every column (not only the string ones) with a per-column
# encoder fitted on the values of all four data sets.

for matrix in (train_data, val_data, test_data, submit_pre):
    print(matrix.shape)

print('Converting train')
encoded_sets = (train_data, val_data, test_data, submit_pre)
for col in range(train_data.shape[1]):
    encoder = preprocessing.LabelEncoder()
    all_values = (list(train_data[:, col]) + list(submit_pre[:, col])
                  + list(test_data[:, col]) + list(val_data[:, col]))
    encoder.fit(all_values)
    # Overwrite the column in place with its integer codes
    for matrix in encoded_sets:
        matrix[:, col] = encoder.transform(matrix[:, col])

# Turn the encoded matrices into float arrays
train_data = train_data.astype(float)
val_data = val_data.astype(float)
test_data = test_data.astype(float)
submit_pre = submit_pre.astype(float)


Converting string columns to numerical levels (train_balanced_pre)
(37000, 28)
(13000, 28)
(30000, 28)
(51000, 28)
Converting train

Setting model parameters


In [201]:
# XGBoost booster parameters, later passed to xgb.train as a list of (key, value) pairs
params = {}
params["objective"] = "count:poisson" # previously tried: "reg:linear"
params["eta"] = 0.0075
params["min_child_weight"] = 5
params["subsample"] = 0.8
params["colsample_bytree"] = 0.8
params["scale_pos_weight"] = 1.0
params["silent"] = 0
params["booster"] = "gbtree"
params["seed"] = 0
params["max_depth"] = 10

# num_rounds: number of boosting rounds handed to xgb.train
# est: value used for early_stopping_rounds
num_rounds = 5000
est = 51

plst = list(params.items())
print "Prameters list"
print plst

print 'num_rounds: ' + str(num_rounds)
print 'early_stopping_rounds: ' + str(est)


Prameters list
[('colsample_bytree', 0.8), ('silent', 0), ('scale_pos_weight', 1.0), ('min_child_weight', 5), ('subsample', 0.8), ('eta', 0.0075), ('objective', 'count:poisson'), ('seed', 0), ('max_depth', 10), ('booster', 'gbtree')]
num_rounds: 5000
early_stopping_rounds: 51

In [202]:
print('Converting train, val, test n sbmit to DMatrix')

# Unlabelled matrices: only used for prediction
xgsubmit = xgb.DMatrix(submit_pre)
xgtest = xgb.DMatrix(test_data)

# Labelled matrices: used for fitting and for the early-stopping watchlist
xgval = xgb.DMatrix(val_data,label=val_labels)
xgtrain = xgb.DMatrix(train_data,label=train_labels)

# Row-count check for every matrix
for caption, dmatrix in [("number of row xgtrain", xgtrain),
                         ("number of row xgval", xgval),
                         ("number of row xgsubmit", xgsubmit),
                         ("number of row xgtest", xgtest)]:
    print(caption)
    print(dmatrix.num_row())


Converting train, val, test n sbmit to DMatrix
number of row xgtrain
37000
number of row xgval
13000
number of row xgsubmit
51000
number of row xgtest
30000

Training XGBoost


In [203]:
# Container handed to xgb.train through evals_result
res = {'train':[], 'val': []}
# Train with early stopping; both the train and the val matrices are evaluated each round

#random.seed(13)
#np.random.seed(13)
watchlist = [(xgtrain, 'train'),(xgval, 'val')]
# NOTE(review): depending on the xgboost version, the booster returned after
# early stopping may hold the trees of the LAST round rather than those of
# model.best_iteration -- confirm before relying on model.predict defaults.
model = xgb.train(plst, xgtrain, num_rounds, evals =watchlist, early_stopping_rounds=est,
                evals_result=res,verbose_eval=False)

print 'model was trained'
print 'best iterations n score'
print 'iteartion: ' + str(model.best_iteration) + "\t score: "+ str(model.best_score)


Will train until val error hasn't decreased in 51 rounds.
Stopping. Best iteration:
[1114]	train-poisson-nloglik:2.040657	val-poisson-nloglik:2.755183

model was trained
best iterations n score
iteartion: 1114	 score: 2.755183

In [204]:
#print res.items()

def _rmse(actual, predicted):
    """Root mean squared error between two arrays of equal length."""
    return np.sqrt( np.mean((actual - predicted)**2) )

print('=== Train n val report ===')
preds1 = model.predict(xgtrain)
preds2 = model.predict(xgval)
r1 = _rmse(train_labels, preds1)
r2 = _rmse(val_labels, preds2)

print("train-rmse:" +  str(r1) + "\t val-rmse:" + str(r2))

print('PS: Gini (Duvidoso)')
train_gini1 = Gini1(train_labels, preds1)
val_gini1 = Gini1(val_labels, preds2)
print("train-gini1: " + str(train_gini1) + "\t val-gini1: " + str(val_gini1))
train_gini2 = Gini2(train_labels, preds1)
val_gini2 = Gini2(val_labels, preds2)
print("train-gini2: " + str(train_gini2) + "\t val-gini2: " + str(val_gini2))

print('=== Test report ===')
print('Computing lables in test data')
# From here on preds2 holds the predictions for the test split
preds2 = model.predict(xgtest)

print('PS: Gini (Duvidoso)')
r1 = _rmse(test_labels, preds2)
g1 = Gini1(test_labels, preds2)
g2 = Gini2(test_labels, preds2)
print("test-rmse: " +  str(r1) + "\t test-GINI1: " + str(g1) + "\t test-GINI2: " + str(g2))

# Table of predicted hazard indexed by the true hazard, for a quick look
preds = pd.DataFrame({"Hazard": test_labels, "Model_Hazard": preds2}).set_index('Hazard')
print(preds.head(7))

#preds.to_csv('test_dev_20150824_1.csv')


=== Train n val report ===
train-rmse:2.29464695088	 val-rmse:3.56810354799
PS: Gini (Duvidoso)
train-gini1: 0.847010640291	 val-gini1: 0.35746605758
train-gini2: 0.847001785421	 val-gini2: 0.357360455399
=== Test report ===
Computing lables in test data
PS: Gini (Duvidoso)
test-rmse: 3.65459928843	 test-GINI1: 0.40695957711	 test-GINI2: 0.406917685392
        Model_Hazard
Hazard              
2           4.112949
7           4.905953
1           3.386273
1           4.464572
2           2.125232
20          5.208757
2           3.459567

In [207]:
print '=== Submit ==='
preds1 = model.predict(xgsubmit)

# generate solution file
preds = pd.DataFrame({"Id": submit_ind, "Hazard": preds1})
preds = preds.set_index('Id')
print preds.head(11)
preds.to_csv('submit_dev_20150824_5.csv')

print 'Current dir:'
%pwd


=== Submit ===
      Hazard
Id          
6   2.874607
7   4.657729
8   6.827285
9   3.541183
10  2.586863
11  3.342521
13  4.644872
14  2.406672
16  2.856246
17  2.944134
18  5.975890
Current dir:
Out[207]:
u'/home/leandroohf/Documents/kaggle/Liberty_Mutual_Group_Property_Inspection_Prediction/dev'

In [206]:
# Plot the feature importance of the trained booster
xgb.plot_importance(model)

## importance: f1,f15,f0,f2,f18,f16,f27,f13
# Feature indices copied by hand into a list, mapped back to column names
col_k = [1,15,0,2,18,16,27,13,3,4,26,9,25,20,11,5,19,8,17,12,14]
print columns[col_k]

## Not important
# NOTE(review): indices 5 and 8 also appear in the list above -- confirm which list they belong to
col_k = [24,21,23,5,8]
print columns[col_k]


Index([u'T1_V2', u'T2_V1', u'T1_V1', u'T1_V3', u'T2_V4', u'T2_V2', u'T2_V15',
       u'T1_V16', u'T1_V4', u'T1_V5', u'T2_V14', u'T1_V11', u'T2_V13',
       u'T2_V6', u'T1_V14', u'T1_V6', u'T2_V5', u'T1_V9', u'T2_V3', u'T1_V15',
       u'T1_V17'],
      dtype='object')
Index([u'T2_V12', u'T2_V8', u'T2_V11', u'T1_V6', u'T1_V9'], dtype='object')