Balanced training set

Kept only rows with Hazard < 70 (in practice this keeps every row)

Used plot_importance to select columns, but the method did not look promising

My best Kaggle score: 0.3850


In [30]:
# Setup
import pandas as pd
import numpy as np 
from sklearn import preprocessing
import xgboost as xgb
import random
import matplotlib.pyplot as plt

In [31]:
# Defining some functions

print 'Defining functions (shamelessly stolen from the script pages)'

## Gini: shamelessly stolen from jpopham91's script
def Gini1(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(1.0 / n_samples, 1, n_samples)  # 1.0/n avoids Python 2 integer division
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return G_pred/G_true

# Source: justfor's script, which in turn took it from the link below
# Source of good version: https://www.kaggle.com/c/ClaimPredictionChallenge/forums/t/703/code-to-calculate-normalizedgini    
def Gini2_aux(actual, pred, cmpcol=0, sortcol=1):
    assert len(actual) == len(pred)
    arr = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    arr = arr[np.lexsort((arr[:, 2], -1 * arr[:, 1]))]
    total_losses = arr[:, 0].sum()
    gini_sum = arr[:, 0].cumsum().sum() / total_losses
    gini_sum -= (len(actual) + 1) / 2.
    return gini_sum / len(actual)

def Gini2(y_true, y_pred):
    return Gini2_aux(y_true, y_pred) / Gini2_aux(y_true, y_true)
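
# Illustrative sanity check (commented out; not part of the recorded run):
# a perfect ranking should score 1.0 under either normalized Gini, which
# gives a quick way to compare the two implementations on toy data.
# _y = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
# print Gini1(_y, _y), Gini2(_y, _y)   # both print 1.0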

def ReBalanceTrainData(train_pre, sample_size):
    # Draw sample_size rows (with replacement) from every Hazard class,
    # so each class 1..hazard_thr-1 ends up equally represented.
    # Note: relies on the global hazard_thr defined in a later cell.
    train_balanced_pre = pd.DataFrame()
    for h in range(1, hazard_thr):
        print "Hazard: " + str(h)
        train_aux = train_pre[train_pre.Hazard == h]
        rows_sampled = np.random.choice(train_aux.index, sample_size)
        train_balanced_pre = train_balanced_pre.append(train_pre.ix[rows_sampled])

    return train_balanced_pre

def SampleData(train_balanced_pre, train_size, val_size, test_size):
    # Split into disjoint validation and train sets; the test set is then
    # drawn (with replacement) from what remains after the train draw.
    print 'Making a copy'
    train_copy = train_balanced_pre.copy()

    print 'Sampling validation set'
    np.random.seed(3)
    val_rows_samples = np.random.choice(train_copy.index, val_size, replace=False)
    val_data = train_copy.ix[val_rows_samples].copy()

    print 'Sampling train set'
    train_test_data = train_copy.drop(val_rows_samples).copy()
    print 'train_test_data-size: ' + str(train_test_data.shape)

    np.random.seed(7)
    train_rows_samples = np.random.choice(train_test_data.index, train_size, replace=False)
    train_data = train_test_data.ix[train_rows_samples].copy()

    print 'Sampling test set'
    test_data = train_test_data.drop(train_rows_samples).copy()

    np.random.seed(5)
    test_rows_samples = np.random.choice(test_data.index, test_size, replace=True)
    test_data = test_data.ix[test_rows_samples].copy()

    return train_data, val_data, test_data
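
# Illustrative check (commented out): train and val are disjoint by
# construction (val rows are dropped before the train draw), while test
# can contain repeated rows because it is drawn with replace=True.
# _tr, _va, _te = SampleData(train_pre, 1000, 2000, 300)
# assert len(set(_tr.index) & set(_va.index)) == 0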


def DropColumns(data, columns_to_drop):
    # Drops the columns in place (inplace=True) and returns the frame
    # for convenience.
    for c in columns_to_drop:
        print 'dropping ' + c
        data.drop(c, axis=1, inplace=True)

    return data


Defining functions (shamelessly stolen from the script pages)

In [75]:
#load train and test 
train  = pd.read_csv('../data/raw/train.csv', index_col=0)
test   = pd.read_csv('../data/raw/test.csv', index_col=0)

print train.shape
## head() seemed to hide columns 10-30 (pandas truncates wide frames for
## display), so the columns are printed in slices of 10 to show them all
print train.iloc[:,0:10].head()
print train.iloc[:,10:20].head()
print train.iloc[:,20:30].head()
print train.iloc[:,30:33].head()
print "=========================="

labels = train.Hazard
%matplotlib inline
labels.hist(bins=69)


(50999, 33)
    Hazard  T1_V1  T1_V2  T1_V3 T1_V4 T1_V5 T1_V6 T1_V7 T1_V8 T1_V9
Id                                                                 
1        1     15      3      2     N     B     N     B     B     D
2        4     16     14      5     H     B     N     B     B     C
3        1     10     10      5     N     K     N     B     B     E
4        1     18     18      5     N     K     N     B     B     E
5        1     13     19      5     N     H     N     B     B     E
    T1_V10 T1_V11 T1_V12  T1_V13  T1_V14 T1_V15 T1_V16 T1_V17  T2_V1  T2_V2
Id                                                                         
1        7      B      B      15       1      A      B      N     36     11
2       12      B      B      10       3      A      B      Y     78     10
3       12      H      B      15       1      A      R      Y     71     21
4        3      H      B      15       1      A      R      N     71     13
5        7      H      B      10       1      A      J      N     75     10
   T2_V3  T2_V4 T2_V5  T2_V6  T2_V7  T2_V8  T2_V9  T2_V10 T2_V11 T2_V12
Id                                                                     
1      N     10     B      2     37      1     11       6      Y      N
2      Y     17     C      2     22      1     18       5      Y      Y
3      Y     13     C      6     37      2     14       6      Y      Y
4      N     15     A      2     25      1      1       6      Y      N
5      Y     11     B      1     22      1      2       7      N      N
   T2_V13  T2_V14  T2_V15
Id                       
1       E       2       2
2       E       2       1
3       E       6       1
4       C       2       6
5       E       1       1
==========================
Out[75]:
[histogram of Hazard labels, 69 bins]

Preparing training data


In [76]:
## Make a copy of the data
train_pre = train.copy()

hazard_thr = 70
sample_size = 3000 # sample size per class

print "Printing initial train dim:"
print train_pre.shape

print "Considering Hazard < " + str(hazard_thr) + " in train data"

# TODO: assess how well the model can predict Hazard values above hazard_thr
train_pre = train_pre[ train_pre['Hazard'] < hazard_thr]

print 'train_pre-size: ' + str(train_pre.shape)


Printing initial train dim:
(50999, 33)
Considering Hazard < 70 in train data
train_pre-size: (50999, 33)
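
Since the filter removed no rows, the maximum Hazard in the data must already be below the threshold; a quick one-line check (a sketch, not part of the recorded run):

print train.Hazard.max()  # expected < 70, since all 50999 rows survived the filter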

In [77]:
#train_balanced_pre = ReBalanceTrainData(train_pre,sample_size)
train_balanced_pre = train_pre

train_size = 10000
val_size = 40000
test_size = 1200
train_data, val_data, test_data = SampleData(train_balanced_pre,train_size,val_size,test_size)

#offset = 5000
#train_data = train_balanced_pre.iloc[0:offset,:].copy()
#val_data = train_balanced_pre.iloc[offset:50999,:].copy()

#np.random.seed(5)
#test_rows_samples = np.random.choice(val_data.index,1500, replace=True)
#test_data = val_data.ix[test_rows_samples].copy()

val_labels = val_data['Hazard'].copy()
val_data.to_csv('val_data.csv')

test_labels = test_data['Hazard'].copy()
test_data.to_csv('test_data.csv')

#train_data = ReBalanceTrainData(train_data,11500)

train_labels = train_data['Hazard'].copy()
train_data.to_csv('train_data.csv')

print 'train-size: ' + str(train_data.shape)
print 'test-size: ' + str(test_data.shape)
print 'val-size: ' + str(val_data.shape)


Making a copy
Sampling validation set
Sampling train set
train_test_data-size: (10999, 33)
Sampling test set
train-size: (10000, 33)
test-size: (1200, 33)
val-size: (40000, 33)

In [78]:
print 'Summary of random data selected'
print 'train data:'
train_labels.value_counts().to_csv('train_labels_describe.csv')
plt.xlim(0,70);train_labels.hist(bins=(hazard_thr-1))

print 'val data: '
val_labels.value_counts().to_csv('val_labels_describe.csv')
plt.figure();plt.xlim(0,70);val_labels.hist(bins=(hazard_thr-1))

print 'test data: '
test_labels.value_counts().to_csv('test_labels_describe.csv')
plt.figure();plt.xlim(0,70);test_labels.hist(bins=(hazard_thr-1))


Summary of random data selected
train data:
val data: 
test data: 
Out[78]:
[histograms of the train, val, and test Hazard distributions]

In [79]:
submit_pre = test.copy()

print "Dropping columns"


columns_to_drop = [ 'Hazard', 'T2_V8', 'T2_V12','T1_V17', 'T1_V7', 'T2_V10', 'T2_V3',
                    'T2_V11', 'T1_V8', 'T1_V9', 'T1_V12', 'T1_V15','T1_V6', 'T2_V5', 
                    'T1_V14', 'T2_V6', 'T1_V11', 'T1_V5', 'T2_V13','T1_V4', 'T1_V13', 
                    'T2_V14', 'T1_V16', 'T1_V10', 'T2_V7','T1_V3', 'T1_V1']


#columns_to_drop = [ 'Hazard', 'T2_V10', 'T2_V7','T2_V12','T2_V8','T1_V13', 'T1_V10',
#			      'T1_V7', 'T2_V11', 'T1_V17', 'T1_V12', 'T1_V8', 'T2_V3', 'T2_V1',
#			      'T1_V15', 'T2_V5','T1_V6', 'T1_V9', 'T2_V6', 'T1_V14']

train_data = DropColumns(train_data,columns_to_drop)
val_data = DropColumns(val_data,columns_to_drop)
test_data = DropColumns(test_data,columns_to_drop)

columns_to_drop = [ 'T2_V8', 'T2_V12','T1_V17', 'T1_V7', 'T2_V10', 'T2_V3',
                    'T2_V11', 'T1_V8', 'T1_V9', 'T1_V12', 'T1_V15','T1_V6', 'T2_V5', 
                    'T1_V14', 'T2_V6', 'T1_V11', 'T1_V5', 'T2_V13','T1_V4', 'T1_V13', 
                    'T2_V14', 'T1_V16', 'T1_V10', 'T2_V7','T1_V3', 'T1_V1']

#columns_to_drop = [ 'T2_V10', 'T2_V7','T2_V12','T2_V8','T1_V13', 'T1_V10',
#			      'T1_V7', 'T2_V11', 'T1_V17', 'T1_V12', 'T1_V8', 'T2_V3', 'T2_V1',
#			      'T1_V15', 'T2_V5','T1_V6', 'T1_V9', 'T2_V6', 'T1_V14']

submit_pre = DropColumns(submit_pre,columns_to_drop)

columns = train_data.columns
submit_ind = submit_pre.index

print "Converting to numpy array"

train_data = np.array(train_data)
train_labels = np.array(train_labels)

val_data = np.array(val_data)
val_labels = np.array(val_labels)

test_data = np.array(test_data)
test_labels = np.array(test_labels)

submit_pre = np.array(submit_pre)


Dropping columns
dropping Hazard
dropping T2_V8
dropping T2_V12
... (the same list of 26 feature columns is dropped, repeated for
train_data, val_data, test_data, and submit_pre; full log elided)
Converting to numpy array
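
Side note: the second hand-typed drop list is just the first one without 'Hazard', so it could be derived instead of retyped (a sketch, assuming columns_to_drop still holds the original 27-entry list; the variable name is illustrative):

submit_columns_to_drop = [c for c in columns_to_drop if c != 'Hazard']  # the same 26 feature columns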

In [80]:
print "Converting string columns to numerical levels (train_balanced_pre)"
# label encode the categorical variables

print train_data.shape
print val_data.shape
print test_data.shape
print submit_pre.shape

print 'Converting all data splits'
for i in range(train_data.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_data[:,i]) + list(submit_pre[:,i]) + list(test_data[:,i]) + list(val_data[:,i]))
    train_data[:,i] = lbl.transform(train_data[:,i])
    val_data[:,i] = lbl.transform(val_data[:,i])
    test_data[:,i] = lbl.transform(test_data[:,i])
    submit_pre[:,i] = lbl.transform(submit_pre[:,i])
    
train_data = train_data.astype(float)
val_data = val_data.astype(float)
test_data = test_data.astype(float)
submit_pre = submit_pre.astype(float)


Converting string columns to numerical levels (train_balanced_pre)
(10000, 6)
(40000, 6)
(1200, 6)
(51000, 6)
Converting all data splits
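
For reference, a minimal sketch of what LabelEncoder does to a single column (the values here are made up):

lbl = preprocessing.LabelEncoder()
lbl.fit(['B', 'N', 'H', 'B'])
print lbl.classes_               # ['B' 'H' 'N']: sorted unique values
print lbl.transform(['N', 'B'])  # [2 0]: integer codes into classes_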

Setting model parameters


In [81]:
params = {}
params["objective"] = "reg:linear" #"count:poisson"      
params["eta"] = 0.01
params["min_child_weight"] = 5
params["subsample"] = 0.75
params["colsample_bytree"] = 0.70
params["scale_pos_weight"] = 1.0
params["silent"] = 0
params["booster"] = "gbtree"
params["seed"] = 0
params["max_depth"] = 9

num_rounds = 5000
est = 51

plst = list(params.items())
print "Parameters list"
print plst

print 'num_rounds: ' + str(num_rounds)
print 'early_stopping_rounds: ' + str(est)


Parameters list
[('colsample_bytree', 0.7), ('silent', 0), ('scale_pos_weight', 1.0), ('min_child_weight', 5), ('subsample', 0.75), ('eta', 0.01), ('objective', 'reg:linear'), ('seed', 0), ('max_depth', 9), ('booster', 'gbtree')]
num_rounds: 5000
early_stopping_rounds: 51

In [82]:
print 'Converting train, val, test and submit to DMatrix'

xgsubmit = xgb.DMatrix(submit_pre)
xgtest = xgb.DMatrix(test_data)

xgval = xgb.DMatrix(val_data,label=val_labels)
xgtrain = xgb.DMatrix(train_data,label=train_labels)

print "number of rows in xgtrain"
print xgtrain.num_row()

print "number of rows in xgval"
print xgval.num_row()

print "number of rows in xgsubmit"
print xgsubmit.num_row()

print "number of rows in xgtest"
print xgtest.num_row()


Converting train, val, test and submit to DMatrix
number of rows in xgtrain
10000
number of rows in xgval
40000
number of rows in xgsubmit
51000
number of rows in xgtest
1200
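
Before committing to a long training run, a cross-validation pass can sanity-check the parameters (a sketch; xgb.cv is part of the xgboost Python package, though its exact return type varies between versions):

cv_res = xgb.cv(params, xgtrain, num_boost_round=200, nfold=3, seed=0)
print cv_res  # per-round train/test rmse averaged over the 3 folds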

Training XGBoost


In [83]:
res = {'train':[], 'val': []}
#train using early stopping and predict

#random.seed(13)
#np.random.seed(13)
watchlist = [(xgtrain, 'train'),(xgval, 'val')]
model = xgb.train(plst, xgtrain, num_rounds, evals=watchlist, early_stopping_rounds=est,
                  evals_result=res, verbose_eval=False)

print 'model was trained'
print 'best iteration and score'
print 'iteration: ' + str(model.best_iteration) + "\t score: " + str(model.best_score)


Will train until val error hasn't decreased in 51 rounds.
Stopping. Best iteration:
[297]	train-rmse:3.275064	val-rmse:4.003722

model was trained
best iteration and score
iteration: 297	 score: 4.003722
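
Note that predict() uses every tree built, including the 51 rounds after the best iteration; older xgboost builds accept an ntree_limit argument to truncate (a sketch, assuming this build supports it):

preds_val = model.predict(xgval, ntree_limit=model.best_iteration)
print np.sqrt(np.mean((val_labels - preds_val) ** 2))  # val rmse at the best iteration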

In [84]:
#print res.items()

print '=== Train and val report ==='
preds1 = model.predict(xgtrain)
r1 = np.sqrt( np.mean((train_labels - preds1)**2) )

preds2 = model.predict(xgval)
r2 = np.sqrt( np.mean((val_labels - preds2)**2) )

print "train-rmse:" +  str(r1) + "\t val-rmse:" + str(r2)

print 'PS: Gini (questionable)'
print "train-gini1: " + str(Gini1(train_labels,preds1)) + "\t val-gini1: " + str(Gini1(val_labels,preds2)) 
print "train-gini2: " + str(Gini2(train_labels,preds1)) + "\t val-gini2: " + str(Gini2(val_labels,preds2))

print '=== Test report ==='
print 'Computing labels on test data'
preds2 = model.predict(xgtest)

print 'PS: Gini (questionable)'
r1 = np.sqrt( np.mean((test_labels - preds2)**2) )
g1 = Gini1(test_labels,preds2)
g2 = Gini2(test_labels,preds2)
print "test-rmse: " +  str(r1) + "\t test-GINI1: " + str(g1) + "\t test-GINI2: " + str(g2)

# build a comparison table: true Hazard vs model prediction
preds = pd.DataFrame({"Hazard": test_labels, "Model_Hazard": preds2})
preds = preds.set_index('Hazard')
print preds.head(7)

#preds.to_csv('test_dev_20150824_1.csv')


=== Train and val report ===
train-rmse:3.20626621974	 val-rmse:4.00616745598
PS: Gini (questionable)
train-gini1: 0.728189229977	 val-gini1: 0.175644995474
train-gini2: 0.728131700095	 val-gini2: 0.175601594883
=== Test report ===
Computing labels on test data
PS: Gini (questionable)
test-rmse: 4.18655457968	 test-GINI1: 0.145267197887	 test-GINI2: 0.143771258763
        Model_Hazard
Hazard              
13          3.051234
10          5.915341
9           3.989376
1           4.015671
12          5.485186
23          4.709243
1           6.152878

In [112]:
# Baseline: predict a constant 1.0 for every validation row
preds3 = np.ones(val_size) * 1.0

print preds3.size

r2 = np.sqrt( np.mean((val_labels - preds3)**2) )

print "val-rmse:" + str(r2)


3100
val-rmse:0.485599064675
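
A slightly stronger reference point than a constant 1.0 is the mean training label (a sketch):

mean_pred = np.ones(val_labels.size) * train_labels.mean()
print "val-rmse (mean baseline): " + str(np.sqrt(np.mean((val_labels - mean_pred) ** 2)))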

In [15]:
print '=== Submit ==='
preds1 = model.predict(xgsubmit)

# generate solution file
preds = pd.DataFrame({"Id": submit_ind, "Hazard": preds1})
preds = preds.set_index('Id')
print preds.head(11)
preds.to_csv('submit_dev_20150825_2.csv')

print 'Current dir:'
%pwd


=== Submit ===
      Hazard
Id          
6   2.542489
7   2.426166
8   2.737331
9   2.220707
10  1.888816
11  2.470783
13  2.460974
14  1.926570
16  2.306811
17  2.022678
18  2.306679
Current dir:
Out[15]:
u'/home/leandroohf/Documents/kaggle/Liberty_Mutual_Group_Property_Inspection_Prediction/dev'

In [74]:
xgb.plot_importance(model)

## importance: f1,f15,f0,f2,f18,f16,f27,f13
#col_k = [1,15,0,2,18,16,27,13,3,4,26,9,25,20,11,5,19,8,17,12,14]
#print columns[col_k]

## Not important
col_k = [5,14,9,15,7,4,18,3,8,19,10,6,16,2,0]
print columns[col_k]


Index([u'T1_V6', u'T2_V5', u'T1_V14', u'T2_V6', u'T1_V11', u'T1_V5', u'T2_V13',
       u'T1_V4', u'T1_V13', u'T2_V14', u'T1_V16', u'T1_V10', u'T2_V7',
       u'T1_V3', u'T1_V1'],
      dtype='object')
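
The scores behind plot_importance can also be read programmatically: Booster.get_fscore() returns a dict that maps feature names such as 'f0' to the number of splits using that feature (a sketch):

scores = model.get_fscore()
for feat, cnt in sorted(scores.items(), key=lambda kv: -kv[1]):
    print feat, cnt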