In [93]:
# Setup
import pandas as pd
import numpy as np
from sklearn import preprocessing
import xgboost as xgb
import random
import matplotlib.pyplot as plt
In [94]:
# Helper definitions for this notebook: Gini metrics and sampling utilities
print('Defining functions (shameless stolen from the script pages)')
## Gini: shamelessly borrowed from jpopham91's script
def Gini1(y_true, y_pred):
    """Normalized Gini coefficient of ``y_pred`` with respect to ``y_true``.

    The true values are ordered once by themselves and once by the
    predictions (both largest first); the area between each resulting
    Lorenz curve and the diagonal is computed, and the prediction area is
    divided by the true area.

    Parameters
    ----------
    y_true, y_pred : 1-D array-like of equal shape
        Observed targets and model scores. Lists, pandas Series and numpy
        arrays are all accepted.

    Returns
    -------
    float
        ``G_pred / G_true``. The result is nan/inf when ``y_true`` sums to
        zero or has a zero true-Gini area (e.g. all values equal).
    """
    # Coerce to float arrays: accepts plain lists and guarantees true
    # division below regardless of the input dtype or Python version.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # sort rows on each column (from largest to smallest), keeping the
    # true values in both orderings
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    # 1.0 (not 1): under Python 2 ``1/n_samples`` is integer division and
    # would make the diagonal start at 0 instead of 1/n.
    L_ones = np.linspace(1.0 / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred / G_true
# Source script: justfor, who in turn took it from the source below
# Source of the good version: https://www.kaggle.com/c/ClaimPredictionChallenge/forums/t/703/code-to-calculate-normalizedgini
def Gini2_aux(actual, pred, cmpcol = 0, sortcol = 1):
    """Un-normalized Gini of ``actual`` when ordered by ``pred`` (descending).

    Parameters
    ----------
    actual, pred : 1-D sequences of equal length
        Observed targets and model scores.
    cmpcol, sortcol : int
        Kept for interface compatibility; this implementation does not
        read them.

    Returns
    -------
    float
        The Gini value; nan/inf when ``actual`` sums to zero.
    """
    assert len(actual) == len(pred)
    n_rows = len(actual)
    # Columns: actual, pred, original position (used as tie-breaker).
    # Builtin ``float`` replaces the removed ``np.float`` alias.
    table = np.asarray(np.c_[actual, pred, np.arange(n_rows)], dtype=float)
    # Order by prediction descending, ties resolved by original position.
    table = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    totalLosses = table[:, 0].sum()
    giniSum = table[:, 0].cumsum().sum() / totalLosses
    giniSum -= (n_rows + 1) / 2.
    return giniSum / n_rows
def Gini2(y_true, y_pred):
    """Normalized Gini: the Gini obtained when ordering by ``y_pred``
    divided by the Gini obtained when ordering by ``y_true`` itself."""
    model_gini = Gini2_aux(y_true, y_pred)
    perfect_gini = Gini2_aux(y_true, y_true)
    return model_gini / perfect_gini
def ReBalanceTrainData(train_pre, sample_size, hazard_limit=None):
    """Resample ``train_pre`` so every Hazard level has ``sample_size`` rows.

    For each Hazard value ``h`` in ``1 .. hazard_limit - 1`` (Hazard 1 is
    always included), ``sample_size`` rows are drawn with
    ``np.random.choice`` (default settings, i.e. with replacement) from the
    rows where ``Hazard == h``, and the draws are concatenated in
    ascending Hazard order.

    Parameters
    ----------
    train_pre : pandas.DataFrame
        Frame with a ``Hazard`` column.
    sample_size : int
        Number of rows to draw per Hazard level.
    hazard_limit : int, optional
        Exclusive upper bound on the Hazard levels. When omitted, the
        notebook-level ``hazard_thr`` variable is used, as before.

    Returns
    -------
    pandas.DataFrame
        The resampled rows. Because rows can be drawn more than once, the
        returned index may contain duplicate labels.

    Raises
    ------
    ValueError
        If a Hazard level in the range has no rows to sample from.
    """
    if hazard_limit is None:
        # Backward compatible: fall back to the threshold defined in the notebook.
        hazard_limit = hazard_thr

    sampled_parts = []
    # max(..., 2) keeps Hazard 1 in the output even for a limit <= 1.
    for h in range(1, max(hazard_limit, 2)):
        print("Hazard: " + str(h))
        train_aux = train_pre[train_pre.Hazard == h]
        if len(train_aux) == 0:
            raise ValueError("no rows with Hazard == " + str(h) + " to sample from")
        # Sample row positions (not labels) so duplicate index labels in the
        # input cannot inflate the number of rows selected.
        positions = np.random.choice(len(train_aux), sample_size)
        sampled_parts.append(train_aux.iloc[positions])
    # Single concat instead of growing the frame with append() on every pass.
    return pd.concat(sampled_parts)
def SampleData(train_balanced_pre, train_size, val_size, test_size):
    """Split a frame into train, validation and test samples.

    The validation rows are drawn first (without replacement), the train
    rows are then drawn from what is left (without replacement), and the
    test rows are drawn WITH replacement from the remainder, so the test
    set may contain repeated rows.

    Rows are selected by position, so the requested sizes are honoured
    even when the input index contains duplicate labels.

    Note: this function reseeds the global numpy RNG (seeds 3, 7 and 5),
    which makes the split reproducible but also affects later
    ``np.random`` calls.

    Parameters
    ----------
    train_balanced_pre : pandas.DataFrame
        Source rows; not modified (a copy is used).
    train_size, val_size, test_size : int
        Number of rows for each split.

    Returns
    -------
    (train_data, val_data, test_data) : tuple of pandas.DataFrame
    """
    print('Making a copy')
    train_copy = train_balanced_pre.copy()

    print('Sampling validation set')
    np.random.seed(3)
    val_pos = np.random.choice(len(train_copy), val_size, replace=False)
    val_data = train_copy.iloc[val_pos].copy()

    print('Sampling train set')
    # Everything not picked for validation, in original row order.
    keep = np.ones(len(train_copy), dtype=bool)
    keep[val_pos] = False
    train_test_data = train_copy.iloc[np.flatnonzero(keep)].copy()
    print('train_test_data-size: ' + str(train_test_data.shape))
    np.random.seed(7)
    train_pos = np.random.choice(len(train_test_data), train_size, replace=False)
    train_data = train_test_data.iloc[train_pos].copy()

    print('Sampling test set')
    # Everything picked for neither validation nor train.
    keep = np.ones(len(train_test_data), dtype=bool)
    keep[train_pos] = False
    test_pool = train_test_data.iloc[np.flatnonzero(keep)]
    np.random.seed(5)
    test_pos = np.random.choice(len(test_pool), test_size, replace=True)
    test_data = test_pool.iloc[test_pos].copy()

    return train_data, val_data, test_data
In [183]:
#load train and test
train = pd.read_csv('../data/raw/train.csv', index_col=0)
test = pd.read_csv('../data/raw/test.csv', index_col=0)
print train.shape
## XXX Parece BUG head() nao mostra a ultimas colunas: 10,20,30
## Entao eu tive que duplicar para imprimir todas as colunas
print train.iloc[:,0:10].head()
print train.iloc[:,10:20].head()
print train.iloc[:,20:30].head()
print train.iloc[:,30:33].head()
print "=========================="
labels = train.Hazard
%matplotlib inline
labels.hist(bins=69)
Out[183]:
In [184]:
## Work on a copy of the data so the raw `train` frame stays untouched
train_pre = train.copy()

hazard_thr = 26
sample_size = 3000 # sample size per class

print("Printing intial train dim:")
print(train_pre.shape)
print("Considering Hazard < " + str(hazard_thr) + " in train data")
# TODO: verify the ability to generate hazard above hazard_thr
below_thr = train_pre['Hazard'] < hazard_thr
train_pre = train_pre[below_thr]

# Class re-balancing is currently disabled; the filtered frame is used as is.
#train_balanced_pre = ReBalanceTrainData(train_pre,sample_size)
train_balanced_pre = train_pre

train_size, val_size, test_size = 37000, 13000, 30000
train_data, val_data, test_data = SampleData(
    train_balanced_pre, train_size, val_size, test_size)

# Alternative (disabled) split by fixed offset:
#offset = 5000
#train_data = train_balanced_pre.iloc[0:offset,:].copy()
#val_data = train_balanced_pre.iloc[offset:50999,:].copy()
#np.random.seed(5)
#test_rows_samples = np.random.choice(val_data.index,1500, replace=True)
#test_data = val_data.ix[test_rows_samples].copy()

# Persist each split (the Hazard column is still included at this point)
val_data.to_csv('val_data.csv')
test_data.to_csv('test_data.csv')
train_data.to_csv('train_data.csv')

# Keep the targets separately from the feature frames
val_labels = val_data.Hazard.copy()
test_labels = test_data.Hazard.copy()
train_labels = train_data.Hazard.copy()

for split_name, split_frame in (('train', train_data),
                                ('test', test_data),
                                ('val', val_data)):
    print(split_name + '-size: ' + str(split_frame.shape))
In [185]:
# For each split: write the Hazard value counts to CSV and plot a histogram.
# The former `.order()` pre-sort was dropped: that method no longer exists in
# pandas, and the train line never used it. The counts are unchanged; only
# the order of tied rows in the CSV may differ.
print('Summary of random data slected')
n_bins = hazard_thr - 1

print('train data:')
train_labels.value_counts().to_csv('train_labels_describe.csv')
plt.xlim(0, 70); train_labels.hist(bins=n_bins)

print('val data: ')
val_labels.value_counts().to_csv('val_labels_describe.csv')
plt.figure(); plt.xlim(0, 70); val_labels.hist(bins=n_bins)

print('test data: ')
test_labels.value_counts().to_csv('test_labels_describe.csv')
plt.figure(); plt.xlim(0, 70); test_labels.hist(bins=n_bins)
Out[185]:
In [186]:
submit_pre = test.copy()

print("Droping columns")
# Features removed from every data set before modelling. Listing them once
# replaces the copy-pasted per-column drops and keeps the splits in sync.
dropped_features = ['T2_V10', 'T2_V7', 'T1_V13', 'T1_V10']

# The target is removed from the three labelled splits as well; the labels
# were already saved separately. Non-mutating drops: new frames are bound to
# the same names instead of editing the frames in place.
train_data = train_data.drop(['Hazard'] + dropped_features, axis=1)
val_data = val_data.drop(['Hazard'] + dropped_features, axis=1)
test_data = test_data.drop(['Hazard'] + dropped_features, axis=1)
submit_pre = submit_pre.drop(dropped_features, axis=1)

# Remember feature names and submission ids before the frames become arrays
columns = train_data.columns
submit_ind = submit_pre.index

print("Converting to numpy array")
train_data = np.array(train_data)
train_labels = np.array(train_labels)
val_data = np.array(val_data)
val_labels = np.array(val_labels)
test_data = np.array(test_data)
test_labels = np.array(test_labels)
submit_pre = np.array(submit_pre)
In [187]:
print("Converting string columns to numerical levels (train_balanced_pre)")
# Label-encode every column. Each encoder is fitted on the values of that
# column from all four arrays so that every array can be transformed.
for matrix in (train_data, val_data, test_data, submit_pre):
    print(matrix.shape)

print('Converting train')
n_features = train_data.shape[1]
for col in range(n_features):
    encoder = preprocessing.LabelEncoder()
    all_values = (list(train_data[:, col]) + list(submit_pre[:, col])
                  + list(test_data[:, col]) + list(val_data[:, col]))
    encoder.fit(all_values)
    train_data[:, col] = encoder.transform(train_data[:, col])
    val_data[:, col] = encoder.transform(val_data[:, col])
    test_data[:, col] = encoder.transform(test_data[:, col])
    submit_pre[:, col] = encoder.transform(submit_pre[:, col])

# Once every column holds encoded levels, the arrays can be cast to float
train_data = train_data.astype(float)
val_data = val_data.astype(float)
test_data = test_data.astype(float)
submit_pre = submit_pre.astype(float)
In [201]:
# Booster configuration. The entries are inserted one by one, in this order,
# into an initially empty dict.
params = {}
for param_name, param_value in (
        ("objective", "count:poisson"),  # alternative: "reg:linear"
        ("eta", 0.0075),
        ("min_child_weight", 5),
        ("subsample", 0.8),
        ("colsample_bytree", 0.8),
        ("scale_pos_weight", 1.0),
        ("silent", 0),
        ("booster", "gbtree"),
        ("seed", 0),
        ("max_depth", 10)):
    params[param_name] = param_value

# Training length and early-stopping patience (passed to xgb.train later)
num_rounds = 5000
est = 51

plst = list(params.items())
print("Prameters list")
print(plst)
print('num_rounds: ' + str(num_rounds))
print('early_stopping_rounds: ' + str(est))
In [202]:
print('Converting train, val, test n sbmit to DMatrix')
# Only the train and validation matrices carry labels
xgsubmit = xgb.DMatrix(submit_pre)
xgtest = xgb.DMatrix(test_data)
xgval = xgb.DMatrix(val_data, label=val_labels)
xgtrain = xgb.DMatrix(train_data, label=train_labels)

# Report the row count of each matrix
for matrix_name, matrix in (("xgtrain", xgtrain),
                            ("xgval", xgval),
                            ("xgsubmit", xgsubmit),
                            ("xgtest", xgtest)):
    print("number of row " + matrix_name)
    print(matrix.num_row())
In [203]:
# Container handed to xgb.train as `evals_result`
res = {'train': [], 'val': []}

# Train with early stopping, watching both the train and validation sets.
# NOTE(review): presumably early stopping is driven by the last entry of
# `evals` (the validation set) - confirm against the installed xgboost.
# Optional reseeding, currently disabled:
#random.seed(13)
#np.random.seed(13)
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(
    plst,
    xgtrain,
    num_rounds,
    evals=watchlist,
    early_stopping_rounds=est,
    evals_result=res,
    verbose_eval=False
)

print('model was trained')
print('best iterations n score')
best_summary = ('iteartion: ' + str(model.best_iteration)
                + "\t score: " + str(model.best_score))
print(best_summary)
In [204]:
#print res.items()
# --- Train / validation report: RMSE and both Gini implementations ---
print('=== Train n val report ===')
preds1 = model.predict(xgtrain)
preds2 = model.predict(xgval)
r1 = np.sqrt(np.mean((train_labels - preds1) ** 2))
r2 = np.sqrt(np.mean((val_labels - preds2) ** 2))
print("train-rmse:" + str(r1) + "\t val-rmse:" + str(r2))

print('PS: Gini (Duvidoso)')
gini1_line = ("train-gini1: " + str(Gini1(train_labels, preds1))
              + "\t val-gini1: " + str(Gini1(val_labels, preds2)))
print(gini1_line)
gini2_line = ("train-gini2: " + str(Gini2(train_labels, preds1))
              + "\t val-gini2: " + str(Gini2(val_labels, preds2)))
print(gini2_line)

# --- Test report: note that preds2 and r1 are reused for the test split ---
print('=== Test report ===')
print('Computing lables in test data')
preds2 = model.predict(xgtest)
print('PS: Gini (Duvidoso)')
r1 = np.sqrt(np.mean((test_labels - preds2) ** 2))
g1 = Gini1(test_labels, preds2)
g2 = Gini2(test_labels, preds2)
print("test-rmse: " + str(r1) + "\t test-GINI1: " + str(g1) + "\t test-GINI2: " + str(g2))

# Side-by-side view of true vs. predicted Hazard on the test split
preds = pd.DataFrame({"Hazard": test_labels, "Model_Hazard": preds2}).set_index('Hazard')
print(preds.head(7))
#preds.to_csv('test_dev_20150824_1.csv')
In [207]:
print '=== Submit ==='
preds1 = model.predict(xgsubmit)
# generate solution file
preds = pd.DataFrame({"Id": submit_ind, "Hazard": preds1})
preds = preds.set_index('Id')
print preds.head(11)
preds.to_csv('submit_dev_20150824_5.csv')
print 'Current dir:'
%pwd
Out[207]:
In [206]:
xgb.plot_importance(model)
# Print the feature names behind two hand-picked sets of column positions.
for col_k in (
        ## importance: f1,f15,f0,f2,f18,f16,f27,f13
        [1,15,0,2,18,16,27,13,3,4,26,9,25,20,11,5,19,8,17,12,14],
        ## Not important
        [24,21,23,5,8]):
    print(columns[col_k])