First, pimp your notebook!


In [62]:
from IPython.core.display import HTML
import os
def css_styling():
    """Load the custom.css file from the notebook's files/ directory."""
    base = os.getcwd()
    with open(os.path.join(base, 'files/custom.css'), 'r') as f:
        styles = "<style>\n%s\n</style>" % f.read()
    return HTML(styles)
css_styling()


Out[62]:

Gradient Boosting Classifier (GBC)


In [63]:
from IPython.core.display import Image
Image('https://kaggle2.blob.core.windows.net/competitions/kaggle/3887/media/ATLASEXP_image.png')


Out[63]:

Getting started

Hello world


In [64]:
import numpy as np
import scipy as sc
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older scikit-learn
import os
import pickle
import sys

Training data!


In [78]:
df=pd.read_csv('training.csv')

In [79]:
df.head(1)


Out[79]:
EventId DER_mass_MMC DER_mass_transverse_met_lep DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot ... PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta PRI_jet_subleading_phi PRI_jet_all_pt Weight Label
0 100000 138.47 51.655 97.827 27.98 0.91 124.711 2.666 3.064 41.928 ... 2 67.435 2.15 0.444 46.062 1.24 -2.475 113.497 0.002653 s

1 rows × 33 columns


In [80]:
# encode the label: signal 's' -> 1, background 'b' -> 0
bueno = df['Label'].replace(to_replace=['s', 'b'], value=[1, 0])
# drop the columns that are not features
df.drop('EventId', axis=1, inplace=True)
df.drop('Label', axis=1, inplace=True)

Time to start the ML!


In [81]:
X = df.values
Y = bueno.values
# split into train/validation sets; the size ratio printed below (~0.111)
# implies train_size=0.1 was used here
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.1)

In [82]:
w_train = X_train[:, -1]   # the last feature column is the event weight
X_train = X_train[:, :-1]
w_test = X_test[:, -1]     # keep the validation weights too (used below)
X_test = X_test[:, :-1]

In [83]:
clf = GBC(
          n_estimators=50,      # number of boosting stages
          max_depth=5,          # depth of each individual tree
          min_samples_leaf=200, # minimum number of samples per leaf
          max_features=10,      # features considered at each split
          verbose=1)
clf.fit(X_train,Y_train)


      Iter       Train Loss   Remaining Time 
         1           1.2146            6.01s
         2           1.1570            4.71s
         3           1.1058            4.31s
         4           1.0662            3.95s
         5           1.0331            3.77s
         6           1.0009            3.61s
         7           0.9726            3.48s
         8           0.9471            3.39s
         9           0.9263            3.28s
        10           0.9088            3.16s
        20           0.8050            2.23s
        30           0.7579            1.48s
        40           0.7327            0.72s
        50           0.7156            0.00s
Out[83]:
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=5, max_features=10, max_leaf_nodes=None,
              min_samples_leaf=200, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              random_state=None, subsample=1.0, verbose=1,
              warm_start=False)

Activity:

Experiment with the following parameters (a sketch of one way to sweep them follows this list):

  • max_features
  • max_depth
  • min_samples_leaf
  • n_estimators
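
A hedged sketch of such a sweep, assuming the X_train/Y_train arrays from above are in scope and that the parameter values listed are only illustrative. Note that GridSearchCV scores with plain accuracy by default, not the weighted AMS metric used later, so treat the result as a starting point:


In [ ]:
from sklearn.model_selection import GridSearchCV

# candidate values for each parameter in the activity (illustrative choices)
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'min_samples_leaf': [100, 200],
    'max_features': [5, 10],
}
search = GridSearchCV(GBC(), param_grid, cv=3, n_jobs=-1)
search.fit(X_train, Y_train)
print(search.best_params_)
clf = search.best_estimator_  # keep the best model for the steps below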

Save your best classifier


In [84]:
with open('mejor.pkl', 'wb') as pickle_out:
    pickle.dump(clf, pickle_out)

Loading your classifier


In [14]:
with open('mejor.pkl', 'rb') as my_object_file:
    clf = pickle.load(my_object_file)

Validation data


In [96]:
df=pd.read_csv('test.csv')
df.columns


Out[96]:
Index([u'EventId', u'DER_mass_MMC', u'DER_mass_transverse_met_lep',
       u'DER_mass_vis', u'DER_pt_h', u'DER_deltaeta_jet_jet',
       u'DER_mass_jet_jet', u'DER_prodeta_jet_jet', u'DER_deltar_tau_lep',
       u'DER_pt_tot', u'DER_sum_pt', u'DER_pt_ratio_lep_tau',
       u'DER_met_phi_centrality', u'DER_lep_eta_centrality', u'PRI_tau_pt',
       u'PRI_tau_eta', u'PRI_tau_phi', u'PRI_lep_pt', u'PRI_lep_eta',
       u'PRI_lep_phi', u'PRI_met', u'PRI_met_phi', u'PRI_met_sumet',
       u'PRI_jet_num', u'PRI_jet_leading_pt', u'PRI_jet_leading_eta',
       u'PRI_jet_leading_phi', u'PRI_jet_subleading_pt',
       u'PRI_jet_subleading_eta', u'PRI_jet_subleading_phi',
       u'PRI_jet_all_pt'],
      dtype='object')

In [93]:
df.head(1)


Out[93]:
EventId DER_mass_MMC DER_mass_transverse_met_lep DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot ... PRI_met_phi PRI_met_sumet PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta PRI_jet_subleading_phi PRI_jet_all_pt
0 350000 -999 79.589 23.916 3.036 -999 -999 -999 0.903 3.036 ... 2.022 98.556 0 -999 -999 -999 -999 -999 -999 -0

1 rows × 31 columns


In [94]:
# drop the ID column
df.drop('EventId',axis=1,inplace=True)


Improving the decision rule

Instead of the default 0.5 threshold we will cut at the 88th percentile of the predicted training probabilities, i.e. only the most signal-like 12% of events get labeled as signal.


In [54]:
prob_pre_train=clf.predict_proba(X_train)[:,1]
prob_pre_test=clf.predict_proba(X_test)[:,1]

Look at the following two cells and you will understand the idea.


In [98]:
pcut = np.percentile(prob_pre_train,88)

In [56]:
pcut


Out[56]:
0.82254474492399354

In [57]:
Yhat_train = prob_pre_train > pcut 
Yhat_test = prob_pre_test > pcut

The metric 'penalizes' according to the weight attached to each event.


In [91]:
Image('http://i.imgur.com/Hflz2lG.jpg')


Out[91]:
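
For reference, the AMSScore function defined below computes the challenge's Approximate Median Significance with a regularization term $b_{\mathrm{reg}} = 10$, where $s$ and $b$ are the weighted true-positive and false-positive sums computed next:

$$\mathrm{AMS} = \sqrt{2\left(\left(s + b + b_{\mathrm{reg}}\right)\ln\!\left(1 + \frac{s}{b + b_{\mathrm{reg}}}\right) - s\right)}$$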

In [58]:
# train/validation size ratio
rat =  float(X_train.shape[0]) /float(X_test.shape[0])

print(rat)


0.111111111111

In [59]:
# scale the weights up so the sums estimate full-sample signal/background totals
TruePositive_train = w_train*(Y_train==1.0)*(1.0/rat)
TrueNegative_train = w_train*(Y_train==0.0)*(1.0/rat)
TruePositive_valid = w_test*(Y_test==1.0)*(1.0/(1-rat))
TrueNegative_valid = w_test*(Y_test==0.0)*(1.0/(1-rat))

In [60]:
# sum the weights of events predicted as signal, split by their true class:
# s = weighted true positives, b = weighted false positives
s_train = sum( TruePositive_train*(Yhat_train==1.0) )
b_train = sum( TrueNegative_train*(Yhat_train==1.0) )
s_valid = sum( TruePositive_valid*(Yhat_test==1.0) )
b_valid = sum( TrueNegative_valid*(Yhat_test==1.0) )

In [97]:
import math
def AMSScore(s, b):
    return math.sqrt(2. * ((s + b + 10.) * math.log(1. + s / (b + 10.)) - s))
print('Computing the AMS score for a probability cut pcut =', pcut)
print('   - AMS based on %s %% training:' % (rat * 100), AMSScore(s_train, b_train))
print('   - AMS based on %s %% validation:' % ((1 - rat) * 100), AMSScore(s_valid, b_valid))


Computing the AMS score for a probability cut pcut = 0.82254474492399354
   - AMS based on 11.1111111111 % training: 3.762137684770048
   - AMS based on 88.8888888889 % validation: 3.2146923823942384
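
The test.csv data loaded above is never actually scored in this notebook. Here is a minimal sketch of how it could be turned into a Kaggle submission, assuming the fitted clf and the pcut from above are in scope, that df still holds test.csv with EventId dropped, and that the challenge's EventId/RankOrder/Class submission format applies (the output filename is illustrative):


In [ ]:
# recover the IDs that were dropped from df earlier
ids = pd.read_csv('test.csv')['EventId']
prob = clf.predict_proba(df.values)[:, 1]

# RankOrder: 1 = most background-like, N = most signal-like
rank_order = prob.argsort().argsort() + 1
label = np.where(prob > pcut, 's', 'b')

sub = pd.DataFrame({'EventId': ids, 'RankOrder': rank_order, 'Class': label})
sub[['EventId', 'RankOrder', 'Class']].to_csv('submission.csv', index=False)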

In [ ]: