Primero pimpea tu libreta!


In [62]:
from IPython.core.display import HTML
import os
def css_styling():
    """Load default custom.css file from ipython profile"""
    base = os.getcwd()
    styles = "<style>\n%s\n</style>" % (open(os.path.join(base,'files/custom.css'),'r').read())
    return HTML(styles)
css_styling()


Out[62]:

Gradient Boosting Classifier (GBC)


In [63]:
from IPython.core.display import Image
Image('https://kaggle2.blob.core.windows.net/competitions/kaggle/3887/media/ATLASEXP_image.png')


Out[63]:

Empiezo

Hola mundo


In [64]:
import numpy as np
import scipy as sc
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.cross_validation import train_test_split
import os
import pickle
import sys

Datos!


In [72]:
df=pd.read_csv('training.csv')

In [73]:
df.head(1)


Out[73]:
EventId DER_mass_MMC DER_mass_transverse_met_lep DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot ... PRI_met_phi PRI_met_sumet PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta PRI_jet_subleading_phi PRI_jet_all_pt
0 350000 -999 79.589 23.916 3.036 -999 -999 -999 0.903 3.036 ... 2.022 98.556 0 -999 -999 -999 -999 -999 -999 -0

1 rows × 31 columns


In [76]:
# quitar columnas
df.drop('EventId',axis=1,inplace=True)

A empezar el ML!


In [77]:
X = df.values
Y = bueno

In [51]:
X_train,X_test, Y_train, Y_test= train_test_split(X,Y,test_size=0.9)
w_train=X_train[:,-1]
w_test=X_test[:,-1]

X_train = X_train[:,:-1]
X_test = X_test[:,:-1]

In [70]:
clf = GBC(
          n_estimators=50,
          max_depth=5,
          min_samples_leaf=200,
          max_features=10,
          verbose=1)
clf.fit(X,Y)


      Iter       Train Loss   Remaining Time 
         1           1.0957           35.85s
         2           0.9638           32.90s
         3           0.8391           31.55s
         4           0.7605           34.29s
         5           0.6690           35.62s
         6           0.5910           36.33s
         7           0.5502           36.04s
         8           0.5179           35.41s
         9           0.4603           34.99s
        10           0.4100           35.19s
        20           0.1473           27.95s
        30           0.0617           18.32s
        40           0.0248            9.20s
        50           0.0109            0.00s
Out[70]:
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=5, max_features=10, max_leaf_nodes=None,
              min_samples_leaf=200, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              random_state=None, subsample=1.0, verbose=1,
              warm_start=False)

In [53]:
clf.score(X_test,Y_test)


Out[53]:
0.83095111111111108

Actividad:

Experimentar con:

  • max_features
  • max_depth
  • min_samples_leaf
  • n_estimators

Guardar tu mejor classificador


In [71]:
pickle_out = open('mejor.pkl', 'wb')
pickle.dump(clf, pickle_out)
pickle_out.close()

Loadeando tu classificador


In [14]:
my_object_file = open('mejor.pkl', 'rb')
clf = pickle.load(my_object_file)
my_object_file.close()

Mejorando la regla de decision

en vez de 0.5 usaremos el percentil 88%


In [54]:
prob_pre_train=clf.predict_proba(X_train)[:,1]
prob_pre_test=clf.predict_proba(X_test)[:,1]

Look at the following two cells and you understand the concepts


In [55]:
pcut = np.percentile(prob_pre_train,88)

In [56]:
pcut


Out[56]:
0.82254474492399354

In [57]:
Yhat_train = prob_pre_train > pcut 
Yhat_test = prob_pre_test > pcut

The Metric 'penalizes' according to the weight that is linked with an event.


In [42]:
Image('http://i.imgur.com/Hflz2lG.jpg')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-42-eda20b7561a5> in <module>()
----> 1 Image('http://i.imgur.com/Hflz2lG.jpg')

NameError: name 'Image' is not defined

In [58]:
# razon de entrenamiento
rat =  float(X_train.shape[0]) /float(X_test.shape[0])

print(rat)


0.111111111111

In [59]:
TruePositive_train = w_train*(Y_train==1.0)*(1.0/rat)
TrueNegative_train = w_train*(Y_train==0.0)*(1.0/rat)
TruePositive_valid = w_test*(Y_test==1.0)*(1.0/(1-rat))
TrueNegative_valid = w_test*(Y_test==0.0)*(1.0/(1-rat))

In [60]:
s_train = sum ( TruePositive_train*(Yhat_train==1.0) )#here only the "cases" are summed where prediction and "real" signal come together
b_train = sum ( TrueNegative_train*(Yhat_train==1.0) )#...
s_valid = sum ( TruePositive_valid*(Yhat_test==1.0) )
b_valid = sum ( TrueNegative_valid*(Yhat_test==1.0) )

In [61]:
import math
print('Calculando el score AMS score para una probabilidad de corte pcut=',pcut)
def AMSScore(s,b): return  math.sqrt (2.*( (s + b + 10.)*math.log(1.+s/(b+10.))-s))
print( '   - AMS basado en %s %% entrenamiento:' % (rat*100),AMSScore(s_train,b_train))
print('   - AMS basado en %s %% validacion:' % ((1-rat)*100),(AMSScore(s_valid,b_valid)))


('Calculando el score AMS score para una probabilidad de corte pcut=', 0.82254474492399354)
('   - AMS basado en 11.1111111111 % entrenamiento:', 3.762137684770048)
('   - AMS basado en 88.8888888889 % validacion:', 3.2146923823942384)

In [ ]: