In [62]:
from IPython.core.display import HTML
import os
def css_styling():
    """Return the notebook's custom stylesheet wrapped for inline display.

    Reads ``files/custom.css`` relative to the current working directory
    (not the IPython profile directory) and embeds its contents in a
    ``<style>`` element.

    Returns:
        An IPython ``HTML`` object; displaying it applies the stylesheet.

    Raises:
        IOError / OSError: if ``files/custom.css`` cannot be opened.
    """
    css_path = os.path.join(os.getcwd(), 'files/custom.css')
    # The context manager releases the file handle even if read() fails.
    with open(css_path, 'r') as css_file:
        styles = "<style>\n%s\n</style>" % css_file.read()
    return HTML(styles)


# Last expression of the cell: the returned HTML object is rendered as output.
css_styling()
Out[62]:
In [63]:
from IPython.core.display import Image
# Render the remote image referenced by this URL as the cell output.
Image('https://kaggle2.blob.core.windows.net/competitions/kaggle/3887/media/ATLASEXP_image.png')
Out[63]:
In [64]:
import os
import pickle
import sys

import numpy as np
import scipy as sc
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier as GBC
try:
    # Current location; sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for very old scikit-learn installations.
    from sklearn.cross_validation import train_test_split
In [78]:
# Load training.csv (path is relative to the current working directory) into df.
df=pd.read_csv('training.csv')
In [79]:
# Show only the first row to inspect the available columns.
df.head(1)
Out[79]:
In [80]:
# Encode the class label as an integer target: 's' -> 1, 'b' -> 0.
# An explicit map + astype(int) always yields an integer Series
# (Series.replace on an object column relies on implicit downcasting, which
# pandas 2.2 deprecates) and fails loudly if any other label is present.
bueno = df['Label'].map({'s': 1, 'b': 0}).astype(int)

# Drop the columns that must not reach the model: the row identifier and the
# label itself.
df.drop(['EventId', 'Label'], axis=1, inplace=True)
In [81]:
# Matrix of every column still in df; its last column is sliced off as
# w_train in the following cell rather than being used as a feature.
X = df.values
# Target vector: the 1/0-encoded labels extracted before 'Label' was dropped.
Y = bueno
In [82]:
# Hold out part of the labelled data for validation. Without this call
# X_train / X_test / Y_train / Y_test do not exist on a fresh kernel.
# test_size=0.1 and the fixed seed are choices made here -- TODO confirm the
# hold-out fraction that was originally intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42)

# The last column of X is not a feature: it is kept aside as the per-event
# weight used by the scoring cells further down.
w_train = X_train[:, -1]
X_train = X_train[:, :-1]
# Same treatment for the held-out rows, so X_test has the column count the
# classifier is fitted on and w_test exists for the scoring cells.
w_test = X_test[:, -1]
X_test = X_test[:, :-1]
In [83]:
# Gradient-boosted trees: 50 boosting stages, depth-5 trees, at least 200
# samples per leaf, 10 candidate features per split.
clf = GBC(
    n_estimators=50,
    max_depth=5,
    min_samples_leaf=200,
    max_features=10,
    # Considering only a subset of features per split makes training random;
    # pinning the seed lets Restart & Run All reproduce the same model.
    random_state=42,
    verbose=1)
# fit() returns the estimator, so the cell still displays the fitted model.
clf.fit(X_train, Y_train)
Out[83]:
In [84]:
# Persist the fitted classifier to 'mejor.pkl'. The with-block closes the file
# even if pickle.dump raises, so a failed dump cannot leak the handle.
with open('mejor.pkl', 'wb') as pickle_out:
    pickle.dump(clf, pickle_out)
In [14]:
# Restore the classifier from 'mejor.pkl' instead of retraining it.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load a 'mejor.pkl' that this notebook itself produced.
with open('mejor.pkl', 'rb') as my_object_file:
    clf = pickle.load(my_object_file)
In [96]:
# NOTE(review): this rebinds df, replacing the frame loaded from training.csv.
df=pd.read_csv('test.csv')
# Display the column index of the freshly loaded frame.
df.columns
Out[96]:
In [93]:
# Show only the first row of the frame currently bound to df.
df.head(1)
Out[93]:
In [94]:
# Drop columns: EventId is removed from df in place.
# NOTE(review): because the drop is in place, running this cell a second time
# raises KeyError (the column is already gone).
df.drop('EventId',axis=1,inplace=True)
In [ ]:
# Split the trailing column off X_train as w_train -- but only while X_train
# still has as many columns as the full matrix X. This cell repeats an earlier
# one; unguarded, a second run would strip a genuine feature column from
# X_train and overwrite w_train with feature values.
if X_train.shape[1] == X.shape[1]:
    w_train = X_train[:, -1]
    X_train = X_train[:, :-1]
In [54]:
# Score both splits with the classifier and keep column 1 of predict_proba
# (presumably the probability of the class encoded as 1 -- confirm via
# clf.classes_).
prob_pre_train, prob_pre_test = (
    clf.predict_proba(features)[:, 1] for features in (X_train, X_test)
)
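The following two cells illustrate the concept: a probability cut-off (`pcut`) is chosen as a percentile of the training scores, and that cut-off is then used to turn the scores into signal/background predictions.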
In [98]:
# Cut-off = 88th percentile of the training scores, i.e. roughly the top 12 %
# of training events score above it.
pcut = np.percentile(prob_pre_train,88)
In [56]:
# Display the chosen cut-off value.
pcut
Out[56]:
In [57]:
# Boolean prediction per event: True where the score lies strictly above pcut.
Yhat_train = np.greater(prob_pre_train, pcut)
Yhat_test = np.greater(prob_pre_test, pcut)
In [91]:
# Render the remote illustration referenced by this URL as the cell output.
Image('http://i.imgur.com/Hflz2lG.jpg')
Out[91]:
In [58]:
# Training fraction: share of all labelled events that are in the training split.
# It has to be a fraction in (0, 1), not the train/test ratio: the weight
# rescaling below divides by rat and by (1 - rat), and the report prints
# rat*100 and (1-rat)*100 as percentages. A ratio above 1 would make
# (1 - rat) negative and flip the sign of every validation weight.
n_train = X_train.shape[0]
n_test = X_test.shape[0]
rat = float(n_train) / float(n_train + n_test)
print(rat)
In [59]:
# Per-event weights separated by true class (label 1 vs label 0) and rescaled
# per split: training weights by 1/rat, held-out weights by 1/(1 - rat).
# NOTE(review): these factors only renormalise sensibly if rat is the training
# *fraction* (strictly between 0 and 1) -- confirm against the cell defining rat.
train_scale = 1.0/rat
valid_scale = 1.0/(1-rat)

is_signal_train = (Y_train==1.0)
is_background_train = (Y_train==0.0)
is_signal_test = (Y_test==1.0)
is_background_test = (Y_test==0.0)

TruePositive_train = w_train*is_signal_train*train_scale
TrueNegative_train = w_train*is_background_train*train_scale
TruePositive_valid = w_test*is_signal_test*valid_scale
TrueNegative_valid = w_test*is_background_test*valid_scale
In [60]:
# Only events predicted as signal contribute. s_* adds up the weights of the
# selected events whose true label is 1, b_* those whose true label is 0.
selected_train = (Yhat_train==1.0)
selected_test = (Yhat_test==1.0)

s_train = sum(TruePositive_train*selected_train)
b_train = sum(TrueNegative_train*selected_train)
s_valid = sum(TruePositive_valid*selected_test)
b_valid = sum(TrueNegative_valid*selected_test)
In [97]:
import math
# Announce which probability cut-off the AMS figures printed below refer to.
print('Calculando el score AMS score para una probabilidad de corte pcut=',pcut)
def AMSScore(s, b, b_reg=10.):
    """Approximate median significance (AMS) of a signal/background selection.

    Computes ``sqrt(2 * ((s + b + b_reg) * ln(1 + s / (b + b_reg)) - s))``.

    Args:
        s: summed weight of the selected signal events.
        b: summed weight of the selected background events.
        b_reg: regularisation term added to ``b``. Defaults to 10, the value
            that was previously hard-coded, so existing two-argument calls
            behave as before.

    Returns:
        The AMS as a float (0.0 when ``s`` is 0).

    Raises:
        ZeroDivisionError: if ``b + b_reg`` is exactly 0 (plain Python floats).
        ValueError: if the logarithm or square root is undefined, which can
            only happen for a non-positive ``b + b_reg`` or ``s <= -(b + b_reg)``.
    """
    b_eff = b + b_reg
    # log1p keeps precision when s is tiny compared with b_eff; log(1 + x)
    # would round to exactly 0 there and drive the radicand negative.
    radicand = 2. * ((s + b_eff) * math.log1p(s / b_eff) - s)
    # For b_eff > 0 the radicand is mathematically >= 0, so a negative value is
    # pure rounding noise: clamp it instead of raising "math domain error".
    if radicand < 0. and b_eff > 0.:
        radicand = 0.
    return math.sqrt(radicand)
# Report the AMS of each split, labelled with that split's share in percent.
train_pct = rat*100
valid_pct = (1-rat)*100
ams_train = AMSScore(s_train, b_train)
print(' - AMS basado en %s %% entrenamiento:' % train_pct, ams_train)
ams_valid = AMSScore(s_valid, b_valid)
print(' - AMS basado en %s %% validacion:' % valid_pct, ams_valid)
In [ ]: