Primero pimpea tu libreta!


In [2]:
from IPython.core.display import HTML
import os
def css_styling():
    """Load default custom.css file from ipython profile"""
    base = os.getcwd()
    styles = "<style>\n%s\n</style>" % (open(os.path.join(base,'files/custom.css'),'r').read())
    return HTML(styles)
css_styling()


Out[2]:

In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('training.csv')

In [5]:
df.head(1)


Out[5]:
EventId DER_mass_MMC DER_mass_transverse_met_lep DER_mass_vis DER_pt_h DER_deltaeta_jet_jet DER_mass_jet_jet DER_prodeta_jet_jet DER_deltar_tau_lep DER_pt_tot ... PRI_jet_num PRI_jet_leading_pt PRI_jet_leading_eta PRI_jet_leading_phi PRI_jet_subleading_pt PRI_jet_subleading_eta PRI_jet_subleading_phi PRI_jet_all_pt Weight Label
0 100000 138.47 51.655 97.827 27.98 0.91 124.711 2.666 3.064 41.928 ... 2 67.435 2.15 0.444 46.062 1.24 -2.475 113.497 0.002653 s

1 rows × 33 columns


In [6]:
bueno=df['Label'].replace(to_replace=['s','b'],value=[1,0])
df['class_int']= bueno

In [7]:
df.drop('EventId',axis=1,inplace=True)
df.drop('Label',axis=1,inplace=True)
df.drop('class_int',axis=1,inplace=True)

In [8]:
X = df.values
Y = bueno

In [9]:
print(X.shape)
print(Y.shape)


(250000, 31)
(250000,)

In [11]:
X_train,X_test, Y_train, Y_test= train_test_split(X,Y,test_size=0.95)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-c481711203cc> in <module>()
----> 1 X_train,X_test, Y_train, Y_test= train_test_split(X,Y,test_size=0.95)

NameError: name 'train_test_split' is not defined

In [10]:
print(X_train.shape)
print(Y_train.shape)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-d60f5268cef7> in <module>()
----> 1 print(X_train.shape)
      2 print(Y_train.shape)

NameError: name 'X_train' is not defined

experimentar con

  • max_features
  • max_depth
  • min_samples_leaf
  • n_estimators

In [ ]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    max_features=3,
    max_depth=10,
    min_samples_leaf=50,
    n_estimators=5
)
clf.fit(X_train,Y_train)
clf.score(X_test,Y_test)

In [1]:
from sklearn.ensemble import GradientBoostingClassifier as GBC

In [ ]: