notebook.community
Edit and run
# In[1]:
# Standard library
import sqlite3
import warnings

# Third-party
import graphviz
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.datasets
import sklearn.tree
from matplotlib import gridspec
from matplotlib.ticker import NullFormatter, NullLocator, MultipleLocator
from scipy import stats
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Display configuration: show wide/long frames in full.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

# sns.set() re-applies seaborn's default palette, so it must run *before*
# set_palette(); in the opposite order the 'husl' choice is silently discarded.
sns.set(style = 'ticks')
sns.set_palette('husl')

# NOTE(review): this silences every warning, including library deprecation
# notices; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")
# In[2]:
# Read ttHbb_data.csv from the working directory and preview the first rows.
df = pd.read_csv("ttHbb_data.csv")
df.head()
Out[2]: TTHReco_best_Higgsleptop_dR TTHReco_withH_best_Higgsttbar_dR TTHReco_best_TTHReco Aplanarity_jets TTHReco_best_Higgs_mass dRbb_MaxPt_Sort4 Mbb_MindR_Sort4 H1_all dEtajj_MaxdEta LHD_Discriminant TTHReco_best_b1Higgsbhadtop_dR TTHReco_best_Higgsbleptop_mass dRbb_avg_Sort4 nHiggsbb30_Sort4 dRlepbb_MindR_Sort4 Mjj_MindR HT_jets TTHReco_best_bbHiggs_dR classification 0 -1.000000 -1.000000 -1.000000 0.010764 -1.000 1.873108 29691.271 0.801864 1.611009 -1.000000 -1.000000 -1.00 1.596763 2 1.926379 29691.271 200475.97 -1.000000 1 1 3.033175 2.914220 0.000834 0.027035 156656.200 1.190597 57697.246 0.525295 1.440214 0.429600 0.664401 360101.97 2.093996 2 2.911004 57697.246 520111.80 1.190597 1 2 3.362126 2.896700 0.115693 0.065426 162454.690 2.151490 41279.270 0.707541 0.484865 0.398901 2.522366 226994.64 1.869288 0 1.299827 30980.066 417521.62 3.028662 1 3 -1.000000 -1.000000 -1.000000 0.048023 -1.000 0.880868 73485.016 0.487572 1.895097 -1.000000 -1.000000 -1.00 2.143145 3 2.155922 73485.016 275745.00 -1.000000 1 4 2.637625 2.656071 0.469479 0.289182 58203.133 1.388736 59727.780 0.162432 2.234762 0.599817 2.199806 129147.22 1.998076 3 1.264251 62289.004 562667.20 1.378433 1
# In[3]:
# Every column except the last one is used as a model input; the target is
# the "classification" column.
features = df.columns[:-1].tolist()
print(features)

X = df[features]
y = df["classification"]
['TTHReco_best_Higgsleptop_dR', 'TTHReco_withH_best_Higgsttbar_dR', 'TTHReco_best_TTHReco', 'Aplanarity_jets', 'TTHReco_best_Higgs_mass', 'dRbb_MaxPt_Sort4', 'Mbb_MindR_Sort4', 'H1_all', 'dEtajj_MaxdEta', 'LHD_Discriminant', 'TTHReco_best_b1Higgsbhadtop_dR', 'TTHReco_best_Higgsbleptop_mass', 'dRbb_avg_Sort4', 'nHiggsbb30_Sort4', 'dRlepbb_MindR_Sort4', 'Mjj_MindR', 'HT_jets', 'TTHReco_best_bbHiggs_dR']
# In[4]:
# Depth-limited decision tree with a fixed random_state; fitted on the full
# feature matrix X against the labels y.
classifier = sklearn.tree.DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=20,
    random_state=99,
)
classifier.fit(X, y)
Out[4]: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=20, min_weight_fraction_leaf=0.0, presort=False, random_state=99, splitter='best')
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=20, min_weight_fraction_leaf=0.0, presort=False, random_state=99, splitter='best')
# In[5]:
# Export the fitted tree (out_file=None, so nothing is written to a file) and
# wrap the result in a graphviz.Source so the notebook renders it.
tree_dot = sklearn.tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=df[features].columns.tolist(),
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(tree_dot)
graph
Out[5]: Tree 0 HT_jets ≤ 369345.031 gini = 0.5 samples = 22795 value = [11743, 11052] 1 LHD_Discriminant ≤ 0.501 gini = 0.424 samples = 8883 value = [6174, 2709] 0->1 True 30 dRbb_avg_Sort4 ≤ 2.397 gini = 0.48 samples = 13912 value = [5569, 8343] 0->30 False 2 dRbb_avg_Sort4 ≤ 2.365 gini = 0.398 samples = 7892 value = [5725, 2167] 1->2 17 dEtajj_MaxdEta ≤ 3.009 gini = 0.496 samples = 991 value = [449, 542] 1->17 3 HT_jets ≤ 269917.984 gini = 0.44 samples = 5191 value = [3491, 1700] 2->3 10 dEtajj_MaxdEta ≤ 2.135 gini = 0.286 samples = 2701 value = [2234, 467] 2->10 4 H1_all ≤ 0.668 gini = 0.343 samples = 2216 value = [1728, 488] 3->4 7 H1_all ≤ 0.295 gini = 0.483 samples = 2975 value = [1763, 1212] 3->7 5 gini = 0.311 samples = 1797 value = [1451, 346] 4->5 6 gini = 0.448 samples = 419 value = [277, 142] 4->6 8 gini = 0.456 samples = 1540 value = [998, 542] 7->8 9 gini = 0.498 samples = 1435 value = [765, 670] 7->9 11 dEtajj_MaxdEta ≤ 1.491 gini = 0.46 samples = 128 value = [82, 46] 10->11 14 HT_jets ≤ 283937.906 gini = 0.274 samples = 2573 value = [2152, 421] 10->14 12 gini = 0.0 samples = 3 value = [0, 3] 11->12 13 gini = 0.451 samples = 125 value = [82, 43] 11->13 15 gini = 0.218 samples = 1291 value = [1130, 161] 14->15 16 gini = 0.323 samples = 1282 value = [1022, 260] 14->16 18 LHD_Discriminant ≤ 0.78 gini = 0.484 samples = 838 value = [344, 494] 17->18 25 dRlepbb_MindR_Sort4 ≤ 3.427 gini = 0.431 samples = 153 value = [105, 48] 17->25 19 LHD_Discriminant ≤ 0.515 gini = 0.495 samples = 694 value = [314, 380] 18->19 22 TTHReco_best_Higgsbleptop_mass ≤ 247482.609 gini = 0.33 samples = 144 value = [30, 114] 18->22 20 gini = 0.0 samples = 17 value = [0, 17] 19->20 21 gini = 0.497 samples = 677 value = [314, 363] 19->21 23 gini = 0.278 samples = 132 value = [22, 110] 22->23 24 gini = 0.444 samples = 12 value = [8, 4] 22->24 26 Mbb_MindR_Sort4 ≤ 25857.608 gini = 0.412 samples = 148 value = [105, 43] 25->26 29 gini = 0.0 samples = 5 value = [0, 5] 25->29 27 gini = 
0.0 samples = 5 value = [0, 5] 26->27 28 gini = 0.39 samples = 143 value = [105, 38] 26->28 31 LHD_Discriminant ≤ 0.495 gini = 0.446 samples = 10136 value = [3404, 6732] 30->31 46 HT_jets ≤ 623625.531 gini = 0.489 samples = 3776 value = [2165, 1611] 30->46 32 HT_jets ≤ 508918.922 gini = 0.472 samples = 7398 value = [2827, 4571] 31->32 39 dEtajj_MaxdEta ≤ 2.614 gini = 0.333 samples = 2738 value = [577, 2161] 31->39 33 Aplanarity_jets ≤ 0.128 gini = 0.499 samples = 3278 value = [1560, 1718] 32->33 36 HT_jets ≤ 867087.406 gini = 0.426 samples = 4120 value = [1267, 2853] 32->36 34 gini = 0.5 samples = 2727 value = [1390, 1337] 33->34 35 gini = 0.427 samples = 551 value = [170, 381] 33->35 37 gini = 0.446 samples = 3231 value = [1084, 2147] 36->37 38 gini = 0.327 samples = 889 value = [183, 706] 36->38 40 HT_jets ≤ 539465.219 gini = 0.289 samples = 1777 value = [311, 1466] 39->40 43 HT_jets ≤ 670898.5 gini = 0.4 samples = 961 value = [266, 695] 39->43 41 gini = 0.349 samples = 972 value = [219, 753] 40->41 42 gini = 0.202 samples = 805 value = [92, 713] 40->42 44 gini = 0.44 samples = 637 value = [208, 429] 43->44 45 gini = 0.294 samples = 324 value = [58, 266] 43->45 47 LHD_Discriminant ≤ 0.562 gini = 0.461 samples = 2506 value = [1602, 904] 46->47 54 LHD_Discriminant ≤ 0.429 gini = 0.494 samples = 1270 value = [563, 707] 46->54 48 dEtajj_MaxdEta ≤ 3.014 gini = 0.443 samples = 2270 value = [1519, 751] 47->48 51 Mbb_MindR_Sort4 ≤ 104844.047 gini = 0.456 samples = 236 value = [83, 153] 47->51 49 gini = 0.489 samples = 778 value = [447, 331] 48->49 50 gini = 0.405 samples = 1492 value = [1072, 420] 48->50 52 gini = 0.494 samples = 137 value = [61, 76] 51->52 53 gini = 0.346 samples = 99 value = [22, 77] 51->53 55 dRbb_avg_Sort4 ≤ 2.746 gini = 0.499 samples = 1070 value = [516, 554] 54->55 58 Mbb_MindR_Sort4 ≤ 38388.623 gini = 0.36 samples = 200 value = [47, 153] 54->58 56 gini = 0.49 samples = 739 value = [316, 423] 55->56 57 gini = 0.478 samples = 331 value = [200, 131] 
55->57 59 gini = 0.0 samples = 3 value = [3, 0] 58->59 60 gini = 0.347 samples = 197 value = [44, 153] 58->60
# In[6]:
# NOTE(review): predictions are made on the same X the classifier was fitted
# on, so the score below is training-set accuracy, not a held-out estimate.
y_predictions = classifier.predict(X)

# Fraction of rows whose predicted label equals the label in y.
sklearn.metrics.accuracy_score(y, y_predictions)
Out[6]: 0.683351612195657
0.683351612195657
# In[7]:
# One row per input column, indexed by column name, holding the fitted tree's
# feature_importances_ value; sorted with the largest importance first.
_df = pd.DataFrame(
    {"importance": classifier.feature_importances_},
    index=X.columns.values,
).sort_values(by="importance", ascending=False)
_df
Out[7]: importance HT_jets 0.580307 dRbb_avg_Sort4 0.190065 LHD_Discriminant 0.154543 dEtajj_MaxdEta 0.030467 Aplanarity_jets 0.017219 H1_all 0.015864 Mbb_MindR_Sort4 0.006724 TTHReco_best_Higgsbleptop_mass 0.002552 dRlepbb_MindR_Sort4 0.002259 Mjj_MindR 0.000000 nHiggsbb30_Sort4 0.000000 TTHReco_best_Higgsleptop_dR 0.000000 TTHReco_best_b1Higgsbhadtop_dR 0.000000 TTHReco_withH_best_Higgsttbar_dR 0.000000 dRbb_MaxPt_Sort4 0.000000 TTHReco_best_Higgs_mass 0.000000 TTHReco_best_TTHReco 0.000000 TTHReco_best_bbHiggs_dR 0.000000
# In[8]:
# Horizontal bar chart of feature importances (one bar per feature).
plt.rcParams["figure.figsize"] = [8, 8]

# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises a TypeError for positional x and y.
sns.barplot(x=_df["importance"], y=_df.index)
plt.xlabel('importance')
plt.show()
In [ ]: