In [2]:
import pandas as pd
df = pd.read_csv('titanic_train_new.csv')
display(df.head())
X = df.drop(['Survived', 'PassengerId'], axis=1)
y = df['Survived']
from xtoy import Toy
toy = Toy(use_lightgbm=False, use_xgboost=True)
toy.fit(X ,y)
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th...
female
38.0
1
0
PC 17599
71.2833
C85
C
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
/Users/sethuiyer/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
from numpy.core.umath_tests import inner1d
make_scorer(f1_weighted_score)
estimator: ridge
unique_combinations 16
Types [2] and maxint [15] detected
--- Evolve in 16 possible combinations ---
gen nevals avg min max
0 5 0.689742 0.654157 0.733195
1 2 0.679021 0.640221 0.702697
2 2 0.689675 0.6583 0.702697
3 4 0.682846 0.659591 0.702697
4 3 0.674335 0.621703 0.702697
5 5 0.68524 0.665538 0.695859
6 2 0.701537 0.686516 0.73597
7 4 0.694487 0.671264 0.714945
8 3 0.696822 0.67901 0.714945
9 3 0.690318 0.623581 0.714945
10 2 0.715361 0.70439 0.721945
(0.7359696065779963,)
cbp {'estimator__alpha': 0.001}
Best individual is: {'estimator__alpha': 0.001}
with fitness: 0.7359696065779963
estimator: rf
unique_combinations 216
Types [1, 1, 1, 1, 1, 1] and maxint [5, 3, 2, 2, 0, 0] detected
--- Evolve in 216 possible combinations ---
gen nevals avg min max
0 5 0.795542 0.77349 0.81893
1 2 0.80894 0.788244 0.822922
2 3 0.797709 0.763932 0.81893
3 4 0.800394 0.785012 0.824188
4 1 0.805025 0.779771 0.824188
5 5 0.787438 0.762702 0.840506
6 4 0.789424 0.764625 0.840506
7 3 0.805538 0.791489 0.840506
8 4 0.788507 0.77228 0.800128
9 4 0.773438 0.757836 0.785949
10 4 0.790818 0.768333 0.803311
11 0 0.803311 0.803311 0.803311
12 2 0.800764 0.776111 0.817775
13 0 0.809096 0.803311 0.817775
14 4 0.804443 0.788986 0.817775
15 2 0.805595 0.785612 0.817775
16 4 0.787949 0.767265 0.817775
17 4 0.800028 0.774016 0.817775
18 4 0.794458 0.747996 0.808771
19 1 0.801319 0.777041 0.808771
20 4 0.78454 0.772483 0.800024
21 3 0.794575 0.762681 0.819709
(0.8241883798898725,)
cbp {'estimator__max_features': 0.5, 'estimator__max_depth': 10, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 20, 'estimator__n_estimators': 1000, 'estimator__class_weight': 'balanced'}
Best individual is: {'estimator__max_features': 0.5, 'estimator__max_depth': 10, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 20, 'estimator__n_estimators': 1000, 'estimator__class_weight': 'balanced'}
with fitness: 0.8241883798898725
estimator: knn
unique_combinations 336
Types [1, 1, 1, 1] and maxint [5, 6, 3, 1] detected
--- Evolve in 336 possible combinations ---
gen nevals avg min max
0 5 0.728349 0.699398 0.753189
1 5 0.733158 0.703808 0.78842
2 4 0.739315 0.678993 0.775224
3 5 0.732318 0.713896 0.754946
4 4 0.723504 0.663968 0.754946
5 2 0.7392 0.71354 0.754946
6 2 0.738477 0.695622 0.754946
7 4 0.737678 0.701343 0.777205
8 3 0.734933 0.699171 0.777205
9 1 0.75768 0.701839 0.777205
10 0 0.762132 0.701839 0.777205
11 2 0.763111 0.72634 0.777205
12 4 0.726617 0.67593 0.777205
13 4 0.735894 0.711275 0.777205
14 2 0.728609 0.711275 0.749758
15 4 0.726591 0.691991 0.749758
16 4 0.701094 0.650055 0.742242
17 2 0.733319 0.704893 0.742242
18 2 0.722204 0.642465 0.742242
19 2 0.729732 0.710871 0.742242
20 1 0.735558 0.708823 0.742242
21 4 0.718573 0.681926 0.742818
22 4 0.73579 0.684558 0.752722
23 3 0.72 0.676035 0.752202
24 2 0.750753 0.747225 0.758339
25 3 0.732015 0.702391 0.748774
26 2 0.735641 0.710112 0.748774
27 4 0.718904 0.69773 0.748774
28 3 0.734688 0.689374 0.748774
29 2 0.753876 0.748197 0.769271
30 2 0.738207 0.709012 0.769271
31 3 0.719145 0.684383 0.754364
32 1 0.741529 0.696356 0.754364
33 5 0.72871 0.71174 0.737569
(0.7884199758303595,)
cbp {'estimator__n_neighbors': 5, 'estimator__leaf_size': 3, 'estimator__p': 2, 'estimator__weights': 'distance'}
Best individual is: {'estimator__n_neighbors': 5, 'estimator__leaf_size': 3, 'estimator__p': 2, 'estimator__weights': 'distance'}
with fitness: 0.7884199758303595
estimator: xgb
unique_combinations 270
Types [1, 1, 2, 1] and maxint [4, 5, 8, 0] detected
--- Evolve in 270 possible combinations ---
gen nevals avg min max
0 5 0.808367 0.791833 0.820858
1 3 0.797774 0.774765 0.816199
2 2 0.810423 0.801415 0.816199
3 4 0.796619 0.756186 0.843127
4 3 0.817328 0.788144 0.843127
5 1 0.809282 0.747912 0.843127
6 2 0.81195 0.798454 0.843127
7 1 0.818529 0.790227 0.843127
8 2 0.820652 0.764773 0.843127
9 4 0.797062 0.737579 0.843127
10 2 0.813633 0.791774 0.843127
11 5 0.800168 0.77124 0.851061
12 1 0.818479 0.78652 0.851061
13 0 0.831387 0.789613 0.851061
14 2 0.832532 0.791779 0.851061
15 2 0.838702 0.804821 0.851061
16 3 0.828386 0.796409 0.851061
17 2 0.833577 0.812072 0.851061
18 1 0.834974 0.819057 0.851061
19 3 0.820793 0.795979 0.851061
20 2 0.83296 0.808224 0.851061
21 4 0.81702 0.782231 0.844766
22 3 0.82765 0.80363 0.844766
23 2 0.833485 0.825278 0.844766
24 3 0.827005 0.802584 0.853273
25 1 0.835164 0.822451 0.853273
26 2 0.820604 0.776367 0.853273
27 3 0.814249 0.79489 0.833364
(0.8510609205371467,)
cbp {'estimator__max_depth': 20, 'estimator__min_child_weight': 15, 'estimator__learning_rate': 0.3, 'estimator__n_estimators': 100}
Best individual is: {'estimator__max_depth': 20, 'estimator__min_child_weight': 15, 'estimator__learning_rate': 0.3, 'estimator__n_estimators': 100}
with fitness: 0.8510609205371467
best: Pipeline(memory=None,
steps=[('scaler', Normalizer(copy=True, norm='l2')), ('estimator', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
max_depth=20, min_child_weight=15, missing=None, n_estimators=100,
n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=True, subsample=1))])
Out[2]:
Pipeline(memory=None,
steps=[('scaler', Normalizer(copy=True, norm='l2')), ('estimator', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
max_depth=20, min_child_weight=15, missing=None, n_estimators=100,
n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=True, subsample=1))])
In [3]:
df_test = pd.read_csv('titanic_test.csv')
X_test = df_test.drop(['Survived','PassengerId'], axis=1)
X_test = toy.featurizer.imputer.transform(X_test)
y_test = df_test['Survived']
In [9]:
X_test
Out[9]:
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
3
Braund, Mr. Owen Harris
male
22.000000
1
0
A/5 21171
7.2500
B96 B98
S
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th...
female
38.000000
1
0
PC 17599
71.2833
C85
C
2
3
Heikkinen, Miss. Laina
female
26.000000
0
0
STON/O2. 3101282
7.9250
B96 B98
S
3
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.000000
1
0
113803
53.1000
C123
S
4
3
Allen, Mr. William Henry
male
35.000000
0
0
373450
8.0500
B96 B98
S
...
...
...
...
...
...
...
...
...
...
...
886
2
Montvila, Rev. Juozas
male
27.000000
0
0
211536
13.0000
B96 B98
S
887
1
Graham, Miss. Margaret Edith
female
19.000000
0
0
112053
30.0000
B42
S
888
3
Johnston, Miss. Catherine Helen "Carrie"
female
29.699118
1
2
W./C. 6607
23.4500
B96 B98
S
889
1
Behr, Mr. Karl Howell
male
26.000000
0
0
111369
30.0000
C148
C
890
3
Dooley, Mr. Patrick
male
32.000000
0
0
370376
7.7500
B96 B98
Q
891 rows × 10 columns
In [5]:
from interpret import show
from interpret.perf import ROC
blackbox_perf = ROC(toy.predict_proba).explain_perf(X_test, y_test, name='Blackbox')
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2896 try:
-> 2897 return self._engine.get_loc(key)
2898 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: 'Age'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-5-9263b995b073> in <module>
2 from interpret.perf import ROC
3
----> 4 blackbox_perf = ROC(toy.predict_proba).explain_perf(X_test, y_test, name='Blackbox')
~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/interpret/perf/curve.py in explain_perf(self, X, y, name)
73 X, y, self.feature_names, self.feature_types
74 )
---> 75 predict_fn = unify_predict_fn(self.predict_fn, X)
76 scores = predict_fn(X)
77
~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/interpret/utils/all.py in unify_predict_fn(predict_fn, X)
188
189 def unify_predict_fn(predict_fn, X):
--> 190 predictions = predict_fn(X[:1])
191 if predictions.ndim == 2:
192 new_predict_fn = lambda x: predict_fn(x)[:, 1] # noqa: E731
~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/xtoy/toys.py in predict_proba(self, X)
136
137 def predict_proba(self, X):
--> 138 X = self.featurizer.transform(X).A
139 return self.best_evo.predict_proba(X)
140
~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/xtoy/prep.py in transform(self, X, y)
218 date_groups = []
219 for col in self.missing_col_names:
--> 220 res.append(pd.isnull(X[col]))
221 X = self.imputer.transform(X)
222 for i, col in enumerate(X):
~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
2993 if self.columns.nlevels > 1:
2994 return self._getitem_multilevel(key)
-> 2995 indexer = self.columns.get_loc(key)
2996 if is_integer(indexer):
2997 indexer = [indexer]
~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
377 except ValueError:
378 raise KeyError(key)
--> 379 return super().get_loc(key, method=method, tolerance=tolerance)
380
381 @Appender(_index_shared_docs["get_indexer"])
~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2897 return self._engine.get_loc(key)
2898 except KeyError:
-> 2899 return self._engine.get_loc(self._maybe_cast_indexer(key))
2900 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2901 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()
KeyError: 'Age'
In [ ]:
Content source: sethuiyer/mlhub
Similar notebooks: