notebook.community

Edit and run



In [2]:

    
import pandas as pd
df = pd.read_csv('titanic_train_new.csv')
display(df.head())
X = df.drop(['Survived', 'PassengerId'], axis=1)
y = df['Survived']
from xtoy import Toy
toy = Toy(use_lightgbm=False, use_xgboost=True)
toy.fit(X ,y)









    







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
    
  








    



/Users/sethuiyer/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d






    



make_scorer(f1_weighted_score)
estimator: ridge
unique_combinations 16
Types [2] and maxint [15] detected
--- Evolve in 16 possible combinations ---
gen	nevals	avg     	min     	max     
0  	5     	0.689742	0.654157	0.733195
1  	2     	0.679021	0.640221	0.702697
2  	2     	0.689675	0.6583  	0.702697
3  	4     	0.682846	0.659591	0.702697
4  	3     	0.674335	0.621703	0.702697
5  	5     	0.68524 	0.665538	0.695859
6  	2     	0.701537	0.686516	0.73597 
7  	4     	0.694487	0.671264	0.714945
8  	3     	0.696822	0.67901 	0.714945
9  	3     	0.690318	0.623581	0.714945
10 	2     	0.715361	0.70439 	0.721945
(0.7359696065779963,)
cbp {'estimator__alpha': 0.001}
Best individual is: {'estimator__alpha': 0.001}
with fitness: 0.7359696065779963
estimator: rf
unique_combinations 216
Types [1, 1, 1, 1, 1, 1] and maxint [5, 3, 2, 2, 0, 0] detected
--- Evolve in 216 possible combinations ---
gen	nevals	avg     	min    	max    
0  	5     	0.795542	0.77349	0.81893
1  	2     	0.80894 	0.788244	0.822922
2  	3     	0.797709	0.763932	0.81893 
3  	4     	0.800394	0.785012	0.824188
4  	1     	0.805025	0.779771	0.824188
5  	5     	0.787438	0.762702	0.840506
6  	4     	0.789424	0.764625	0.840506
7  	3     	0.805538	0.791489	0.840506
8  	4     	0.788507	0.77228 	0.800128
9  	4     	0.773438	0.757836	0.785949
10 	4     	0.790818	0.768333	0.803311
11 	0     	0.803311	0.803311	0.803311
12 	2     	0.800764	0.776111	0.817775
13 	0     	0.809096	0.803311	0.817775
14 	4     	0.804443	0.788986	0.817775
15 	2     	0.805595	0.785612	0.817775
16 	4     	0.787949	0.767265	0.817775
17 	4     	0.800028	0.774016	0.817775
18 	4     	0.794458	0.747996	0.808771
19 	1     	0.801319	0.777041	0.808771
20 	4     	0.78454 	0.772483	0.800024
21 	3     	0.794575	0.762681	0.819709
(0.8241883798898725,)
cbp {'estimator__max_features': 0.5, 'estimator__max_depth': 10, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 20, 'estimator__n_estimators': 1000, 'estimator__class_weight': 'balanced'}
Best individual is: {'estimator__max_features': 0.5, 'estimator__max_depth': 10, 'estimator__min_samples_leaf': 5, 'estimator__min_samples_split': 20, 'estimator__n_estimators': 1000, 'estimator__class_weight': 'balanced'}
with fitness: 0.8241883798898725
estimator: knn
unique_combinations 336
Types [1, 1, 1, 1] and maxint [5, 6, 3, 1] detected
--- Evolve in 336 possible combinations ---
gen	nevals	avg     	min     	max     
0  	5     	0.728349	0.699398	0.753189
1  	5     	0.733158	0.703808	0.78842 
2  	4     	0.739315	0.678993	0.775224
3  	5     	0.732318	0.713896	0.754946
4  	4     	0.723504	0.663968	0.754946
5  	2     	0.7392  	0.71354 	0.754946
6  	2     	0.738477	0.695622	0.754946
7  	4     	0.737678	0.701343	0.777205
8  	3     	0.734933	0.699171	0.777205
9  	1     	0.75768 	0.701839	0.777205
10 	0     	0.762132	0.701839	0.777205
11 	2     	0.763111	0.72634 	0.777205
12 	4     	0.726617	0.67593 	0.777205
13 	4     	0.735894	0.711275	0.777205
14 	2     	0.728609	0.711275	0.749758
15 	4     	0.726591	0.691991	0.749758
16 	4     	0.701094	0.650055	0.742242
17 	2     	0.733319	0.704893	0.742242
18 	2     	0.722204	0.642465	0.742242
19 	2     	0.729732	0.710871	0.742242
20 	1     	0.735558	0.708823	0.742242
21 	4     	0.718573	0.681926	0.742818
22 	4     	0.73579 	0.684558	0.752722
23 	3     	0.72    	0.676035	0.752202
24 	2     	0.750753	0.747225	0.758339
25 	3     	0.732015	0.702391	0.748774
26 	2     	0.735641	0.710112	0.748774
27 	4     	0.718904	0.69773 	0.748774
28 	3     	0.734688	0.689374	0.748774
29 	2     	0.753876	0.748197	0.769271
30 	2     	0.738207	0.709012	0.769271
31 	3     	0.719145	0.684383	0.754364
32 	1     	0.741529	0.696356	0.754364
33 	5     	0.72871 	0.71174 	0.737569
(0.7884199758303595,)
cbp {'estimator__n_neighbors': 5, 'estimator__leaf_size': 3, 'estimator__p': 2, 'estimator__weights': 'distance'}
Best individual is: {'estimator__n_neighbors': 5, 'estimator__leaf_size': 3, 'estimator__p': 2, 'estimator__weights': 'distance'}
with fitness: 0.7884199758303595
estimator: xgb
unique_combinations 270
Types [1, 1, 2, 1] and maxint [4, 5, 8, 0] detected
--- Evolve in 270 possible combinations ---
gen	nevals	avg     	min     	max     
0  	5     	0.808367	0.791833	0.820858
1  	3     	0.797774	0.774765	0.816199
2  	2     	0.810423	0.801415	0.816199
3  	4     	0.796619	0.756186	0.843127
4  	3     	0.817328	0.788144	0.843127
5  	1     	0.809282	0.747912	0.843127
6  	2     	0.81195 	0.798454	0.843127
7  	1     	0.818529	0.790227	0.843127
8  	2     	0.820652	0.764773	0.843127
9  	4     	0.797062	0.737579	0.843127
10 	2     	0.813633	0.791774	0.843127
11 	5     	0.800168	0.77124 	0.851061
12 	1     	0.818479	0.78652 	0.851061
13 	0     	0.831387	0.789613	0.851061
14 	2     	0.832532	0.791779	0.851061
15 	2     	0.838702	0.804821	0.851061
16 	3     	0.828386	0.796409	0.851061
17 	2     	0.833577	0.812072	0.851061
18 	1     	0.834974	0.819057	0.851061
19 	3     	0.820793	0.795979	0.851061
20 	2     	0.83296 	0.808224	0.851061
21 	4     	0.81702 	0.782231	0.844766
22 	3     	0.82765 	0.80363 	0.844766
23 	2     	0.833485	0.825278	0.844766
24 	3     	0.827005	0.802584	0.853273
25 	1     	0.835164	0.822451	0.853273
26 	2     	0.820604	0.776367	0.853273
27 	3     	0.814249	0.79489 	0.833364
(0.8510609205371467,)
cbp {'estimator__max_depth': 20, 'estimator__min_child_weight': 15, 'estimator__learning_rate': 0.3, 'estimator__n_estimators': 100}
Best individual is: {'estimator__max_depth': 20, 'estimator__min_child_weight': 15, 'estimator__learning_rate': 0.3, 'estimator__n_estimators': 100}
with fitness: 0.8510609205371467
best: Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('estimator', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
       max_depth=20, min_child_weight=15, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])






    Out[2]:





Pipeline(memory=None,
     steps=[('scaler', Normalizer(copy=True, norm='l2')), ('estimator', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
       max_depth=20, min_child_weight=15, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])



In [3]:

    
df_test = pd.read_csv('titanic_test.csv')
X_test = df_test.drop(['Survived','PassengerId'], axis=1)
X_test = toy.featurizer.imputer.transform(X_test)
y_test = df_test['Survived']



In [9]:

    
X_test









    Out[9]:







  
    
      
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      3
      Braund, Mr. Owen Harris
      male
      22.000000
      1
      0
      A/5 21171
      7.2500
      B96 B98
      S
    
    
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.000000
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      Heikkinen, Miss. Laina
      female
      26.000000
      0
      0
      STON/O2. 3101282
      7.9250
      B96 B98
      S
    
    
      3
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.000000
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      3
      Allen, Mr. William Henry
      male
      35.000000
      0
      0
      373450
      8.0500
      B96 B98
      S
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      886
      2
      Montvila, Rev. Juozas
      male
      27.000000
      0
      0
      211536
      13.0000
      B96 B98
      S
    
    
      887
      1
      Graham, Miss. Margaret Edith
      female
      19.000000
      0
      0
      112053
      30.0000
      B42
      S
    
    
      888
      3
      Johnston, Miss. Catherine Helen "Carrie"
      female
      29.699118
      1
      2
      W./C. 6607
      23.4500
      B96 B98
      S
    
    
      889
      1
      Behr, Mr. Karl Howell
      male
      26.000000
      0
      0
      111369
      30.0000
      C148
      C
    
    
      890
      3
      Dooley, Mr. Patrick
      male
      32.000000
      0
      0
      370376
      7.7500
      B96 B98
      Q
    
  

891 rows × 10 columns



In [5]:

    
from interpret import show
from interpret.perf import ROC

blackbox_perf = ROC(toy.predict_proba).explain_perf(X_test, y_test, name='Blackbox')









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()

KeyError: 'Age'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-5-9263b995b073> in <module>
      2 from interpret.perf import ROC
      3 
----> 4 blackbox_perf = ROC(toy.predict_proba).explain_perf(X_test, y_test, name='Blackbox')

~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/interpret/perf/curve.py in explain_perf(self, X, y, name)
     73             X, y, self.feature_names, self.feature_types
     74         )
---> 75         predict_fn = unify_predict_fn(self.predict_fn, X)
     76         scores = predict_fn(X)
     77 

~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/interpret/utils/all.py in unify_predict_fn(predict_fn, X)
    188 
    189 def unify_predict_fn(predict_fn, X):
--> 190     predictions = predict_fn(X[:1])
    191     if predictions.ndim == 2:
    192         new_predict_fn = lambda x: predict_fn(x)[:, 1]  # noqa: E731

~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/xtoy/toys.py in predict_proba(self, X)
    136 
    137     def predict_proba(self, X):
--> 138         X = self.featurizer.transform(X).A
    139         return self.best_evo.predict_proba(X)
    140 

~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/xtoy/prep.py in transform(self, X, y)
    218         date_groups = []
    219         for col in self.missing_col_names:
--> 220             res.append(pd.isnull(X[col]))
    221         X = self.imputer.transform(X)
    222         for i, col in enumerate(X):

~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2993             if self.columns.nlevels > 1:
   2994                 return self._getitem_multilevel(key)
-> 2995             indexer = self.columns.get_loc(key)
   2996             if is_integer(indexer):
   2997                 indexer = [indexer]

~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
    377             except ValueError:
    378                 raise KeyError(key)
--> 379         return super().get_loc(key, method=method, tolerance=tolerance)
    380 
    381     @Appender(_index_shared_docs["get_indexer"])

~/Documents/Workspace/interpret/venv/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2897                 return self._engine.get_loc(key)
   2898             except KeyError:
-> 2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2900         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2901         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._check_type()

KeyError: 'Age'



In [ ]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	3	Braund, Mr. Owen Harris	male	22.000000	1	0	A/5 21171	7.2500	B96 B98	S
1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.000000	1	0	PC 17599	71.2833	C85	C
2	3	Heikkinen, Miss. Laina	female	26.000000	0	0	STON/O2. 3101282	7.9250	B96 B98	S
3	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.000000	1	0	113803	53.1000	C123	S
4	3	Allen, Mr. William Henry	male	35.000000	0	0	373450	8.0500	B96 B98	S
...	...	...	...	...	...	...	...	...	...	...
886	2	Montvila, Rev. Juozas	male	27.000000	0	0	211536	13.0000	B96 B98	S
887	1	Graham, Miss. Margaret Edith	female	19.000000	0	0	112053	30.0000	B42	S
888	3	Johnston, Miss. Catherine Helen "Carrie"	female	29.699118	1	2	W./C. 6607	23.4500	B96 B98	S
889	1	Behr, Mr. Karl Howell	male	26.000000	0	0	111369	30.0000	C148	C
890	3	Dooley, Mr. Patrick	male	32.000000	0	0	370376	7.7500	B96 B98	Q