In [1]:
%matplotlib inline
from sklearn.externals import joblib
from sklearn_model_eval import plots
from sklearn_model_eval import tables
import pandas as pd
import numpy as np
import json
from sklearn.metrics import classification_report
from sklearn import cross_validation
In [18]:
model = joblib.load('model/model.pkl')
with open('model/features.json', "r") as f:
features = json.load(f)
train_y = np.load('model/train_y.npy').astype(int)
predict_y = np.load('model/predicted_y.npy').astype(int)
predict_scores = np.load('model/predicted_scores.npy')
df_raw = pd.read_csv('raw_data/train.csv', index_col='PassengerId')
df = pd.read_csv('data/train.csv', index_col='id')
df = df.join(df_raw)
train_x = df.drop(['Survived'], axis=1).values
---------------------------------------------------------------------------
IOError Traceback (most recent call last)
<ipython-input-18-1c6e369a086d> in <module>()
6 train_y = np.load('model/train_y.npy').astype(int)
7 predict_y = np.load('model/predicted_y.npy').astype(int)
----> 8 predict_scores = np.load('model/predicted_scores.npy')
9
10 df_raw = pd.read_csv('raw_data/train.csv', index_col='PassengerId')
/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/numpy/lib/npyio.pyc in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
360 own_fid = False
361 if isinstance(file, basestring):
--> 362 fid = open(file, "rb")
363 own_fid = True
364 else:
IOError: [Errno 2] No such file or directory: 'model/predicted_scores.npy'
In [3]:
def error_type(y_true, y_predicted):
if y_true==0 and y_predicted==0:
return 'TN'
elif y_true==1 and y_predicted==1:
return 'TP'
elif y_true==1 and y_predicted==0:
return 'FN'
elif y_true==0 and y_predicted==1:
return 'FP'
else:
return 'E'
In [4]:
pt = [error_type(*t) for t in zip(train_y, predict_y)]
df['pred_type'] = pt
df.head()
Out[4]:
age
fare
parents_and_children
p_class
siblings_and_spouses
survived
fam_size
fam_mul_size
fare_mul_pclass
fare_mul_age
...
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
pred_type
id
1
22
7.2500
0
3
1
0
1
0
2.416667
159.5000
...
Braund, Mr. Owen Harris
male
22
1
0
A/5 21171
7.2500
NaN
S
TN
2
38
71.2833
0
1
1
1
1
0
71.283300
2708.7654
...
Cumings, Mrs. John Bradley (Florence Briggs Th...
female
38
1
0
PC 17599
71.2833
C85
C
TP
3
26
7.9250
0
3
0
1
0
0
2.641667
206.0500
...
Heikkinen, Miss. Laina
female
26
0
0
STON/O2. 3101282
7.9250
NaN
S
TP
4
35
53.1000
0
1
1
1
1
0
53.100000
1858.5000
...
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35
1
0
113803
53.1000
C123
S
TP
5
35
8.0500
0
3
0
0
0
0
2.683333
281.7500
...
Allen, Mr. William Henry
male
35
0
0
373450
8.0500
NaN
S
TN
5 rows × 37 columns
In [5]:
df.describe()
Out[5]:
age
fare
parents_and_children
p_class
siblings_and_spouses
survived
fam_size
fam_mul_size
fare_mul_pclass
fare_mul_age
...
sex_female
sex_male
social_status_high
social_status_normal
Survived
Pclass
Age
SibSp
Parch
Fare
count
891.000000
891.000000
891.000000
891.000000
891.000000
891.000000
891.000000
891.000000
891.000000
891.000000
...
891.000000
891.000000
891.000000
891.000000
891.000000
891.000000
714.000000
891.000000
891.000000
891.000000
mean
29.188182
32.204208
0.381594
2.308642
0.523008
0.383838
0.904602
0.567901
25.046650
1022.118312
...
0.352413
0.647587
0.051627
0.948373
0.383838
2.308642
29.699118
0.523008
0.381594
32.204208
std
13.337887
49.693429
0.806057
0.836071
1.102743
0.486592
1.613459
1.979287
51.245594
1837.298581
...
0.477990
0.477990
0.221398
0.221398
0.486592
0.836071
14.526497
1.102743
0.806057
49.693429
min
0.420000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
...
0.000000
0.000000
0.000000
0.000000
0.000000
1.000000
0.420000
0.000000
0.000000
0.000000
25%
22.000000
7.910400
0.000000
2.000000
0.000000
0.000000
0.000000
0.000000
2.641667
193.750000
...
0.000000
0.000000
0.000000
1.000000
0.000000
2.000000
20.125000
0.000000
0.000000
7.910400
50%
26.000000
14.454200
0.000000
3.000000
0.000000
0.000000
0.000000
0.000000
6.325000
354.200000
...
0.000000
1.000000
0.000000
1.000000
0.000000
3.000000
28.000000
0.000000
0.000000
14.454200
75%
36.000000
31.000000
0.000000
3.000000
1.000000
1.000000
1.000000
0.000000
23.183333
1115.550000
...
1.000000
1.000000
0.000000
1.000000
1.000000
3.000000
38.000000
1.000000
0.000000
31.000000
max
80.000000
512.329200
6.000000
3.000000
8.000000
1.000000
10.000000
16.000000
512.329200
18443.851200
...
1.000000
1.000000
1.000000
1.000000
1.000000
3.000000
80.000000
8.000000
6.000000
512.329200
8 rows × 31 columns
In [6]:
df[df.pred_type=='FP']
Out[6]:
age
fare
parents_and_children
p_class
siblings_and_spouses
survived
fam_size
fam_mul_size
fare_mul_pclass
fare_mul_age
...
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
pred_type
id
265
22
7.7500
0
3
0
0
0
0
2.583333
170.500
...
Henry, Miss. Delia
female
NaN
0
0
382649
7.7500
NaN
Q
FP
529
39
7.9250
0
3
0
0
0
0
2.641667
309.075
...
Salonen, Mr. Johan Werner
male
39
0
0
3101296
7.9250
NaN
S
FP
575
16
8.0500
0
3
0
0
0
0
2.683333
128.800
...
Rush, Mr. Alfred George John
male
16
0
0
A/4. 20589
8.0500
NaN
S
FP
827
25
56.4958
0
3
0
0
0
0
18.831933
1412.395
...
Lam, Mr. Len
male
NaN
0
0
1601
56.4958
NaN
S
FP
4 rows × 37 columns
In [7]:
df[df.pred_type=='FP'].describe()
Out[7]:
age
fare
parents_and_children
p_class
siblings_and_spouses
survived
fam_size
fam_mul_size
fare_mul_pclass
fare_mul_age
...
sex_female
sex_male
social_status_high
social_status_normal
Survived
Pclass
Age
SibSp
Parch
Fare
count
4.000000
4.000000
4
4
4
4
4
4
4.000000
4.00000
...
4.00
4.00
4
4
4
4
2.000000
4
4
4.000000
mean
25.500000
20.055200
0
3
0
0
0
0
6.685067
505.19250
...
0.25
0.75
0
1
0
3
27.500000
0
0
20.055200
std
9.746794
24.294045
0
0
0
0
0
0
8.098015
609.69086
...
0.50
0.50
0
0
0
0
16.263456
0
0
24.294045
min
16.000000
7.750000
0
3
0
0
0
0
2.583333
128.80000
...
0.00
0.00
0
1
0
3
16.000000
0
0
7.750000
25%
20.500000
7.881250
0
3
0
0
0
0
2.627083
160.07500
...
0.00
0.75
0
1
0
3
21.750000
0
0
7.881250
50%
23.500000
7.987500
0
3
0
0
0
0
2.662500
239.78750
...
0.00
1.00
0
1
0
3
27.500000
0
0
7.987500
75%
28.500000
20.161450
0
3
0
0
0
0
6.720483
584.90500
...
0.25
1.00
0
1
0
3
33.250000
0
0
20.161450
max
39.000000
56.495800
0
3
0
0
0
0
18.831933
1412.39500
...
1.00
1.00
0
1
0
3
39.000000
0
0
56.495800
8 rows × 31 columns
In [8]:
df[df.pred_type=='FN']
Out[8]:
age
fare
parents_and_children
p_class
siblings_and_spouses
survived
fam_size
fam_mul_size
fare_mul_pclass
fare_mul_age
...
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
pred_type
id
37
25
7.2292
0
3
0
1
0
0
2.409733
180.730
...
Mamee, Mr. Hanna
male
NaN
0
0
2677
7.2292
NaN
C
FN
108
25
7.7750
0
3
0
1
0
0
2.591667
194.375
...
Moss, Mr. Albert Johan
male
NaN
0
0
312991
7.7750
NaN
S
FN
227
19
10.5000
0
2
0
1
0
0
5.250000
199.500
...
Mellors, Mr. William John
male
19
0
0
SW/PP 751
10.5000
NaN
S
FN
268
25
7.7750
0
3
1
1
1
0
2.591667
194.375
...
Persson, Mr. Ernst Ulrik
male
25
1
0
347083
7.7750
NaN
S
FN
284
19
8.0500
0
3
0
1
0
0
2.683333
152.950
...
Dorking, Mr. Edward Arthur
male
19
0
0
A/5. 10482
8.0500
NaN
S
FN
289
42
13.0000
0
2
0
1
0
0
6.500000
546.000
...
Hosono, Mr. Masabumi
male
42
0
0
237798
13.0000
NaN
S
FN
580
32
7.9250
0
3
0
1
0
0
2.641667
253.600
...
Jussila, Mr. Eiriik
male
32
0
0
STON/O 2. 3101286
7.9250
NaN
S
FN
674
31
13.0000
0
2
0
1
0
0
6.500000
403.000
...
Wilhelms, Mr. Charles
male
31
0
0
244270
13.0000
NaN
S
FN
829
25
7.7500
0
3
0
1
0
0
2.583333
193.750
...
McCormack, Mr. Thomas Joseph
male
NaN
0
0
367228
7.7500
NaN
Q
FN
9 rows × 37 columns
In [9]:
df[df.pred_type=='FN'].describe()
Out[9]:
age
fare
parents_and_children
p_class
siblings_and_spouses
survived
fam_size
fam_mul_size
fare_mul_pclass
fare_mul_age
...
sex_female
sex_male
social_status_high
social_status_normal
Survived
Pclass
Age
SibSp
Parch
Fare
count
9.000000
9.000000
9
9.000000
9.000000
9
9.000000
9
9.000000
9.000000
...
9
9
9
9
9
9.000000
6.000000
9.000000
9
9.000000
mean
27.000000
9.222689
0
2.666667
0.111111
1
0.111111
0
3.750156
257.586667
...
0
1
0
1
1
2.666667
28.000000
0.111111
0
9.222689
std
7.158911
2.333487
0
0.500000
0.333333
0
0.333333
0
1.788228
130.690269
...
0
0
0
0
0
0.500000
8.854377
0.333333
0
2.333487
min
19.000000
7.229200
0
2.000000
0.000000
1
0.000000
0
2.409733
152.950000
...
0
1
0
1
1
2.000000
19.000000
0.000000
0
7.229200
25%
25.000000
7.775000
0
2.000000
0.000000
1
0.000000
0
2.591667
193.750000
...
0
1
0
1
1
2.000000
20.500000
0.000000
0
7.775000
50%
25.000000
7.925000
0
3.000000
0.000000
1
0.000000
0
2.641667
194.375000
...
0
1
0
1
1
3.000000
28.000000
0.000000
0
7.925000
75%
31.000000
10.500000
0
3.000000
0.000000
1
0.000000
0
5.250000
253.600000
...
0
1
0
1
1
3.000000
31.750000
0.000000
0
10.500000
max
42.000000
13.000000
0
3.000000
1.000000
1
1.000000
0
6.500000
546.000000
...
0
1
0
1
1
3.000000
42.000000
1.000000
0
13.000000
8 rows × 31 columns
In [10]:
df[df.pred_type=='FN'].describe()
Out[10]:
age
fare
parents_and_children
p_class
siblings_and_spouses
survived
fam_size
fam_mul_size
fare_mul_pclass
fare_mul_age
...
sex_female
sex_male
social_status_high
social_status_normal
Survived
Pclass
Age
SibSp
Parch
Fare
count
9.000000
9.000000
9
9.000000
9.000000
9
9.000000
9
9.000000
9.000000
...
9
9
9
9
9
9.000000
6.000000
9.000000
9
9.000000
mean
27.000000
9.222689
0
2.666667
0.111111
1
0.111111
0
3.750156
257.586667
...
0
1
0
1
1
2.666667
28.000000
0.111111
0
9.222689
std
7.158911
2.333487
0
0.500000
0.333333
0
0.333333
0
1.788228
130.690269
...
0
0
0
0
0
0.500000
8.854377
0.333333
0
2.333487
min
19.000000
7.229200
0
2.000000
0.000000
1
0.000000
0
2.409733
152.950000
...
0
1
0
1
1
2.000000
19.000000
0.000000
0
7.229200
25%
25.000000
7.775000
0
2.000000
0.000000
1
0.000000
0
2.591667
193.750000
...
0
1
0
1
1
2.000000
20.500000
0.000000
0
7.775000
50%
25.000000
7.925000
0
3.000000
0.000000
1
0.000000
0
2.641667
194.375000
...
0
1
0
1
1
3.000000
28.000000
0.000000
0
7.925000
75%
31.000000
10.500000
0
3.000000
0.000000
1
0.000000
0
5.250000
253.600000
...
0
1
0
1
1
3.000000
31.750000
0.000000
0
10.500000
max
42.000000
13.000000
0
3.000000
1.000000
1
1.000000
0
6.500000
546.000000
...
0
1
0
1
1
3.000000
42.000000
1.000000
0
13.000000
8 rows × 31 columns
In [11]:
print(classification_report(train_y, predict_y, target_names=['Not survived', 'Survived']))
precision recall f1-score support
Not survived 0.98 0.99 0.99 549
Survived 0.99 0.97 0.98 342
avg / total 0.99 0.99 0.99 891
In [12]:
plots.feature_importance(model, features)
/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
if self._edgecolors == str('face'):
Out[12]:
In [13]:
tables.feature_importances(model, features)
Out[13]:
name importance std
sex_male 0.130496 0.121147
sex_female 0.119978 0.113127
fare_div_age 0.102264 0.0494655
fare_mul_age 0.0927848 0.0318357
pclass_mul_age 0.0922766 0.051091
fare_mul_pclass 0.0821204 0.0403248
fare 0.0741329 0.0299099
pclass_div_age 0.0677291 0.0258095
age 0.0606519 0.0230871
p_class 0.0367528 0.0386757
fam_size 0.0320025 0.0220324
siblings_and_spouses 0.0204828 0.0184914
cabin_U 0.0190486 0.0262712
fam_mul_size 0.0189731 0.0157724
parents_and_children 0.0120584 0.00992616
social_status_normal 0.00884819 0.0135133
social_status_high 0.00677494 0.00966199
cabin_E 0.00635245 0.00742544
cabin_B 0.00424109 0.00790835
cabin_D 0.00397173 0.00596327
cabin_C 0.00381588 0.00336495
cabin_A 0.0023138 0.00420524
cabin_F 0.00112749 0.00199341
cabin_G 0.000802325 0.00200635
In [14]:
df.pred_type.value_counts()
Out[14]:
TN 545
TP 333
FN 9
FP 4
Name: pred_type, dtype: int64
In [15]:
plots.confusion_matrix_(train_y, predict_y, target_names=[0,1])
Out[15]:
In [16]:
plots.precision_recall(train_y, predict_scores[:,1])
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-16-e2ab6f13013c> in <module>()
----> 1 plots.precision_recall(train_y, predict_scores[:,1])
NameError: name 'predict_scores' is not defined
In [17]:
plots.roc(train_y, predict_scores[:,1])
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-17-0e7c8f6510f6> in <module>()
----> 1 plots.roc(train_y, predict_scores[:,1])
NameError: name 'predict_scores' is not defined
In [ ]:
Content source: edublancas/titanic
Similar notebooks: