Data explore


In [262]:
import pandas as pd #数据分析
import numpy as np #科学计算
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
data_train = pd.read_csv("./input/titanic/train.csv")
data_train.columns


Out[262]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [263]:
data_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

In [264]:
data_train.describe()


Out[264]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

In [265]:
fig=plt.figure()
fig.set_alpha(0.5)
plt.subplot2grid((2,3),(0,0))
data_train.Survived.value_counts().plot(kind = 'bar')
plt.title(u'获救人数1为获救')
plt.ylabel(u'人数')
plt.subplot2grid((2,3),(0,1))
data_train.Pclass.value_counts().plot(kind='bar')
plt.title(u'乘客等级分布')
plt.ylabel(u'人数')
plt.subplot2grid((2,3),(0,2))
plt.scatter(data_train.Survived, data_train.Age)
plt.ylabel(u"年龄")                         # sets the y axis lable
plt.grid(b=True, which='major', axis='y') # formats the grid line style of our graphs
plt.title(u"按年龄看获救分布 (1为获救)")
plt.subplot2grid((2,3),(1,0), colspan=2)
data_train.Age[data_train.Pclass == 1].plot(kind='kde')   # plots a kernel desnsity estimate of the subset of the 1st class passanges's age
data_train.Age[data_train.Pclass == 2].plot(kind='kde')
data_train.Age[data_train.Pclass == 3].plot(kind='kde')
plt.xlabel(u"年龄")# plots an axis lable
plt.ylabel(u"密度") 
plt.title(u"各等级的乘客年龄分布")
plt.legend((u'头等舱', u'2等舱',u'3等舱'),loc='best') # sets our legend for our graph.

plt.subplot2grid((2,3),(1,2))
data_train.Embarked.value_counts().plot(kind='bar')
plt.title(u"各登船口岸上船人数")
plt.ylabel(u"人数")  


plt.show()



In [266]:
#看看各乘客等级的获救情况
fig = plt.figure()
fig.set(alpha=0.2)  # 设定图表颜色alpha参数

Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()
df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
df.plot(kind='bar', stacked=True)
plt.title(u"各乘客等级的获救情况")
plt.xlabel(u"乘客等级") 
plt.ylabel(u"人数") 

plt.show()


<matplotlib.figure.Figure at 0xd43e0b8>

In [267]:
#看看各登录港口的获救情况
fig = plt.figure()
fig.set(alpha=0.2)  # 设定图表颜色alpha参数

Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Embarked[data_train.Survived == 1].value_counts()
df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
df.plot(kind='bar', stacked=True)
plt.title(u"各登录港口乘客的获救情况")
plt.xlabel(u"登录港口") 
plt.ylabel(u"人数") 

plt.show()


<matplotlib.figure.Figure at 0xa8d5e10>

In [268]:
#看看各性别的获救情况
fig = plt.figure()
fig.set(alpha=0.2)  # 设定图表颜色alpha参数

Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
df=pd.DataFrame({u'男性':Survived_m, u'女性':Survived_f})
df.plot(kind='bar', stacked=True)
plt.title(u"按性别看获救情况")
plt.xlabel(u"性别") 
plt.ylabel(u"人数")
plt.show()


<matplotlib.figure.Figure at 0xd482208>

In [269]:
fig=plt.figure()
fig.set(alpha=0.65) # 设置图像透明度,无所谓
plt.title(u"根据舱等级和性别的获救情况")

ax1=fig.add_subplot(141)
data_train.Survived[data_train.Sex == 'female'][data_train.Pclass != 3].value_counts().plot(kind='bar', label="female highclass", color='#FA2479')
ax1.set_xticklabels([u"获救", u"未获救"], rotation=0)
ax1.legend([u"女性/高级舱"], loc='best')

ax2=fig.add_subplot(142, sharey=ax1)
data_train.Survived[data_train.Sex == 'female'][data_train.Pclass == 3].value_counts().plot(kind='bar', label='female, low class', color='pink')
ax2.set_xticklabels([u"未获救", u"获救"], rotation=0)
plt.legend([u"女性/低级舱"], loc='best')

ax3=fig.add_subplot(143, sharey=ax1)
data_train.Survived[data_train.Sex == 'male'][data_train.Pclass != 3].value_counts().plot(kind='bar', label='male, high class',color='lightblue')
ax3.set_xticklabels([u"未获救", u"获救"], rotation=0)
plt.legend([u"男性/高级舱"], loc='best')

ax4=fig.add_subplot(144, sharey=ax1)
data_train.Survived[data_train.Sex == 'male'][data_train.Pclass == 3].value_counts().plot(kind='bar', label='male low class', color='steelblue')
ax4.set_xticklabels([u"未获救", u"获救"], rotation=0)
plt.legend([u"男性/低级舱"], loc='best')

plt.show()



In [270]:
g = data_train.groupby(['SibSp','Survived'])
df = pd.DataFrame(g.count()['PassengerId'])
df


Out[270]:
PassengerId
SibSp Survived
0 0 398
1 210
1 0 97
1 112
2 0 15
1 13
3 0 12
1 4
4 0 15
1 3
5 0 5
8 0 7

In [271]:
g = data_train.groupby(['Parch','Survived'])
df = pd.DataFrame(g.count()['PassengerId'])
df


Out[271]:
PassengerId
Parch Survived
0 0 445
1 233
1 0 53
1 65
2 0 40
1 40
3 0 2
1 3
4 0 4
5 0 4
1 1
6 0 1

处理缺失值异常值


In [272]:
from sklearn.ensemble import RandomForestRegressor
 
### 使用 RandomForestClassifier 填补缺失的年龄属性
def set_missing_ages(df):
    
    # 把已有的数值型特征取出来丢进Random Forest Regressor中
    age_df = df[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]

    # 乘客分成已知年龄和未知年龄两部分
    known_age = age_df[age_df.Age.notnull()].as_matrix()
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()

    # y即目标年龄
    y = known_age[:, 0]

    # X即特征属性值
    X = known_age[:, 1:]

    # fit到RandomForestRegressor之中
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    
    # 用得到的模型进行未知年龄结果预测
    predictedAges = rfr.predict(unknown_age[:, 1::])
    
    # 用得到的预测结果填补原缺失数据
    df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges
    
    return df, rfr

def set_Cabin_type(df):
    df.loc[ (df.Cabin.notnull()), 'Cabin' ] = "Yes"
    df.loc[ (df.Cabin.isnull()), 'Cabin' ] = "No"
    return df

data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
data_train


Out[272]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.000000 1 0 A/5 21171 7.2500 No S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.000000 1 0 PC 17599 71.2833 Yes C
2 3 1 3 Heikkinen, Miss. Laina female 26.000000 0 0 STON/O2. 3101282 7.9250 No S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000 1 0 113803 53.1000 Yes S
4 5 0 3 Allen, Mr. William Henry male 35.000000 0 0 373450 8.0500 No S
5 6 0 3 Moran, Mr. James male 23.828953 0 0 330877 8.4583 No Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.000000 0 0 17463 51.8625 Yes S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.000000 3 1 349909 21.0750 No S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.000000 0 2 347742 11.1333 No S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.000000 1 0 237736 30.0708 No C
10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.000000 1 1 PP 9549 16.7000 Yes S
11 12 1 1 Bonnell, Miss. Elizabeth female 58.000000 0 0 113783 26.5500 Yes S
12 13 0 3 Saundercock, Mr. William Henry male 20.000000 0 0 A/5. 2151 8.0500 No S
13 14 0 3 Andersson, Mr. Anders Johan male 39.000000 1 5 347082 31.2750 No S
14 15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.000000 0 0 350406 7.8542 No S
15 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.000000 0 0 248706 16.0000 No S
16 17 0 3 Rice, Master. Eugene male 2.000000 4 1 382652 29.1250 No Q
17 18 1 2 Williams, Mr. Charles Eugene male 32.066493 0 0 244373 13.0000 No S
18 19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.000000 1 0 345763 18.0000 No S
19 20 1 3 Masselmani, Mrs. Fatima female 29.518205 0 0 2649 7.2250 No C
20 21 0 2 Fynney, Mr. Joseph J male 35.000000 0 0 239865 26.0000 No S
21 22 1 2 Beesley, Mr. Lawrence male 34.000000 0 0 248698 13.0000 Yes S
22 23 1 3 McGowan, Miss. Anna "Annie" female 15.000000 0 0 330923 8.0292 No Q
23 24 1 1 Sloper, Mr. William Thompson male 28.000000 0 0 113788 35.5000 Yes S
24 25 0 3 Palsson, Miss. Torborg Danira female 8.000000 3 1 349909 21.0750 No S
25 26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.000000 1 5 347077 31.3875 No S
26 27 0 3 Emir, Mr. Farred Chehab male 29.518205 0 0 2631 7.2250 No C
27 28 0 1 Fortune, Mr. Charles Alexander male 19.000000 3 2 19950 263.0000 Yes S
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female 22.380113 0 0 330959 7.8792 No Q
29 30 0 3 Todoroff, Mr. Lalio male 27.947206 0 0 349216 7.8958 No S
... ... ... ... ... ... ... ... ... ... ... ... ...
861 862 0 2 Giles, Mr. Frederick Edward male 21.000000 1 0 28134 11.5000 No S
862 863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.000000 0 0 17466 25.9292 Yes S
863 864 0 3 Sage, Miss. Dorothy Edith "Dolly" female 10.869867 8 2 CA. 2343 69.5500 No S
864 865 0 2 Gill, Mr. John William male 24.000000 0 0 233866 13.0000 No S
865 866 1 2 Bystrom, Mrs. (Karolina) female 42.000000 0 0 236852 13.0000 No S
866 867 1 2 Duran y More, Miss. Asuncion female 27.000000 1 0 SC/PARIS 2149 13.8583 No C
867 868 0 1 Roebling, Mr. Washington Augustus II male 31.000000 0 0 PC 17590 50.4958 Yes S
868 869 0 3 van Melkebeke, Mr. Philemon male 25.977889 0 0 345777 9.5000 No S
869 870 1 3 Johnson, Master. Harold Theodor male 4.000000 1 1 347742 11.1333 No S
870 871 0 3 Balkic, Mr. Cerin male 26.000000 0 0 349248 7.8958 No S
871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.000000 1 1 11751 52.5542 Yes S
872 873 0 1 Carlsson, Mr. Frans Olof male 33.000000 0 0 695 5.0000 Yes S
873 874 0 3 Vander Cruyssen, Mr. Victor male 47.000000 0 0 345765 9.0000 No S
874 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.000000 1 0 P/PP 3381 24.0000 No C
875 876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15.000000 0 0 2667 7.2250 No C
876 877 0 3 Gustafsson, Mr. Alfred Ossian male 20.000000 0 0 7534 9.8458 No S
877 878 0 3 Petroff, Mr. Nedelio male 19.000000 0 0 349212 7.8958 No S
878 879 0 3 Laleff, Mr. Kristo male 27.947206 0 0 349217 7.8958 No S
879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.000000 0 1 11767 83.1583 Yes C
880 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.000000 0 1 230433 26.0000 No S
881 882 0 3 Markun, Mr. Johann male 33.000000 0 0 349257 7.8958 No S
882 883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.000000 0 0 7552 10.5167 No S
883 884 0 2 Banfield, Mr. Frederick James male 28.000000 0 0 C.A./SOTON 34068 10.5000 No S
884 885 0 3 Sutehall, Mr. Henry Jr male 25.000000 0 0 SOTON/OQ 392076 7.0500 No S
885 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.000000 0 5 382652 29.1250 No Q
886 887 0 2 Montvila, Rev. Juozas male 27.000000 0 0 211536 13.0000 No S
887 888 1 1 Graham, Miss. Margaret Edith female 19.000000 0 0 112053 30.0000 Yes S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 16.127950 1 2 W./C. 6607 23.4500 No S
889 890 1 1 Behr, Mr. Karl Howell male 26.000000 0 0 111369 30.0000 Yes C
890 891 0 3 Dooley, Mr. Patrick male 32.000000 0 0 370376 7.7500 No Q

891 rows × 12 columns

onehot编码


In [273]:
# 因为逻辑回归建模时,需要输入的特征都是数值型特征
# 我们先对类目型的特征离散/因子化
# 以Cabin为例,原本一个属性维度,因为其取值可以是['yes','no'],而将其平展开为'Cabin_yes','Cabin_no'两个属性
# 原本Cabin取值为yes的,在此处的'Cabin_yes'下取值为1,在'Cabin_no'下取值为0
# 原本Cabin取值为no的,在此处的'Cabin_yes'下取值为0,在'Cabin_no'下取值为1
# 我们使用pandas的get_dummies来完成这个工作,并拼接在原来的data_train之上,如下所示
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix= 'Cabin')

dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix= 'Embarked')

dummies_Sex = pd.get_dummies(data_train['Sex'], prefix= 'Sex')

dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix= 'Pclass')

df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df


Out[273]:
PassengerId Survived Age SibSp Parch Fare Cabin_No Cabin_Yes Embarked_C Embarked_Q Embarked_S Sex_female Sex_male Pclass_1 Pclass_2 Pclass_3
0 1 0 22.000000 1 0 7.2500 1 0 0 0 1 0 1 0 0 1
1 2 1 38.000000 1 0 71.2833 0 1 1 0 0 1 0 1 0 0
2 3 1 26.000000 0 0 7.9250 1 0 0 0 1 1 0 0 0 1
3 4 1 35.000000 1 0 53.1000 0 1 0 0 1 1 0 1 0 0
4 5 0 35.000000 0 0 8.0500 1 0 0 0 1 0 1 0 0 1
5 6 0 23.828953 0 0 8.4583 1 0 0 1 0 0 1 0 0 1
6 7 0 54.000000 0 0 51.8625 0 1 0 0 1 0 1 1 0 0
7 8 0 2.000000 3 1 21.0750 1 0 0 0 1 0 1 0 0 1
8 9 1 27.000000 0 2 11.1333 1 0 0 0 1 1 0 0 0 1
9 10 1 14.000000 1 0 30.0708 1 0 1 0 0 1 0 0 1 0
10 11 1 4.000000 1 1 16.7000 0 1 0 0 1 1 0 0 0 1
11 12 1 58.000000 0 0 26.5500 0 1 0 0 1 1 0 1 0 0
12 13 0 20.000000 0 0 8.0500 1 0 0 0 1 0 1 0 0 1
13 14 0 39.000000 1 5 31.2750 1 0 0 0 1 0 1 0 0 1
14 15 0 14.000000 0 0 7.8542 1 0 0 0 1 1 0 0 0 1
15 16 1 55.000000 0 0 16.0000 1 0 0 0 1 1 0 0 1 0
16 17 0 2.000000 4 1 29.1250 1 0 0 1 0 0 1 0 0 1
17 18 1 32.066493 0 0 13.0000 1 0 0 0 1 0 1 0 1 0
18 19 0 31.000000 1 0 18.0000 1 0 0 0 1 1 0 0 0 1
19 20 1 29.518205 0 0 7.2250 1 0 1 0 0 1 0 0 0 1
20 21 0 35.000000 0 0 26.0000 1 0 0 0 1 0 1 0 1 0
21 22 1 34.000000 0 0 13.0000 0 1 0 0 1 0 1 0 1 0
22 23 1 15.000000 0 0 8.0292 1 0 0 1 0 1 0 0 0 1
23 24 1 28.000000 0 0 35.5000 0 1 0 0 1 0 1 1 0 0
24 25 0 8.000000 3 1 21.0750 1 0 0 0 1 1 0 0 0 1
25 26 1 38.000000 1 5 31.3875 1 0 0 0 1 1 0 0 0 1
26 27 0 29.518205 0 0 7.2250 1 0 1 0 0 0 1 0 0 1
27 28 0 19.000000 3 2 263.0000 0 1 0 0 1 0 1 1 0 0
28 29 1 22.380113 0 0 7.8792 1 0 0 1 0 1 0 0 0 1
29 30 0 27.947206 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
861 862 0 21.000000 1 0 11.5000 1 0 0 0 1 0 1 0 1 0
862 863 1 48.000000 0 0 25.9292 0 1 0 0 1 1 0 1 0 0
863 864 0 10.869867 8 2 69.5500 1 0 0 0 1 1 0 0 0 1
864 865 0 24.000000 0 0 13.0000 1 0 0 0 1 0 1 0 1 0
865 866 1 42.000000 0 0 13.0000 1 0 0 0 1 1 0 0 1 0
866 867 1 27.000000 1 0 13.8583 1 0 1 0 0 1 0 0 1 0
867 868 0 31.000000 0 0 50.4958 0 1 0 0 1 0 1 1 0 0
868 869 0 25.977889 0 0 9.5000 1 0 0 0 1 0 1 0 0 1
869 870 1 4.000000 1 1 11.1333 1 0 0 0 1 0 1 0 0 1
870 871 0 26.000000 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
871 872 1 47.000000 1 1 52.5542 0 1 0 0 1 1 0 1 0 0
872 873 0 33.000000 0 0 5.0000 0 1 0 0 1 0 1 1 0 0
873 874 0 47.000000 0 0 9.0000 1 0 0 0 1 0 1 0 0 1
874 875 1 28.000000 1 0 24.0000 1 0 1 0 0 1 0 0 1 0
875 876 1 15.000000 0 0 7.2250 1 0 1 0 0 1 0 0 0 1
876 877 0 20.000000 0 0 9.8458 1 0 0 0 1 0 1 0 0 1
877 878 0 19.000000 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
878 879 0 27.947206 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
879 880 1 56.000000 0 1 83.1583 0 1 1 0 0 1 0 1 0 0
880 881 1 25.000000 0 1 26.0000 1 0 0 0 1 1 0 0 1 0
881 882 0 33.000000 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
882 883 0 22.000000 0 0 10.5167 1 0 0 0 1 1 0 0 0 1
883 884 0 28.000000 0 0 10.5000 1 0 0 0 1 0 1 0 1 0
884 885 0 25.000000 0 0 7.0500 1 0 0 0 1 0 1 0 0 1
885 886 0 39.000000 0 5 29.1250 1 0 0 1 0 1 0 0 0 1
886 887 0 27.000000 0 0 13.0000 1 0 0 0 1 0 1 0 1 0
887 888 1 19.000000 0 0 30.0000 0 1 0 0 1 1 0 1 0 0
888 889 0 16.127950 1 2 23.4500 1 0 0 0 1 1 0 0 0 1
889 890 1 26.000000 0 0 30.0000 0 1 1 0 0 0 1 1 0 0
890 891 0 32.000000 0 0 7.7500 1 0 0 1 0 0 1 0 0 1

891 rows × 16 columns


In [274]:
# 接下来我们要接着做一些数据预处理的工作,比如scaling,将一些变化幅度较大的特征化到[-1,1]之内
# 这样可以加速logistic regression的收敛
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
age_scale_param = scaler.fit(df['Age'])
df['Age_scaled'] = scaler.fit_transform(df['Age'], age_scale_param)
fare_scale_param = scaler.fit(df['Fare'])
df['Fare_scaled'] = scaler.fit_transform(df['Fare'], fare_scale_param)
df


D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
Out[274]:
PassengerId Survived Age SibSp Parch Fare Cabin_No Cabin_Yes Embarked_C Embarked_Q Embarked_S Sex_female Sex_male Pclass_1 Pclass_2 Pclass_3 Age_scaled Fare_scaled
0 1 0 22.000000 1 0 7.2500 1 0 0 0 1 0 1 0 0 1 -0.561363 -0.502445
1 2 1 38.000000 1 0 71.2833 0 1 1 0 0 1 0 1 0 0 0.613182 0.786845
2 3 1 26.000000 0 0 7.9250 1 0 0 0 1 1 0 0 0 1 -0.267727 -0.488854
3 4 1 35.000000 1 0 53.1000 0 1 0 0 1 1 0 1 0 0 0.392955 0.420730
4 5 0 35.000000 0 0 8.0500 1 0 0 0 1 0 1 0 0 1 0.392955 -0.486337
5 6 0 23.828953 0 0 8.4583 1 0 0 1 0 0 1 0 0 1 -0.427102 -0.478116
6 7 0 54.000000 0 0 51.8625 0 1 0 0 1 0 1 1 0 0 1.787727 0.395814
7 8 0 2.000000 3 1 21.0750 1 0 0 0 1 0 1 0 0 1 -2.029545 -0.224083
8 9 1 27.000000 0 2 11.1333 1 0 0 0 1 1 0 0 0 1 -0.194318 -0.424256
9 10 1 14.000000 1 0 30.0708 1 0 1 0 0 1 0 0 1 0 -1.148636 -0.042956
10 11 1 4.000000 1 1 16.7000 0 1 0 0 1 1 0 0 0 1 -1.882726 -0.312172
11 12 1 58.000000 0 0 26.5500 0 1 0 0 1 1 0 1 0 0 2.081363 -0.113846
12 13 0 20.000000 0 0 8.0500 1 0 0 0 1 0 1 0 0 1 -0.708181 -0.486337
13 14 0 39.000000 1 5 31.2750 1 0 0 0 1 0 1 0 0 1 0.686591 -0.018709
14 15 0 14.000000 0 0 7.8542 1 0 0 0 1 1 0 0 0 1 -1.148636 -0.490280
15 16 1 55.000000 0 0 16.0000 1 0 0 0 1 1 0 0 1 0 1.861136 -0.326267
16 17 0 2.000000 4 1 29.1250 1 0 0 1 0 0 1 0 0 1 -2.029545 -0.061999
17 18 1 32.066493 0 0 13.0000 1 0 0 0 1 0 1 0 1 0 0.177609 -0.386671
18 19 0 31.000000 1 0 18.0000 1 0 0 0 1 1 0 0 0 1 0.099318 -0.285997
19 20 1 29.518205 0 0 7.2250 1 0 1 0 0 1 0 0 0 1 -0.009459 -0.502949
20 21 0 35.000000 0 0 26.0000 1 0 0 0 1 0 1 0 1 0 0.392955 -0.124920
21 22 1 34.000000 0 0 13.0000 0 1 0 0 1 0 1 0 1 0 0.319546 -0.386671
22 23 1 15.000000 0 0 8.0292 1 0 0 1 0 1 0 0 0 1 -1.075227 -0.486756
23 24 1 28.000000 0 0 35.5000 0 1 0 0 1 0 1 1 0 0 -0.120909 0.066360
24 25 0 8.000000 3 1 21.0750 1 0 0 0 1 1 0 0 0 1 -1.589090 -0.224083
25 26 1 38.000000 1 5 31.3875 1 0 0 0 1 1 0 0 0 1 0.613182 -0.016444
26 27 0 29.518205 0 0 7.2250 1 0 1 0 0 0 1 0 0 1 -0.009459 -0.502949
27 28 0 19.000000 3 2 263.0000 0 1 0 0 1 0 1 1 0 0 -0.781590 4.647001
28 29 1 22.380113 0 0 7.8792 1 0 0 1 0 1 0 0 0 1 -0.533459 -0.489776
29 30 0 27.947206 0 0 7.8958 1 0 0 0 1 0 1 0 0 1 -0.124784 -0.489442
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
861 862 0 21.000000 1 0 11.5000 1 0 0 0 1 0 1 0 1 0 -0.634772 -0.416873
862 863 1 48.000000 0 0 25.9292 0 1 0 0 1 1 0 1 0 0 1.347272 -0.126345
863 864 0 10.869867 8 2 69.5500 1 0 0 0 1 1 0 0 0 1 -1.378416 0.751946
864 865 0 24.000000 0 0 13.0000 1 0 0 0 1 0 1 0 1 0 -0.414545 -0.386671
865 866 1 42.000000 0 0 13.0000 1 0 0 0 1 1 0 0 1 0 0.906818 -0.386671
866 867 1 27.000000 1 0 13.8583 1 0 1 0 0 1 0 0 1 0 -0.194318 -0.369389
867 868 0 31.000000 0 0 50.4958 0 1 0 0 1 0 1 1 0 0 0.099318 0.368295
868 869 0 25.977889 0 0 9.5000 1 0 0 0 1 0 1 0 0 1 -0.269350 -0.457142
869 870 1 4.000000 1 1 11.1333 1 0 0 0 1 0 1 0 0 1 -1.882726 -0.424256
870 871 0 26.000000 0 0 7.8958 1 0 0 0 1 0 1 0 0 1 -0.267727 -0.489442
871 872 1 47.000000 1 1 52.5542 0 1 0 0 1 1 0 1 0 0 1.273863 0.409741
872 873 0 33.000000 0 0 5.0000 0 1 0 0 1 0 1 1 0 0 0.246136 -0.547748
873 874 0 47.000000 0 0 9.0000 1 0 0 0 1 0 1 0 0 1 1.273863 -0.467209
874 875 1 28.000000 1 0 24.0000 1 0 1 0 0 1 0 0 1 0 -0.120909 -0.165189
875 876 1 15.000000 0 0 7.2250 1 0 1 0 0 1 0 0 0 1 -1.075227 -0.502949
876 877 0 20.000000 0 0 9.8458 1 0 0 0 1 0 1 0 0 1 -0.708181 -0.450180
877 878 0 19.000000 0 0 7.8958 1 0 0 0 1 0 1 0 0 1 -0.781590 -0.489442
878 879 0 27.947206 0 0 7.8958 1 0 0 0 1 0 1 0 0 1 -0.124784 -0.489442
879 880 1 56.000000 0 1 83.1583 0 1 1 0 0 1 0 1 0 0 1.934545 1.025945
880 881 1 25.000000 0 1 26.0000 1 0 0 0 1 1 0 0 1 0 -0.341136 -0.124920
881 882 0 33.000000 0 0 7.8958 1 0 0 0 1 0 1 0 0 1 0.246136 -0.489442
882 883 0 22.000000 0 0 10.5167 1 0 0 0 1 1 0 0 0 1 -0.561363 -0.436671
883 884 0 28.000000 0 0 10.5000 1 0 0 0 1 0 1 0 1 0 -0.120909 -0.437007
884 885 0 25.000000 0 0 7.0500 1 0 0 0 1 0 1 0 0 1 -0.341136 -0.506472
885 886 0 39.000000 0 5 29.1250 1 0 0 1 0 1 0 0 0 1 0.686591 -0.061999
886 887 0 27.000000 0 0 13.0000 1 0 0 0 1 0 1 0 1 0 -0.194318 -0.386671
887 888 1 19.000000 0 0 30.0000 0 1 0 0 1 1 0 1 0 0 -0.781590 -0.044381
888 889 0 16.127950 1 2 23.4500 1 0 0 0 1 1 0 0 0 1 -0.992425 -0.176263
889 890 1 26.000000 0 0 30.0000 0 1 1 0 0 0 1 1 0 0 -0.267727 -0.044381
890 891 0 32.000000 0 0 7.7500 1 0 0 1 0 0 1 0 0 1 0.172727 -0.492378

891 rows × 18 columns

建模


In [275]:
# 我们把需要的feature字段取出来,转成numpy格式,使用scikit-learn中的LogisticRegression建模
from sklearn import linear_model

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.as_matrix()

# y即Survival结果
y = train_np[:, 0]

# X即特征属性值
X = train_np[:, 1:]

# fit到RandomForestRegressor之中
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
clf.fit(X, y)
    
clf


Out[275]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)

In [276]:
data_test = pd.read_csv("./input/titanic/test.csv")
data_test.loc[ (data_test.Fare.isnull()), 'Fare' ] = 0
# 接着我们对test_data做和train_data中一致的特征变换
# 首先用同样的RandomForestRegressor模型填上丢失的年龄
tmp_df = data_test[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].as_matrix()
# 根据特征属性X预测年龄并补上
X = null_age[:, 1:]
predictedAges = rfr.predict(X)
data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAges

data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix= 'Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix= 'Pclass')


df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df_test['Age_scaled'] = scaler.fit_transform(df_test['Age'], age_scale_param)
df_test['Fare_scaled'] = scaler.fit_transform(df_test['Fare'], fare_scale_param)
df_test


D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
D:\soft\anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
Out[276]:
PassengerId Age SibSp Parch Fare Cabin_No Cabin_Yes Embarked_C Embarked_Q Embarked_S Sex_female Sex_male Pclass_1 Pclass_2 Pclass_3 Age_scaled Fare_scaled
0 892 34.500000 0 0 7.8292 1 0 0 1 0 0 1 0 0 1 0.307535 -0.496637
1 893 47.000000 1 0 7.0000 1 0 0 0 1 1 0 0 0 1 1.256230 -0.511497
2 894 62.000000 0 0 9.6875 1 0 0 1 0 0 1 0 1 0 2.394665 -0.463335
3 895 27.000000 0 0 8.6625 1 0 0 0 1 0 1 0 0 1 -0.261683 -0.481704
4 896 22.000000 1 1 12.2875 1 0 0 0 1 1 0 0 0 1 -0.641161 -0.416740
5 897 14.000000 0 0 9.2250 1 0 0 0 1 0 1 0 0 1 -1.248326 -0.471623
6 898 30.000000 0 0 7.6292 1 0 0 1 0 1 0 0 0 1 -0.033996 -0.500221
7 899 26.000000 1 1 29.0000 1 0 0 0 1 0 1 0 1 0 -0.337578 -0.117238
8 900 18.000000 0 0 7.2292 1 0 1 0 0 1 0 0 0 1 -0.944743 -0.507390
9 901 21.000000 2 0 24.1500 1 0 0 0 1 0 1 0 0 1 -0.717056 -0.204154
10 902 27.947206 0 0 7.8958 1 0 0 0 1 0 1 0 0 1 -0.189794 -0.495444
11 903 46.000000 0 0 26.0000 1 0 0 0 1 0 1 1 0 0 1.180334 -0.171000
12 904 23.000000 1 0 82.2667 0 1 0 0 1 1 0 1 0 0 -0.565265 0.837349
13 905 63.000000 1 0 26.0000 1 0 0 0 1 0 1 0 1 0 2.470560 -0.171000
14 906 47.000000 1 0 61.1750 0 1 0 0 1 1 0 1 0 0 1.256230 0.459367
15 907 24.000000 1 0 27.7208 1 0 1 0 0 1 0 0 1 0 -0.489370 -0.140162
16 908 35.000000 0 0 12.3500 1 0 0 1 0 0 1 0 1 0 0.345482 -0.415620
17 909 21.000000 0 0 7.2250 1 0 1 0 0 0 1 0 0 1 -0.717056 -0.507465
18 910 27.000000 1 0 7.9250 1 0 0 0 1 1 0 0 0 1 -0.261683 -0.494920
19 911 45.000000 0 0 7.2250 1 0 1 0 0 1 0 0 0 1 1.104439 -0.507465
20 912 55.000000 1 0 59.4000 1 0 1 0 0 0 1 1 0 0 1.863395 0.427557
21 913 9.000000 0 1 3.1708 1 0 0 0 1 0 1 0 0 1 -1.627804 -0.580120
22 914 52.314311 0 0 31.6833 1 0 0 0 1 1 0 1 0 0 1.659563 -0.069151
23 915 21.000000 0 1 61.3792 1 0 1 0 0 0 1 1 0 0 -0.717056 0.463026
24 916 48.000000 1 3 262.3750 0 1 1 0 0 1 0 1 0 0 1.332126 4.065049
25 917 50.000000 1 0 14.5000 1 0 0 0 1 0 1 0 0 1 1.483917 -0.377090
26 918 22.000000 0 1 61.9792 0 1 1 0 0 1 0 1 0 0 -0.641161 0.473779
27 919 22.500000 0 0 7.2250 1 0 1 0 0 0 1 0 0 1 -0.603213 -0.507465
28 920 41.000000 0 0 30.5000 0 1 0 0 1 0 1 1 0 0 0.800856 -0.090356
29 921 23.458621 2 0 21.6792 1 0 1 0 0 0 1 0 0 1 -0.530458 -0.248433
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
388 1280 21.000000 0 0 7.7500 1 0 0 1 0 0 1 0 0 1 -0.717056 -0.498056
389 1281 6.000000 3 1 21.0750 1 0 0 0 1 0 1 0 0 1 -1.855491 -0.259261
390 1282 23.000000 0 0 93.5000 0 1 0 0 1 0 1 1 0 0 -0.565265 1.038659
391 1283 51.000000 0 1 39.4000 0 1 0 0 1 1 0 1 0 0 1.559813 0.069140
392 1284 13.000000 0 2 20.2500 1 0 0 0 1 0 1 0 0 1 -1.324222 -0.274045
393 1285 47.000000 0 0 10.5000 1 0 0 0 1 0 1 0 1 0 1.256230 -0.448774
394 1286 29.000000 3 1 22.0250 1 0 0 0 1 0 1 0 0 1 -0.109891 -0.242236
395 1287 18.000000 1 0 60.0000 0 1 0 0 1 1 0 1 0 0 -0.944743 0.438310
396 1288 24.000000 0 0 7.2500 1 0 0 1 0 0 1 0 0 1 -0.489370 -0.507017
397 1289 48.000000 1 1 79.2000 0 1 1 0 0 1 0 1 0 0 1.332126 0.782391
398 1290 22.000000 0 0 7.7750 1 0 0 0 1 0 1 0 0 1 -0.641161 -0.497608
399 1291 31.000000 0 0 7.7333 1 0 0 1 0 0 1 0 0 1 0.041900 -0.498356
400 1292 30.000000 0 0 164.8667 0 1 0 0 1 1 0 1 0 0 -0.033996 2.317614
401 1293 38.000000 1 0 21.0000 1 0 0 0 1 0 1 0 1 0 0.573169 -0.260605
402 1294 22.000000 0 1 59.4000 1 0 1 0 0 1 0 1 0 0 -0.641161 0.427557
403 1295 17.000000 0 0 47.1000 1 0 0 0 1 0 1 1 0 0 -1.020639 0.207130
404 1296 43.000000 1 0 27.7208 0 1 1 0 0 0 1 1 0 0 0.952648 -0.140162
405 1297 20.000000 0 0 13.8625 0 1 1 0 0 0 1 0 1 0 -0.792952 -0.388515
406 1298 23.000000 1 0 10.5000 1 0 0 0 1 0 1 0 1 0 -0.565265 -0.448774
407 1299 50.000000 1 1 211.5000 0 1 1 0 0 0 1 1 0 0 1.483917 3.153324
408 1300 19.895581 0 0 7.7208 1 0 0 1 0 1 0 0 0 1 -0.800877 -0.498580
409 1301 3.000000 1 1 13.7750 1 0 0 0 1 1 0 0 0 1 -2.083178 -0.390083
410 1302 35.295824 0 0 7.7500 1 0 0 1 0 1 0 0 0 1 0.367934 -0.498056
411 1303 37.000000 1 0 90.0000 0 1 0 1 0 1 0 1 0 0 0.497274 0.975936
412 1304 28.000000 0 0 7.7750 1 0 0 0 1 1 0 0 0 1 -0.185787 -0.497608
413 1305 30.705727 0 0 8.0500 1 0 0 0 1 0 1 0 0 1 0.019566 -0.492680
414 1306 39.000000 0 0 108.9000 0 1 1 0 0 1 0 1 0 0 0.649065 1.314641
415 1307 38.500000 0 0 7.2500 1 0 0 0 1 0 1 0 0 1 0.611117 -0.507017
416 1308 30.705727 0 0 8.0500 1 0 0 0 1 0 1 0 0 1 0.019566 -0.492680
417 1309 25.793502 1 1 22.3583 1 0 1 0 0 0 1 0 0 1 -0.353251 -0.236263

418 rows × 17 columns


In [277]:
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].as_matrix(), 'Survived':predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions.csv", index=False)

In [278]:
# BaseLine 系统构建完成

学习曲线


In [279]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve

# 用sklearn的learning_curve得到training_score和cv_score,使用matplotlib画出learning curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, 
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    画出data在某模型上的learning curve.
    参数解释
    ----------
    estimator : 你用的分类器。
    title : 表格的标题。
    X : 输入的feature,numpy类型
    y : 输入的target vector
    ylim : tuple格式的(ymin, ymax), 设定图像中纵坐标的最低点和最高点
    cv : 做cross-validation的时候,数据分成的份数,其中一份作为cv集,其余n-1份作为training(默认为3份)
    n_jobs : 并行的的任务数(默认1)
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        plt.gca().invert_yaxis()
        plt.grid()
    
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"训练集上得分")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"交叉验证集上得分")
    
        plt.legend(loc="best")
        
        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()
    
    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

In [280]:
pd.DataFrame({"columns":list(train_df.columns)[1:], "coef":list(clf.coef_.T)})


Out[280]:
coef columns
0 [-0.34422839921] SibSp
1 [-0.104931034051] Parch
2 [0.0] Cabin_No
3 [0.902141342399] Cabin_Yes
4 [0.0] Embarked_C
5 [0.0] Embarked_Q
6 [-0.417261046864] Embarked_S
7 [1.95657542024] Sex_female
8 [-0.677420626893] Sex_male
9 [0.341143936557] Pclass_1
10 [0.0] Pclass_2
11 [-1.1941395756] Pclass_3
12 [-0.523782091128] Age_scaled
13 [0.0844324435407] Fare_scaled

Cross validation CV


In [ ]:
from sklearn