notebook.community

Edit and run



In [61]:

    
# coding: utf-8
# http://blog.csdn.net/han_xiaoyang/article/details/49797143



In [62]:

    
# 科学计算
import numpy as np
# 数据分析
import pandas as pd
from pandas import Series, DataFrame



In [63]:

    
data_train = pd.read_csv('input/train.csv')
data_train.head()









    Out[63]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [64]:

    
# PassengerId => 乘客ID
# Pclass => 乘客等级(1/2/3等舱位)
# Name => 乘客姓名
# Sex => 性别
# Age => 年龄
# SibSp => 堂兄弟/妹个数
# Parch => 父母与小孩个数
# Ticket => 船票信息
# Fare => 票价
# Cabin => 客舱
# Embarked => 登船港口
# Survived字段表示的是该乘客是否获救



In [65]:

    
data_train.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB



In [66]:

    
# Age（年龄）属性只有714名乘客有记录
# Cabin（客舱）更是只有204名乘客是已知的



In [67]:

    
data_train.describe()









    Out[67]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
    
    
      25%
      223.500000
      0.000000
      2.000000
      20.125000
      0.000000
      0.000000
      7.910400
    
    
      50%
      446.000000
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      668.500000
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200



In [68]:

    
##############################################################################
# 数据初步分析
##############################################################################



In [69]:

    
# 乘客各属性分布
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(14, 8))
fig.set(alpha=0.2)  # 设定图表颜色alpha参数

plt.subplot2grid((2,3),(0,0))             # 在一张大图里分列几个小图
data_train.Survived.value_counts().plot(kind='bar')# 柱状图 
plt.title(u"获救情况 (1为获救)") # 标题
plt.ylabel(u"人数")  

plt.subplot2grid((2,3),(0,1))
data_train.Pclass.value_counts().plot(kind="bar")
plt.title(u"乘客等级分布")
plt.ylabel(u"人数")

plt.subplot2grid((2,3),(0,2))
plt.scatter(data_train.Survived, data_train.Age)
plt.title(u"按年龄看获救分布 (1为获救)")
plt.ylabel(u"年龄")                         # 设定纵坐标名称
plt.grid(b=True, which='major', axis='y') 

plt.subplot2grid((2,3),(1,0), colspan=2)
data_train.Age[data_train.Pclass == 1].plot(kind='kde')   
data_train.Age[data_train.Pclass == 2].plot(kind='kde')
data_train.Age[data_train.Pclass == 3].plot(kind='kde')
plt.title(u"各等级的乘客年龄分布")
plt.xlabel(u"年龄")# plots an axis lable
plt.ylabel(u"密度") 
plt.legend((u'头等舱', u'2等舱',u'3等舱'),loc='best') # sets our legend for our graph.

plt.subplot2grid((2,3),(1,2))
data_train.Embarked.value_counts().plot(kind='bar')
plt.title(u"各登船口岸上船人数")
plt.ylabel(u"人数")  

plt.show()



In [70]:

    
# 看看各乘客等级的获救情况
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10, 4))
fig.set(alpha=0.2)

Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()
Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
df = pd.DataFrame({u'获救': Survived_1, u'未获救': Survived_0})
df.plot(kind='bar', stacked=True)
plt.title(u'各乘客等级的获救情况')
plt.xlabel(u'乘客等级')
plt.ylabel(u'人数')
plt.show()

data_train.Pclass[data_train.Survived == 1].plot(kind='kde') 
data_train.Pclass[data_train.Survived == 0].plot(kind='kde')
plt.title(u'各乘客等级的获救情况')
plt.xlabel(u'乘客等级')
plt.ylabel(u'密度')
plt.legend((u'获救', u'未获救'),loc='best') # sets our legend for our graph.

plt.show()









    





<matplotlib.figure.Figure at 0xc837710>



In [71]:

    
# 看看各性别的获救情况
fig = plt.figure()
fig.set(alpha=0.2)

Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
df = pd.DataFrame({u'男性':Survived_m, u'女性':Survived_f})
df.plot(kind='bar', stacked=True)
plt.title(u'按性别看获救情况')
plt.xlabel(u'性别')
plt.ylabel(u'人数')
plt.show()









    





<matplotlib.figure.Figure at 0xc58e278>



In [72]:

    
# 各乘客等级下的性别获救情况
fig = plt.figure(figsize=(16, 8))
fig.set(alpha=0.65)
plt.title(u'各乘客等级下的性别获救情况')

Survived_f = data_train.Survived[data_train.Sex == 'female']
Survived_m = data_train.Survived[data_train.Sex == 'male']

ax1 = fig.add_subplot(141)
Survived_f[data_train.Pclass != 3].value_counts().plot(
    kind='bar', label='female, high class', color='#FA2479')
ax1.set_xticklabels([u'获救', u'未获救'], rotation=0)
ax1.legend([u'女性／高级舱'], loc='best')

ax2 = fig.add_subplot(142, sharey=ax1)
Survived_f[data_train.Pclass == 3].value_counts().plot(
    kind='bar', label='female, low class', color='pink')
ax2.set_xticklabels([u'未获救', u'获救'], rotation=0)
plt.legend([u'女性／低级舱'], loc='best')

ax3 = fig.add_subplot(143, sharey=ax1)
Survived_m[data_train.Pclass != 3].value_counts().plot(
    kind='bar', label='male, high class', color='lightblue')
ax3.set_xticklabels([u'未获救', u'获救'])
plt.legend([u'男性／高级舱'], loc='best')

ax4 = fig.add_subplot(144, sharey=ax1)
Survived_m[data_train.Pclass == 3].value_counts().plot(
    kind='bar', label='male, low class', color='steelblue')
ax4.set_xticklabels([u'未获救', u'获救'])
plt.legend([u'男性／低级舱'], loc='best')

plt.show()



In [73]:

    
# 各登船港口的获救情况
fig = plt.figure(figsize=(16, 8))
fig.set(alpha=0.2)

Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Embarked[data_train.Survived == 1].value_counts()
df = pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
df.plot(kind='bar', stacked=True)
plt.title(u'各登船港口的获救情况')
plt.xlabel(u'登陆港口')
plt.ylabel(u'人数')
plt.show()









    





<matplotlib.figure.Figure at 0xc55d908>



In [74]:

    
sibsp = data_train.groupby(['SibSp', 'Survived'])
df = pd.DataFrame(sibsp.count()['PassengerId'])
df









    Out[74]:






  
    
      
      
      PassengerId
    
    
      SibSp
      Survived
      
    
  
  
    
      0
      0
      398
    
    
      1
      210
    
    
      1
      0
      97
    
    
      1
      112
    
    
      2
      0
      15
    
    
      1
      13
    
    
      3
      0
      12
    
    
      1
      4
    
    
      4
      0
      15
    
    
      1
      3
    
    
      5
      0
      5
    
    
      8
      0
      7



In [75]:

    
parch = data_train.groupby(['Parch', 'Survived'])
df = pd.DataFrame(parch.count()['PassengerId'])
df









    Out[75]:






  
    
      
      
      PassengerId
    
    
      Parch
      Survived
      
    
  
  
    
      0
      0
      445
    
    
      1
      233
    
    
      1
      0
      53
    
    
      1
      65
    
    
      2
      0
      40
    
    
      1
      40
    
    
      3
      0
      2
    
    
      1
      3
    
    
      4
      0
      4
    
    
      5
      0
      4
    
    
      1
      1
    
    
      6
      0
      1



In [76]:

    
# cabin只有204个乘客有值，我们先看看它的一个分布
data_train.Cabin.value_counts()









    Out[76]:





G6                 4
C23 C25 C27        4
B96 B98            4
C22 C26            3
D                  3
F33                3
F2                 3
E101               3
B5                 2
E33                2
E67                2
B77                2
C2                 2
C123               2
B49                2
C93                2
C78                2
B57 B59 B63 B66    2
D35                2
B20                2
E121               2
B28                2
B22                2
F G73              2
B51 B53 B55        2
C126               2
D36                2
D17                2
C124               2
C125               2
                  ..
C87                1
D37                1
C95                1
A26                1
B38                1
A34                1
C118               1
F38                1
C106               1
C49                1
C47                1
D21                1
C128               1
B79                1
B30                1
D45                1
D10 D12            1
B3                 1
B102               1
A6                 1
A5                 1
E34                1
C103               1
D56                1
A32                1
E49                1
B19                1
C86                1
E10                1
C46                1
Name: Cabin, dtype: int64



In [77]:

    
fig = plt.figure()
fig.set(alpha=0.2)

Survived_cabin = data_train.Survived[pd.notnull(data_train.Cabin)].value_counts()
Survived_nocabin = data_train.Survived[pd.isnull(data_train.Cabin)].value_counts()
df = pd.DataFrame({u'有':Survived_cabin, u'无':Survived_nocabin}).transpose()
df.plot(kind='bar', stacked=True)
plt.title(u'按Cabin有无看获救情况')
plt.xlabel(u'Cabin有无')
plt.ylabel(u'人数')
plt.legend([u'未获救', u'获救'])
plt.show()









    





<matplotlib.figure.Figure at 0xa4bddd8>



In [78]:

    
##############################################################################  
# 简单数据预处理
##############################################################################



In [79]:

    
from sklearn.ensemble import RandomForestRegressor

# 使用 RandomForestClassifier 填补缺失的年龄属性
def set_missing_ages(df):
    # 把已有的数值型特征取出来丢进Random Forest Regressor中
    age_df = df[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]
    # 乘客分成已知年龄和未知年龄两部分
    known_age = age_df[age_df.Age.notnull()].as_matrix()
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()
    # y即目标年龄
    y = known_age[:, 0]
    # X即特征属性值
    X = known_age[:, 1:]
    # fit到RandomForestRegressor之中
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # 用得到的模型进行未知年龄结果预测
    predictedAges = rfr.predict(unknown_age[:, 1::])
    # 用得到的预测结果填补原缺失数据
    df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges 
    return df, rfr

def set_Cabin_type(df):
    df.loc[(df.Cabin.notnull()), 'Cabin'] = 'Yes'
    df.loc[(df.Cabin.isnull()), 'Cabin'] = 'No'
    return df

data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
data_train.head()









    Out[79]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      No
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      Yes
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      No
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      Yes
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      No
      S



In [80]:

    
data_train.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB



In [81]:

    
# 对类目型的特征因子化
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')

df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df.head()









    Out[81]:






  
    
      
      PassengerId
      Survived
      Age
      SibSp
      Parch
      Fare
      Cabin_No
      Cabin_Yes
      Embarked_C
      Embarked_Q
      Embarked_S
      Sex_female
      Sex_male
      Pclass_1
      Pclass_2
      Pclass_3
    
  
  
    
      0
      1
      0
      22.0
      1
      0
      7.2500
      1
      0
      0
      0
      1
      0
      1
      0
      0
      1
    
    
      1
      2
      1
      38.0
      1
      0
      71.2833
      0
      1
      1
      0
      0
      1
      0
      1
      0
      0
    
    
      2
      3
      1
      26.0
      0
      0
      7.9250
      1
      0
      0
      0
      1
      1
      0
      0
      0
      1
    
    
      3
      4
      1
      35.0
      1
      0
      53.1000
      0
      1
      0
      0
      1
      1
      0
      1
      0
      0
    
    
      4
      5
      0
      35.0
      0
      0
      8.0500
      1
      0
      0
      0
      1
      0
      1
      0
      0
      1



In [82]:

    
import sklearn.preprocessing as preprocessing

scaler = preprocessing.StandardScaler()
age_scale_param = scaler.fit(df['Age'])
df['Age_scaled'] = scaler.fit_transform(df['Age'], age_scale_param)
fare_scale_param = scaler.fit(df['Fare'])
df['Fare_scaled'] = scaler.fit_transform(df['Fare'], fare_scale_param)
df.head()









    



C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)






    Out[82]:






  
    
      
      PassengerId
      Survived
      Age
      SibSp
      Parch
      Fare
      Cabin_No
      Cabin_Yes
      Embarked_C
      Embarked_Q
      Embarked_S
      Sex_female
      Sex_male
      Pclass_1
      Pclass_2
      Pclass_3
      Age_scaled
      Fare_scaled
    
  
  
    
      0
      1
      0
      22.0
      1
      0
      7.2500
      1
      0
      0
      0
      1
      0
      1
      0
      0
      1
      -0.561363
      -0.502445
    
    
      1
      2
      1
      38.0
      1
      0
      71.2833
      0
      1
      1
      0
      0
      1
      0
      1
      0
      0
      0.613182
      0.786845
    
    
      2
      3
      1
      26.0
      0
      0
      7.9250
      1
      0
      0
      0
      1
      1
      0
      0
      0
      1
      -0.267727
      -0.488854
    
    
      3
      4
      1
      35.0
      1
      0
      53.1000
      0
      1
      0
      0
      1
      1
      0
      1
      0
      0
      0.392955
      0.420730
    
    
      4
      5
      0
      35.0
      0
      0
      8.0500
      1
      0
      0
      0
      1
      0
      1
      0
      0
      1
      0.392955
      -0.486337



In [83]:

    
from sklearn import linear_model

# 用正则取出我们要的属性值
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.as_matrix()

# y即Survival结果
y = train_np[:, 0]

# X即特征属性值
X = train_np[:, 1:]

# fit到RandomForestRegressor之中
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
clf.fit(X, y)

clf









    Out[83]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)



In [84]:

    
data_test = pd.read_csv('input/test.csv')
data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0
# 接着我们对test_data做和train_data中一致的特征变换
# 首先用同样的RandomForestRegressor模型填上丢失的年龄
tmp_df = data_test[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].as_matrix()
# 根据特征属性X预测年龄并补上
X = null_age[:, 1:]
predictedAges = rfr.predict(X)
data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAges

data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix= 'Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix= 'Pclass')


df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df_test['Age_scaled'] = scaler.fit_transform(df_test['Age'], age_scale_param)
df_test['Fare_scaled'] = scaler.fit_transform(df_test['Fare'], fare_scale_param)
df_test.head()









    



C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\Programs\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)






    Out[84]:






  
    
      
      PassengerId
      Age
      SibSp
      Parch
      Fare
      Cabin_No
      Cabin_Yes
      Embarked_C
      Embarked_Q
      Embarked_S
      Sex_female
      Sex_male
      Pclass_1
      Pclass_2
      Pclass_3
      Age_scaled
      Fare_scaled
    
  
  
    
      0
      892
      34.5
      0
      0
      7.8292
      1
      0
      0
      1
      0
      0
      1
      0
      0
      1
      0.307535
      -0.496637
    
    
      1
      893
      47.0
      1
      0
      7.0000
      1
      0
      0
      0
      1
      1
      0
      0
      0
      1
      1.256230
      -0.511497
    
    
      2
      894
      62.0
      0
      0
      9.6875
      1
      0
      0
      1
      0
      0
      1
      0
      1
      0
      2.394665
      -0.463335
    
    
      3
      895
      27.0
      0
      0
      8.6625
      1
      0
      0
      0
      1
      0
      1
      0
      0
      1
      -0.261683
      -0.481704
    
    
      4
      896
      22.0
      1
      1
      12.2875
      1
      0
      0
      0
      1
      1
      0
      0
      0
      1
      -0.641161
      -0.416740



In [93]:

    
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].as_matrix(),
                       'Survived':predictions.astype(np.int32)})
result.to_csv('output/titanic_lr_predictions.csv', index=False)
result.head()









    Out[93]:






  
    
      
      PassengerId
      Survived
    
  
  
    
      0
      892
      0
    
    
      1
      893
      0
    
    
      2
      894
      0
    
    
      3
      895
      0
    
    
      4
      896
      1



In [ ]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

		PassengerId
SibSp	Survived
0	0	398
0	1	210
1	0	97
1	1	112
2	0	15
2	1	13
3	0	12
3	1	4
4	0	15
4	1	3
5	0	5
8	0	7

	PassengerId	Survived	Age	SibSp	Fare	Cabin_No	Cabin_Yes	Embarked_C	Embarked_S	Sex_female	Sex_male	Pclass_1	Pclass_3	Age_scaled	Fare_scaled
0	1	0	22.0	1	7.2500	1	0	0	1	0	1	0	1	-0.561363	-0.502445
1	2	1	38.0	1	71.2833	0	1	1	0	1	0	1	0	0.613182	0.786845
2	3	1	26.0	0	7.9250	1	0	0	1	1	0	0	1	-0.267727	-0.488854
3	4	1	35.0	1	53.1000	0	1	0	1	1	0	1	0	0.392955	0.420730
4	5	0	35.0	0	8.0500	1	0	0	1	0	1	0	1	0.392955	-0.486337

	PassengerId	Age	SibSp	Parch	Fare	Cabin_No	Embarked_Q	Embarked_S	Sex_female	Sex_male	Pclass_2	Pclass_3	Age_scaled	Fare_scaled
0	892	34.5	0	0	7.8292	1	1	0	0	1	0	1	0.307535	-0.496637
1	893	47.0	1	0	7.0000	1	0	1	1	0	0	1	1.256230	-0.511497
2	894	62.0	0	0	9.6875	1	1	0	0	1	1	0	2.394665	-0.463335
3	895	27.0	0	0	8.6625	1	0	1	0	1	0	1	-0.261683	-0.481704
4	896	22.0	1	1	12.2875	1	0	1	1	0	0	1	-0.641161	-0.416740