In [80]:
# Adapted from: https://github.com/HanXiaoyang/Kaggle_Titanic/blob/master/Titanic.ipynb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [81]:
data_train = pd.read_csv("train.csv")
print(data_train.describe())


       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  

In [82]:
from sklearn.ensemble import RandomForestRegressor
 
### Use a RandomForestRegressor to fill in the missing Age values
### (describe() above shows Age has only 714 non-null entries out of 891)
def set_missing_ages(df):
    
    # Take the existing numerical features and feed them into the Random Forest Regressor
    age_df = df[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]

    # Split the passengers into known-age and unknown-age groups
    # (.values replaces the deprecated .as_matrix())
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values

    # y is the target: Age
    y = known_age[:, 0]

    # X is the feature matrix
    X = known_age[:, 1:]

    # Fit a RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    
    # Predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])
    
    # Fill the original missing values with the predictions
    df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges
    
    return df, rfr

def set_Cabin_type(df):
    df.loc[ (df.Cabin.notnull()), 'Cabin' ] = "Yes"
    df.loc[ (df.Cabin.isnull()), 'Cabin' ] = "No"
    return df

data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
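As a quick check (not part of the original notebook), we can confirm that the imputation and the Cabin recoding took effect; a minimal sketch:

print(data_train['Age'].isnull().sum())   # expected: 0
print(data_train['Cabin'].unique())       # expected: ['Yes' 'No'] (order may vary)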

In [83]:
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')

dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')

dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')

dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')

df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df


Out[83]:
PassengerId Survived Age SibSp Parch Fare Cabin_No Cabin_Yes Embarked_C Embarked_Q Embarked_S Sex_female Sex_male Pclass_1 Pclass_2 Pclass_3
0 1 0 22.000000 1 0 7.2500 1 0 0 0 1 0 1 0 0 1
1 2 1 38.000000 1 0 71.2833 0 1 1 0 0 1 0 1 0 0
2 3 1 26.000000 0 0 7.9250 1 0 0 0 1 1 0 0 0 1
3 4 1 35.000000 1 0 53.1000 0 1 0 0 1 1 0 1 0 0
4 5 0 35.000000 0 0 8.0500 1 0 0 0 1 0 1 0 0 1
5 6 0 23.838953 0 0 8.4583 1 0 0 1 0 0 1 0 0 1
6 7 0 54.000000 0 0 51.8625 0 1 0 0 1 0 1 1 0 0
7 8 0 2.000000 3 1 21.0750 1 0 0 0 1 0 1 0 0 1
8 9 1 27.000000 0 2 11.1333 1 0 0 0 1 1 0 0 0 1
9 10 1 14.000000 1 0 30.0708 1 0 1 0 0 1 0 0 1 0
10 11 1 4.000000 1 1 16.7000 0 1 0 0 1 1 0 0 0 1
11 12 1 58.000000 0 0 26.5500 0 1 0 0 1 1 0 1 0 0
12 13 0 20.000000 0 0 8.0500 1 0 0 0 1 0 1 0 0 1
13 14 0 39.000000 1 5 31.2750 1 0 0 0 1 0 1 0 0 1
14 15 0 14.000000 0 0 7.8542 1 0 0 0 1 1 0 0 0 1
15 16 1 55.000000 0 0 16.0000 1 0 0 0 1 1 0 0 1 0
16 17 0 2.000000 4 1 29.1250 1 0 0 1 0 0 1 0 0 1
17 18 1 32.066493 0 0 13.0000 1 0 0 0 1 0 1 0 1 0
18 19 0 31.000000 1 0 18.0000 1 0 0 0 1 1 0 0 0 1
19 20 1 29.518205 0 0 7.2250 1 0 1 0 0 1 0 0 0 1
20 21 0 35.000000 0 0 26.0000 1 0 0 0 1 0 1 0 1 0
21 22 1 34.000000 0 0 13.0000 0 1 0 0 1 0 1 0 1 0
22 23 1 15.000000 0 0 8.0292 1 0 0 1 0 1 0 0 0 1
23 24 1 28.000000 0 0 35.5000 0 1 0 0 1 0 1 1 0 0
24 25 0 8.000000 3 1 21.0750 1 0 0 0 1 1 0 0 0 1
25 26 1 38.000000 1 5 31.3875 1 0 0 0 1 1 0 0 0 1
26 27 0 29.518205 0 0 7.2250 1 0 1 0 0 0 1 0 0 1
27 28 0 19.000000 3 2 263.0000 0 1 0 0 1 0 1 1 0 0
28 29 1 22.380113 0 0 7.8792 1 0 0 1 0 1 0 0 0 1
29 30 0 27.947206 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
861 862 0 21.000000 1 0 11.5000 1 0 0 0 1 0 1 0 1 0
862 863 1 48.000000 0 0 25.9292 0 1 0 0 1 1 0 1 0 0
863 864 0 10.869867 8 2 69.5500 1 0 0 0 1 1 0 0 0 1
864 865 0 24.000000 0 0 13.0000 1 0 0 0 1 0 1 0 1 0
865 866 1 42.000000 0 0 13.0000 1 0 0 0 1 1 0 0 1 0
866 867 1 27.000000 1 0 13.8583 1 0 1 0 0 1 0 0 1 0
867 868 0 31.000000 0 0 50.4958 0 1 0 0 1 0 1 1 0 0
868 869 0 25.977889 0 0 9.5000 1 0 0 0 1 0 1 0 0 1
869 870 1 4.000000 1 1 11.1333 1 0 0 0 1 0 1 0 0 1
870 871 0 26.000000 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
871 872 1 47.000000 1 1 52.5542 0 1 0 0 1 1 0 1 0 0
872 873 0 33.000000 0 0 5.0000 0 1 0 0 1 0 1 1 0 0
873 874 0 47.000000 0 0 9.0000 1 0 0 0 1 0 1 0 0 1
874 875 1 28.000000 1 0 24.0000 1 0 1 0 0 1 0 0 1 0
875 876 1 15.000000 0 0 7.2250 1 0 1 0 0 1 0 0 0 1
876 877 0 20.000000 0 0 9.8458 1 0 0 0 1 0 1 0 0 1
877 878 0 19.000000 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
878 879 0 27.947206 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
879 880 1 56.000000 0 1 83.1583 0 1 1 0 0 1 0 1 0 0
880 881 1 25.000000 0 1 26.0000 1 0 0 0 1 1 0 0 1 0
881 882 0 33.000000 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
882 883 0 22.000000 0 0 10.5167 1 0 0 0 1 1 0 0 0 1
883 884 0 28.000000 0 0 10.5000 1 0 0 0 1 0 1 0 1 0
884 885 0 25.000000 0 0 7.0500 1 0 0 0 1 0 1 0 0 1
885 886 0 39.000000 0 5 29.1250 1 0 0 1 0 1 0 0 0 1
886 887 0 27.000000 0 0 13.0000 1 0 0 0 1 0 1 0 1 0
887 888 1 19.000000 0 0 30.0000 0 1 0 0 1 1 0 1 0 0
888 889 0 16.193950 1 2 23.4500 1 0 0 0 1 1 0 0 0 1
889 890 1 26.000000 0 0 30.0000 0 1 1 0 0 0 1 1 0 0
890 891 0 32.000000 0 0 7.7500 1 0 0 1 0 0 1 0 0 1

891 rows × 16 columns


In [84]:
# Next comes some more preprocessing, e.g. scaling: squashing features with a
# large range roughly into [-1, 1], which speeds up the convergence of
# logistic regression. The block below is left commented out; note that without
# the Age_scaled/Fare_scaled columns, the filter(regex=...) step further down
# silently drops Age and Fare from the feature set.
'''
import sklearn.preprocessing as preprocessing
scaler = preprocessing.StandardScaler()
age_scale_param = scaler.fit(df['Age'])
age_scale_param
df['Age_scaled'] = scaler.fit_transform(df['Age'], age_scale_param)
fare_scale_param = scaler.fit(df['Fare'])
df['Fare_scaled'] = scaler.fit_transform(df['Fare'], fare_scale_param)
df
'''



In [85]:
# Pull out the feature columns we need, convert them to a NumPy array, and fit
# a scikit-learn LogisticRegression model
from sklearn import linear_model

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values  # .values replaces the deprecated .as_matrix()

# y is the Survived target
y = train_np[:, 0]

# X is the feature matrix
X = train_np[:, 1:]

# Fit a LogisticRegression model (liblinear supports the L1 penalty; newer
# scikit-learn versions no longer default to this solver)
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
clf.fit(X, y)

clf


Out[85]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)
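To see which features the fitted model leans on, the learned coefficients can be paired with the column names; a minimal sketch, not part of the original cell (train_df.columns[0] is the Survived target, so the features start at index 1):

coef_df = pd.DataFrame({
    'feature': train_df.columns[1:],
    'coef': clf.coef_[0],  # LogisticRegression stores coefficients as a (1, n_features) array
})
print(coef_df.sort_values('coef'))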

In [86]:
data_test = pd.read_csv("test.csv")
data_test.loc[ (data_test.Fare.isnull()), 'Fare' ] = 0  # fill the missing Fare value(s) in the test set
# Now apply the same feature transformations to test_data as we did to train_data.
# First, fill in the missing ages with the same fitted RandomForestRegressor.
tmp_df = data_test[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values
# Predict the missing ages from the feature columns X and fill them in
X = null_age[:, 1:]
predictedAges = rfr.predict(X)
data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAges

data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')


df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
# Left commented out to mirror the training side; if scaling is enabled there,
# reuse the scalers fitted on the training data:
#df_test['Age_scaled'] = age_scaler.transform(df_test[['Age']]).ravel()
#df_test['Fare_scaled'] = fare_scaler.transform(df_test[['Fare']]).ravel()
df_test


Out[86]:
PassengerId Age SibSp Parch Fare Cabin_No Cabin_Yes Embarked_C Embarked_Q Embarked_S Sex_female Sex_male Pclass_1 Pclass_2 Pclass_3
0 892 34.500000 0 0 7.8292 1 0 0 1 0 0 1 0 0 1
1 893 47.000000 1 0 7.0000 1 0 0 0 1 1 0 0 0 1
2 894 62.000000 0 0 9.6875 1 0 0 1 0 0 1 0 1 0
3 895 27.000000 0 0 8.6625 1 0 0 0 1 0 1 0 0 1
4 896 22.000000 1 1 12.2875 1 0 0 0 1 1 0 0 0 1
5 897 14.000000 0 0 9.2250 1 0 0 0 1 0 1 0 0 1
6 898 30.000000 0 0 7.6292 1 0 0 1 0 1 0 0 0 1
7 899 26.000000 1 1 29.0000 1 0 0 0 1 0 1 0 1 0
8 900 18.000000 0 0 7.2292 1 0 1 0 0 1 0 0 0 1
9 901 21.000000 2 0 24.1500 1 0 0 0 1 0 1 0 0 1
10 902 27.947206 0 0 7.8958 1 0 0 0 1 0 1 0 0 1
11 903 46.000000 0 0 26.0000 1 0 0 0 1 0 1 1 0 0
12 904 23.000000 1 0 82.2667 0 1 0 0 1 1 0 1 0 0
13 905 63.000000 1 0 26.0000 1 0 0 0 1 0 1 0 1 0
14 906 47.000000 1 0 61.1750 0 1 0 0 1 1 0 1 0 0
15 907 24.000000 1 0 27.7208 1 0 1 0 0 1 0 0 1 0
16 908 35.000000 0 0 12.3500 1 0 0 1 0 0 1 0 1 0
17 909 21.000000 0 0 7.2250 1 0 1 0 0 0 1 0 0 1
18 910 27.000000 1 0 7.9250 1 0 0 0 1 1 0 0 0 1
19 911 45.000000 0 0 7.2250 1 0 1 0 0 1 0 0 0 1
20 912 55.000000 1 0 59.4000 1 0 1 0 0 0 1 1 0 0
21 913 9.000000 0 1 3.1708 1 0 0 0 1 0 1 0 0 1
22 914 52.314311 0 0 31.6833 1 0 0 0 1 1 0 1 0 0
23 915 21.000000 0 1 61.3792 1 0 1 0 0 0 1 1 0 0
24 916 48.000000 1 3 262.3750 0 1 1 0 0 1 0 1 0 0
25 917 50.000000 1 0 14.5000 1 0 0 0 1 0 1 0 0 1
26 918 22.000000 0 1 61.9792 0 1 1 0 0 1 0 1 0 0
27 919 22.500000 0 0 7.2250 1 0 1 0 0 0 1 0 0 1
28 920 41.000000 0 0 30.5000 0 1 0 0 1 0 1 1 0 0
29 921 23.459683 2 0 21.6792 1 0 1 0 0 0 1 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
388 1280 21.000000 0 0 7.7500 1 0 0 1 0 0 1 0 0 1
389 1281 6.000000 3 1 21.0750 1 0 0 0 1 0 1 0 0 1
390 1282 23.000000 0 0 93.5000 0 1 0 0 1 0 1 1 0 0
391 1283 51.000000 0 1 39.4000 0 1 0 0 1 1 0 1 0 0
392 1284 13.000000 0 2 20.2500 1 0 0 0 1 0 1 0 0 1
393 1285 47.000000 0 0 10.5000 1 0 0 0 1 0 1 0 1 0
394 1286 29.000000 3 1 22.0250 1 0 0 0 1 0 1 0 0 1
395 1287 18.000000 1 0 60.0000 0 1 0 0 1 1 0 1 0 0
396 1288 24.000000 0 0 7.2500 1 0 0 1 0 0 1 0 0 1
397 1289 48.000000 1 1 79.2000 0 1 1 0 0 1 0 1 0 0
398 1290 22.000000 0 0 7.7750 1 0 0 0 1 0 1 0 0 1
399 1291 31.000000 0 0 7.7333 1 0 0 1 0 0 1 0 0 1
400 1292 30.000000 0 0 164.8667 0 1 0 0 1 1 0 1 0 0
401 1293 38.000000 1 0 21.0000 1 0 0 0 1 0 1 0 1 0
402 1294 22.000000 0 1 59.4000 1 0 1 0 0 1 0 1 0 0
403 1295 17.000000 0 0 47.1000 1 0 0 0 1 0 1 1 0 0
404 1296 43.000000 1 0 27.7208 0 1 1 0 0 0 1 1 0 0
405 1297 20.000000 0 0 13.8625 0 1 1 0 0 0 1 0 1 0
406 1298 23.000000 1 0 10.5000 1 0 0 0 1 0 1 0 1 0
407 1299 50.000000 1 1 211.5000 0 1 1 0 0 0 1 1 0 0
408 1300 19.895581 0 0 7.7208 1 0 0 1 0 1 0 0 0 1
409 1301 3.000000 1 1 13.7750 1 0 0 0 1 1 0 0 0 1
410 1302 35.295824 0 0 7.7500 1 0 0 1 0 1 0 0 0 1
411 1303 37.000000 1 0 90.0000 0 1 0 1 0 1 0 1 0 0
412 1304 28.000000 0 0 7.7750 1 0 0 0 1 1 0 0 0 1
413 1305 30.705727 0 0 8.0500 1 0 0 0 1 0 1 0 0 1
414 1306 39.000000 0 0 108.9000 0 1 1 0 0 1 0 1 0 0
415 1307 38.500000 0 0 7.2500 1 0 0 0 1 0 1 0 0 1
416 1308 30.705727 0 0 8.0500 1 0 0 0 1 0 1 0 0 1
417 1309 25.783377 1 1 22.3583 1 0 1 0 0 0 1 0 0 1

418 rows × 15 columns


In [87]:
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].values, 'Survived':predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions.csv", index=False)

In [88]:
# validation
# (placeholder left over from a different exercise; melbourne_model, val_x and
# val_y do not exist in this notebook)
#predicted_home_prices = melbourne_model.predict(val_x)
#print(mean_absolute_error(val_y, predicted_home_prices))
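A minimal validation sketch for this notebook's own model, assuming a scikit-learn version with sklearn.model_selection, is k-fold cross-validation on the X and y built in In [85]:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation accuracy of the logistic regression on the training data
scores = cross_val_score(clf, X, y, cv=5)
print(scores, scores.mean())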

In [ ]: