In [61]:
# coding: utf-8
# http://blog.csdn.net/han_xiaoyang/article/details/49797143
In [62]:
# 科学计算
import numpy as np
# 数据分析
import pandas as pd
from pandas import Series, DataFrame
In [63]:
# Load the training set (path is relative to the notebook's working directory)
# and preview its first rows.
data_train = pd.read_csv('input/train.csv')
data_train.head()
Out[63]:
In [64]:
# PassengerId => 乘客ID
# Pclass => 乘客等级(1/2/3等舱位)
# Name => 乘客姓名
# Sex => 性别
# Age => 年龄
# SibSp => 同船的兄弟姐妹/配偶个数
# Parch => 父母与小孩个数
# Ticket => 船票信息
# Fare => 票价
# Cabin => 客舱
# Embarked => 登船港口
# Survived字段表示的是该乘客是否获救
In [65]:
# Print each column's dtype and non-null count to spot columns with missing values.
data_train.info()
In [66]:
# Age(年龄)属性只有714名乘客有记录
# Cabin(客舱)更是只有204名乘客是已知的
In [67]:
# Summary statistics of the numeric columns.
data_train.describe()
Out[67]:
In [68]:
##############################################################################
# 数据初步分析
##############################################################################
In [69]:
# Distribution of the individual passenger attributes, drawn as five panels
# on a 2x3 grid inside a single figure.
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(14, 8))
fig.set(alpha=0.2)  # alpha setting of the figure

# Panel (0, 0): survival counts as a bar chart
ax_survived = plt.subplot2grid((2, 3), (0, 0))
data_train.Survived.value_counts().plot(kind='bar', ax=ax_survived)
ax_survived.set_title(u"获救情况 (1为获救)")
ax_survived.set_ylabel(u"人数")

# Panel (0, 1): passenger class counts
ax_pclass = plt.subplot2grid((2, 3), (0, 1))
data_train.Pclass.value_counts().plot(kind="bar", ax=ax_pclass)
ax_pclass.set_title(u"乘客等级分布")
ax_pclass.set_ylabel(u"人数")

# Panel (0, 2): age against survival
ax_age = plt.subplot2grid((2, 3), (0, 2))
ax_age.scatter(data_train.Survived, data_train.Age)
ax_age.set_title(u"按年龄看获救分布 (1为获救)")
ax_age.set_ylabel(u"年龄")
# The on/off flag is passed positionally: its keyword was renamed from `b`
# to `visible` in newer matplotlib, so the positional form works on both.
ax_age.grid(True, which='major', axis='y')

# Panel (1, 0-1): age density per passenger class (spans two grid columns)
ax_density = plt.subplot2grid((2, 3), (1, 0), colspan=2)
for pclass in (1, 2, 3):
    data_train.Age[data_train.Pclass == pclass].plot(kind='kde', ax=ax_density)
ax_density.set_title(u"各等级的乘客年龄分布")
ax_density.set_xlabel(u"年龄")
ax_density.set_ylabel(u"密度")
# Legend entries follow the plotting order above (class 1, 2, 3).
ax_density.legend((u'头等舱', u'2等舱', u'3等舱'), loc='best')

# Panel (1, 2): passengers per embarkation port
ax_embarked = plt.subplot2grid((2, 3), (1, 2))
data_train.Embarked.value_counts().plot(kind='bar', ax=ax_embarked)
ax_embarked.set_title(u"各登船口岸上船人数")
ax_embarked.set_ylabel(u"人数")

plt.show()
In [70]:
# Survival broken down by passenger class.
import matplotlib.pyplot as plt

# Stacked bar chart: survivors / non-survivors per class.
# The axes is created explicitly and handed to pandas; otherwise `df.plot`
# opens a figure of its own and the sized figure created here stays empty.
fig, ax = plt.subplots(figsize=(10, 4))
fig.set(alpha=0.2)
Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()
Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
df = pd.DataFrame({u'获救': Survived_1, u'未获救': Survived_0})
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title(u'各乘客等级的获救情况')
ax.set_xlabel(u'乘客等级')
ax.set_ylabel(u'人数')
plt.show()

# Density of the class value for survivors vs. non-survivors, on its own figure.
fig, ax = plt.subplots()
data_train.Pclass[data_train.Survived == 1].plot(kind='kde', ax=ax)
data_train.Pclass[data_train.Survived == 0].plot(kind='kde', ax=ax)
ax.set_title(u'各乘客等级的获救情况')
ax.set_xlabel(u'乘客等级')
ax.set_ylabel(u'密度')
# Legend entries follow the plotting order above (survived first).
ax.legend((u'获救', u'未获救'), loc='best')
plt.show()
In [71]:
# Survival broken down by sex.
# The axes is created explicitly and handed to pandas; otherwise `df.plot`
# opens a figure of its own and the figure created here stays empty.
fig, ax = plt.subplots()
fig.set(alpha=0.2)
Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
# Transpose so that the x axis really is the sex (matching the axis label)
# and each bar is stacked by outcome; the 0/1 outcome columns are renamed
# explicitly so the legend does not depend on column order.
df = (pd.DataFrame({u'男性': Survived_m, u'女性': Survived_f})
      .transpose()
      .rename(columns={0: u'未获救', 1: u'获救'}))
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title(u'按性别看获救情况')
ax.set_xlabel(u'性别')
ax.set_ylabel(u'人数')
plt.show()
In [72]:
# Survival by sex within high (1st/2nd) and low (3rd) passenger classes,
# drawn as four side-by-side bar charts sharing one y axis.
fig = plt.figure(figsize=(16, 8))
fig.set(alpha=0.65)
# Figure-level title: calling plt.title() before any subplot exists would
# create an extra full-size axes underneath the four panels.
fig.suptitle(u'各乘客等级下的性别获救情况')

is_female = data_train.Sex == 'female'
is_male = data_train.Sex == 'male'
is_low_class = data_train.Pclass == 3

# (row mask, series label, bar colour, legend text) for each panel, left to right
panels = [
    (is_female & ~is_low_class, 'female, high class', '#FA2479', u'女性/高级舱'),
    (is_female & is_low_class, 'female, low class', 'pink', u'女性/低级舱'),
    (is_male & ~is_low_class, 'male, high class', 'lightblue', u'男性/高级舱'),
    (is_male & is_low_class, 'male, low class', 'steelblue', u'男性/低级舱'),
]

first_ax = None
for position, (mask, label, color, legend_text) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 4, position, sharey=first_ax)
    if first_ax is None:
        first_ax = ax
    # value_counts() orders by frequency, so the bar order would differ from
    # panel to panel (and be arbitrary on ties). Pin it to [0, 1] so the
    # fixed tick labels below are always correct.
    counts = data_train.Survived[mask].value_counts().reindex([0, 1], fill_value=0)
    counts.plot(kind='bar', label=label, color=color, ax=ax)
    ax.set_xticklabels([u'未获救', u'获救'], rotation=0)
    ax.legend([legend_text], loc='best')

plt.show()
In [73]:
# Survival broken down by port of embarkation.
# The axes is created explicitly and handed to pandas; otherwise `df.plot`
# opens a figure of its own and the sized figure created here stays empty.
fig, ax = plt.subplots(figsize=(16, 8))
fig.set(alpha=0.2)
Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Embarked[data_train.Survived == 1].value_counts()
df = pd.DataFrame({u'获救': Survived_1, u'未获救': Survived_0})
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title(u'各登船港口的获救情况')
# Axis label corrected to "port of embarkation" so it agrees with the title
# (it previously read "landing port").
ax.set_xlabel(u'登船港口')
ax.set_ylabel(u'人数')
plt.show()
In [74]:
# Passenger count for every (SibSp, Survived) combination.
sibsp = data_train.groupby(['SibSp', 'Survived'])
df = sibsp['PassengerId'].count().to_frame()
df
Out[74]:
In [75]:
# Passenger count for every (Parch, Survived) combination.
parch = data_train.groupby(['Parch', 'Survived'])
df = parch['PassengerId'].count().to_frame()
df
Out[75]:
In [76]:
# Cabin is known for only 204 passengers; look at how its values are distributed first.
data_train['Cabin'].value_counts()
Out[76]:
In [77]:
# Survival broken down by whether a Cabin value is recorded.
# The axes is created explicitly and handed to pandas; otherwise `df.plot`
# opens a figure of its own and the figure created here stays empty.
fig, ax = plt.subplots()
fig.set(alpha=0.2)
Survived_cabin = data_train.Survived[pd.notnull(data_train.Cabin)].value_counts()
Survived_nocabin = data_train.Survived[pd.isnull(data_train.Cabin)].value_counts()
# Transposed so the x axis is "cabin recorded: yes / no" and each bar is
# stacked by outcome. The 0/1 outcome columns are renamed explicitly so the
# legend text cannot get out of step with the column order.
df = (pd.DataFrame({u'有': Survived_cabin, u'无': Survived_nocabin})
      .transpose()
      .rename(columns={0: u'未获救', 1: u'获救'}))
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title(u'按Cabin有无看获救情况')
ax.set_xlabel(u'Cabin有无')
ax.set_ylabel(u'人数')
plt.show()
In [78]:
##############################################################################
# 简单数据预处理
##############################################################################
In [79]:
from sklearn.ensemble import RandomForestRegressor
def set_missing_ages(df):
    """Fill missing ``Age`` values with predictions from a RandomForestRegressor.

    The regressor is trained on the rows whose Age is known, using the
    numeric columns Fare, Parch, SibSp and Pclass as features, and is then
    used to predict Age for the rows where it is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Age, Fare, Parch, SibSp and Pclass.
        It is modified in place: missing Age entries are overwritten.

    Returns
    -------
    tuple
        ``(df, rfr)`` -- the same DataFrame and the fitted regressor, so the
        regressor can be reused to fill ages in other data.
    """
    feature_cols = ['Fare', 'Parch', 'SibSp', 'Pclass']
    age_missing = df['Age'].isnull()

    # Training data: rows with a known age. `.values` replaces the
    # `.as_matrix()` accessor, which no longer exists in current pandas.
    X = df.loc[~age_missing, feature_cols].values
    y = df.loc[~age_missing, 'Age'].values

    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Only predict when something is actually missing: calling predict on an
    # empty feature matrix raises, e.g. when the frame was already filled.
    if age_missing.any():
        predictedAges = rfr.predict(df.loc[age_missing, feature_cols].values)
        df.loc[age_missing, 'Age'] = predictedAges

    return df, rfr
def set_Cabin_type(df):
    """Collapse the ``Cabin`` column into a 'Yes' / 'No' flag.

    Rows with a recorded cabin become 'Yes'; rows where it is missing become
    'No'. The column is replaced in a single assignment rather than by
    writing strings into the existing column through ``.loc``, so this also
    works when the column holds only missing values (a float column), where
    element-wise string assignment trips pandas' incompatible-dtype check.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Cabin`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object that was passed in.
    """
    has_cabin = df['Cabin'].notnull()
    df['Cabin'] = has_cabin.map({True: 'Yes', False: 'No'})
    return df
# Fill the missing ages first, keeping the fitted regressor `rfr` for reuse,
# then turn Cabin into a 'Yes'/'No' flag. Both helpers write into the frame
# they are given, so `data_train` itself is changed from here on.
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
data_train.head()
Out[79]:
In [80]:
# Re-check dtypes and non-null counts after the Age / Cabin preprocessing.
data_train.info()
In [81]:
# One-hot encode the categorical features.
dummies_Cabin = pd.get_dummies(data_train.Cabin, prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_train.Embarked, prefix='Embarked')
dummies_Sex = pd.get_dummies(data_train.Sex, prefix='Sex')
dummies_Pclass = pd.get_dummies(data_train.Pclass, prefix='Pclass')

# Append the indicator columns to the original frame, then drop the raw
# categorical / free-text columns they replace.
encoded_parts = [data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass]
raw_columns = ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
df = pd.concat(encoded_parts, axis=1).drop(raw_columns, axis=1)
df.head()
Out[81]:
In [82]:
import sklearn.preprocessing as preprocessing
# Standardize Age and Fare (zero mean, unit variance) into new *_scaled columns.
#
# Each column gets its own fitted scaler: `StandardScaler.fit` returns the
# scaler itself, so fitting one shared instance twice would leave
# `age_scale_param` and `fare_scale_param` pointing at the same object,
# holding only the statistics of the column fitted last.
# The scalers are given 2-D input (`df[['Age']]`, not `df['Age']`), and the
# 2-D result is flattened before being stored as a column.
scaler = preprocessing.StandardScaler()  # unfitted instance; the name is kept because other cells refer to it

age_scale_param = preprocessing.StandardScaler().fit(df[['Age']])
df['Age_scaled'] = age_scale_param.transform(df[['Age']]).ravel()

fare_scale_param = preprocessing.StandardScaler().fit(df[['Fare']])
df['Fare_scaled'] = fare_scale_param.transform(df[['Fare']]).ravel()

df.head()
Out[82]:
In [83]:
from sklearn import linear_model
# Select the target and the model features by column-name pattern.
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

# y: the Survived outcome, X: every other selected column.
# The target is picked by name rather than by position, so the split does not
# depend on column order. X is cast to float so a frame mixing int / float /
# bool indicator columns still yields a numeric matrix. `.values` replaces
# the `.as_matrix()` accessor, which no longer exists in current pandas.
y = train_df['Survived'].values
X = train_df.drop('Survived', axis=1).values.astype(float)

# Fit an L1-regularized logistic regression. The liblinear solver is named
# explicitly because the L1 penalty is not supported by every solver;
# random_state makes the fit repeatable.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear',
                                      tol=1e-6, random_state=0)
clf.fit(X, y)
clf
Out[83]:
In [84]:
# Load the test set and apply the same feature transformations as for training.
data_test = pd.read_csv('input/test.csv')
data_test.loc[data_test.Fare.isnull(), 'Fare'] = 0

# Fill missing ages with the regressor that was fitted on the training data.
# `.values` replaces the `.as_matrix()` accessor, which no longer exists in
# current pandas; the guard avoids calling predict on an empty matrix.
age_feature_cols = ['Fare', 'Parch', 'SibSp', 'Pclass']
age_missing = data_test['Age'].isnull()
if age_missing.any():
    predictedAges = rfr.predict(data_test.loc[age_missing, age_feature_cols].values)
    data_test.loc[age_missing, 'Age'] = predictedAges

data_test = set_Cabin_type(data_test)

# One-hot encode the categorical features.
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')

df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_test = df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)

# Standardize Age and Fare with statistics learned from the *training* frame
# `df`. Calling fit_transform on the test columns would re-fit the scaler on
# test data and put the test features on a different scale from the one the
# model was trained on. Dedicated scalers are fitted here so the result does
# not depend on the state of any shared scaler object.
age_scaler = preprocessing.StandardScaler().fit(df[['Age']])
fare_scaler = preprocessing.StandardScaler().fit(df[['Fare']])
df_test['Age_scaled'] = age_scaler.transform(df_test[['Age']]).ravel()
df_test['Fare_scaled'] = fare_scaler.transform(df_test[['Fare']]).ravel()
df_test.head()
Out[84]:
In [93]:
# Select the model features from the test frame and put them in exactly the
# column order used for training. A category absent from the test set gets an
# all-zero indicator column instead of silently shifting the feature matrix.
feature_cols = train_df.columns.drop('Survived')
test = (df_test
        .filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
        .reindex(columns=feature_cols, fill_value=0))

# Predict on a plain float matrix, the same kind of input the model was fitted on.
predictions = clf.predict(test.values.astype(float))

# `.values` replaces the `.as_matrix()` accessor, which no longer exists in
# current pandas.
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv('output/titanic_lr_predictions.csv', index=False)
result.head()
Out[93]:
In [ ]: