In [1]:
from IPython.display import FileLink
# Render a clickable link to this notebook file in the cell output.
FileLink(path='Titanic baby step for pandas Part 1.ipynb')
Out[1]:
In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
from IPython.display import display_html
In [3]:
# Read the training CSV into the notebook-wide frame `df`, then preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df.head(n=5)
Out[3]:
In [4]:
# Summary statistics of the freshly loaded frame, shown as the cell output.
df.describe()
Out[4]:
In [5]:
# First-pass imputation: copy Age into AgeFixed, replacing missing values
# with the overall mean of the Age column. A later cell reassigns AgeFixed.
mean_age = df['Age'].mean()
df['AgeFixed'] = df['Age'].fillna(mean_age)
In [6]:
# The salutation is the text between the first ", " in Name and the next ".".
df['Salutation'] = df['Name'].map(
    lambda full_name: full_name.split(', ')[1].split('.', 1)[0]
)
df.head()
Out[6]:
Here is what the extracted salutations look like:
In [7]:
# Number of passengers (non-null PassengerId values) per salutation.
df['PassengerId'].groupby(df['Salutation']).count()
Out[7]:
Keep only the four most common salutations and group the rare ones (Capt, Don, ...) into "Others".
In [8]:
# Count passengers per salutation, most frequent first.
# `Series.sort` is gone from current pandas; `sort_values` returns the
# sorted Series instead of sorting in place.
s = df.groupby('Salutation')['PassengerId'].count().sort_values(ascending=False)

# The four most frequent salutations are kept as-is.
valid_vals = s.iloc[:4].index

# Every other salutation collapses into the single bucket 'Others'.
# `where` keeps values where the mask is True and substitutes elsewhere,
# avoiding the fragile `a and b or c` pseudo-ternary.
df['SalutationNew'] = df['Salutation'].where(df['Salutation'].isin(valid_vals), 'Others')

# Age distribution for each collapsed salutation.
df.boxplot('Age', by='SalutationNew', figsize=(20, 4))
Out[8]:
SalutationNew 和 Age 关系比较显著
In [9]:
# Age distribution for every (Sex, Pclass) combination.
df.boxplot(column='Age', by=['Sex', 'Pclass'], figsize=(20, 4))
Out[9]:
Sex/Pclass 和 Age 的关系也比较显著，所以可以用 Sex、Pclass、SalutationNew 这三个变量来填补 Age 的缺失值。
In [10]:
# Pivot to the median Age of every SalutationNew within each (Sex, Pclass)
# combination; shown for inspection. These medians are the values used to
# fill the missing ages below.
t = df.pivot_table(values='Age', index='SalutationNew', columns=['Sex', 'Pclass'], aggfunc='median')
display_html(t)

# Per-row median Age of the row's own (SalutationNew, Sex, Pclass) group.
# This yields the same numbers as the pivot table, aligned to df's index,
# so no row-wise lookup into `t` is needed. A group with no known Age
# yields NaN, leaving that row's AgeFixed missing.
group_median_age = df.groupby(['SalutationNew', 'Sex', 'Pclass'])['Age'].transform('median')

# Assign the filled column back explicitly. Calling
# `df['AgeFixed'].fillna(..., inplace=True)` operates on a temporary
# selection and is not guaranteed to write through to `df`.
df['AgeFixed'] = df['Age'].fillna(group_median_age)
df.describe()
Out[10]:
找出异常的 Fare 数据并用正常临界值代替掉
In [11]:
# Side-by-side histograms of Fare before and after outlier replacement.
# `plt` is bound to the top-level `matplotlib` package by this notebook's
# import cell, hence `plt.pyplot`.
fig = plt.pyplot.figure(figsize=(30, 4))
ax = fig.add_subplot(121)
ax.set_title('before fix')
s = df['Fare'].copy()  # work on a copy so the raw Fare column is untouched
s.hist(ax=ax)

# A fare is an outlier when it lies at least `n` standard deviations away
# from the mean. Both masks are computed before `s` is modified.
n = 4
is_high = s - s.mean() >= n * s.std()
is_low = s.mean() - s >= n * s.std()
outhigh_fare = s[is_high]
outlow_fare = s[is_low]
normal_fare = s[~(is_high | is_low)]

# Replace every high outlier with the largest normal fare and every low
# outlier with the smallest normal fare. The earlier version touched only
# the single largest outlier, and its low branch compared against the
# high-outlier values by mistake.
if is_high.any():
    s[is_high] = normal_fare.max()
if is_low.any():
    s[is_low] = normal_fare.min()

ax = fig.add_subplot(122)
ax.set_title('after fix')
s.hist(ax=ax)

# Store the cleaned fares and compare both columns per passenger class.
df['FareFixed'] = s
df.boxplot(['Fare', 'FareFixed'], by='Pclass', figsize=(30, 4))
Out[11]:
In [11]: