In [39]:
import re

import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one; cells below with a bare expression followed by print() rely on it.
InteractiveShell.ast_node_interactivity = "all"
In [40]:
# Read the passenger table; PassengerId becomes the row index rather than
# a regular column.
df = pd.read_csv(
    filepath_or_buffer='../data/titanic.csv',
    index_col='PassengerId',
)
# Preview the first rows as a sanity check on the load.
df.head()
Out[40]:
In [41]:
# Number of passengers per value of the 'Sex' column.
sex_counts = df.loc[:, 'Sex'].value_counts()
# Printed as "<male count> <female count>".
print(
    '{} {}'.format(
        sex_counts.loc['male'],
        sex_counts.loc['female'],
    )
)
In [42]:
# Share of passengers whose 'Survived' value is 1, as a percentage of all
# non-null 'Survived' entries.
survived_df = df.loc[:, 'Survived']
# The value_counts() index holds the column's values, so .loc[1] selects
# the tally for Survived == 1 by label.
count_of_survived = survived_df.value_counts().loc[1]
survived_percentage = (
    100.0 * count_of_survived / survived_df.value_counts().sum()
)
print(
    "{:0.2f}".format(survived_percentage)
)
In [46]:
# Share of passengers travelling in first class (Pclass == 1), as a
# percentage of all non-null 'Pclass' entries.
pclass_df = df['Pclass']
pclass_counts = pclass_df.value_counts()
# The value_counts() index holds the class labels, so .loc[1] is a label
# lookup for first class, not a positional one.
count_of_first_class_passengers = pclass_counts.loc[1]
# Divide by the total of this column's own tallies so the cell does not
# depend on `survived_df` from another cell and stays correct even if
# 'Pclass' and 'Survived' had different numbers of missing values.
first_class_percentage = 100.0 * count_of_first_class_passengers / pclass_counts.sum()
print("{:0.2f}".format(first_class_percentage))
In [47]:
# Known ages only: rows with a missing 'Age' are dropped before the
# statistics are computed.
ages = df.loc[:, 'Age'].dropna()
# Printed as "<mean> <median>", each rounded to two decimals.
print(
    "{:0.2f} {:0.2f}".format(
        ages.mean(),
        ages.median(),
    )
)
In [50]:
# Pearson correlation (the pandas default, spelled out here) between the
# 'SibSp' and 'Parch' columns.
correlation = df.loc[:, 'SibSp'].corr(
    df.loc[:, 'Parch'],
    method='pearson',
)
print(
    "{:0.2f}".format(correlation)
)
In [83]:
def clean_name(name):
    """Extract the first given name from a raw passenger ``Name`` string.

    The steps, in order:

    1. Drop the surname, i.e. everything up to and including the first
       ``", "``.
    2. If the remainder contains text in parentheses, keep only the
       contents of the first pair.
    3. Strip one leading honorific: a word of two or more letters followed
       by a period and whitespace (``Miss.``, ``Mrs.``, ``Ms.``, ``Mme.``,
       ``Mlle.``, ...).
    4. Take the first space-separated word and remove double quotes.

    Args:
        name: Raw name, e.g.
            ``'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'``.

    Returns:
        The first remaining word, e.g. ``'Florence'``. When there is no
        parenthesised part, this is simply the first word after the
        honorific (``'Smith, Mrs. John Bradley'`` gives ``'John'``).
    """
    # Surname is the text before the first comma.
    match = re.search(r'^[^,]+, (.*)', name)
    if match:
        name = match.group(1)
    # Prefer the parenthesised part when there is one.
    match = re.search(r'\(([^)]+)\)', name)
    if match:
        name = match.group(1)
    # Remove a leading honorific. Requiring at least two letters keeps a
    # bare initial such as "E." from being treated as a title.
    name = re.sub(r'^[A-Za-z]{2,}\.\s+', '', name)
    # First word only, without the quotes used around nicknames.
    return name.split(' ')[0].replace('"', '')
# First names of the female passengers, extracted with clean_name.
names = df.loc[df['Sex'] == 'female', 'Name'].map(clean_name)
# Occurrences of each first name, most frequent first.
name_counts = names.value_counts()
# Shown as cell output: the five most frequent names.
name_counts.head()
# The single most frequent name.
print(name_counts.index[0])
Out[83]: