In [1]:
# load libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from pandas.compat import StringIO
%matplotlib inline
In [2]:
# write function to load data from URL
def loadTitanicData(url="http://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"):
    """Download the Titanic passenger CSV and return it as a DataFrame.

    Parameters
    ----------
    url : str, optional
        Location of the CSV file (defaults to the Stanford CS109 copy),
        so the function can load any compatible CSV.

    Returns
    -------
    pandas.DataFrame
    """
    # pd.read_csv accepts URLs directly, so the manual urlopen/decode/StringIO
    # round-trip is unnecessary (and the old `pandas.compat.StringIO` import it
    # relied on was removed in pandas 0.25).
    return pd.read_csv(url)
In [3]:
# load data in
df = loadTitanicData()
# peek at the first five rows (rich display as the cell's last expression)
df.head()
Out[3]:
In [4]:
# check structure
print(df.shape)   # (rows, columns)
print(df.dtypes)  # per-column data types
In [5]:
# get the summary statistics (only for numeric)
# describe() skips non-numeric columns by default
df.describe()
Out[5]:
In [6]:
# how many missing values does each column contain?
# (isna is the canonical alias of isnull)
df.isna().sum()
Out[6]:
In [7]:
# look at correlations between the numeric columns
tmp = df.select_dtypes(include=["number"]).copy()
tmp_cor = tmp.corr()
plt.figure(figsize=(8, 8))
# annotate the cells and add a title so the figure is readable on its own
sns.heatmap(tmp_cor, cmap="plasma", annot=True, fmt=".2f")
plt.title("Correlation matrix (numeric columns)");
Out[7]:
In [8]:
# pairwise plot
# NOTE: pairplot is a figure-level function that creates its own figure,
# so a preceding plt.figure(figsize=...) call only leaves behind an extra
# blank figure; size the grid with the `height` (per-facet) parameter instead
sns.pairplot(tmp, hue="Survived", diag_kind="kde", height=2.5)
Out[8]:
In [9]:
# investigate relationship between sex, survival rates and age
plt.figure(figsize=(8, 8))
ax = sns.swarmplot(data=df, x="Survived", y="Age", hue="Sex")
ax.set_title("Sex, Age and Survival")
Out[9]:
In [10]:
# grouping and aggregating: mean Age and Fare per sex
# .mean() instead of .agg(['mean']) keeps the columns flat --
# agg with a list produces a needless MultiIndex column level
df.groupby("Sex", as_index=False)[['Age', 'Fare']].mean()
Out[10]:
In [32]:
# filtering -
# I want to find all the female 3rd class passengers who were above 30
# how many survived?
# NOTE: at this point in the notebook Sex is still the raw string column
# ("male"/"female") -- it is only recoded to 0/1 a few cells below, so
# comparing against 0 here would match no rows on a fresh top-to-bottom run
# (this cell's out-of-order execution count In[32] hid the bug).
df_filter = df[(df.Pclass == 3) & (df.Sex == "female") & (df.Age > 30)]
df_filter.Survived.value_counts(normalize=True)
Out[32]:
In [13]:
# count table of survival by gender
cnt_table = pd.crosstab(df.Survived, df.Sex)
# relabel by VALUE rather than by position, so the labels cannot end up
# on the wrong row if the index order ever changes
cnt_table = cnt_table.rename(index={0: 'died', 1: 'survived'})
cnt_table
Out[13]:
In [14]:
# frequency table of above (column-wise proportions)
freq_table = pd.crosstab(df.Survived, df.Sex, margins=True)
# margins=True appends an 'All' row/column; relabel by value, not position
freq_table = freq_table.rename(index={0: 'died', 1: 'survived', 'All': 'total'})
# divide every row by the 'total' row -> per-column survival proportions
freq_table = freq_table / freq_table.loc['total']
freq_table
Out[14]:
In [15]:
# convert sex to 1 (male) and 0 (female)
# map the single column directly instead of DataFrame.replace(..., inplace=True):
# inplace replace is a hidden-state anti-pattern and triggers downcasting
# deprecation warnings on recent pandas versions
df["Sex"] = df["Sex"].map({"male": 1, "female": 0})
df.Sex.value_counts()
Out[15]:
In [16]:
# look at groupby stats for passenger class and gender
# named aggregation yields the final column labels directly,
# avoiding the separate rename step
stat = df.groupby(["Sex", "Pclass"])['Survived'].agg(Total='count', Survived='sum')
stat['Proportion'] = stat['Survived'] / stat['Total']
print(stat)
stat.plot(y='Proportion', kind='line', color='orange')
Out[16]:
The story is the same for both genders: the lower the passenger class (1st → 3rd), the lower the survival rate.
In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from sklearn.linear_model import LogisticRegression
In [21]:
# prepare data for modeling: correlation of the numeric columns only
# (numeric_only=True is required on pandas >= 2.0, where DataFrame.corr
# no longer silently drops non-numeric columns such as Name and would
# raise a TypeError instead)
tmp_cor = df.corr(numeric_only=True)
tmp_cor
Out[21]:
It appears that the siblings/spouses aboard and parents/children aboard columns are only weakly correlated with survival, so they aren't that useful for the model.
In [22]:
# drop the two cols mentioned above and name
# drop by column NAME rather than position (df.columns[[-2, -3, 2]]):
# positional indices silently pick the wrong columns if the column order
# ever changes (names per the CS109 titanic.csv header)
df_model = df.drop(columns=["Name", "Siblings/Spouses Aboard", "Parents/Children Aboard"])
df_model.head()
Out[22]:
In [23]:
# split into data (X) and target (y)
X = df_model.drop(columns=['Survived'])
y = df['Survived']
print("X: ", X.shape)
print("y: ", y.shape)
In [24]:
# split into training and test
# fix the random seed so the split -- and every downstream accuracy
# number -- is reproducible across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [25]:
# define custom function to output test and training accuracy
def ACCscores(model, X_tr=None, y_tr=None, X_te=None, y_te=None):
    """Print and return train/test accuracy for a fitted classifier.

    Parameters
    ----------
    model : fitted estimator exposing .predict
    X_tr, y_tr, X_te, y_te : optional
        Train/test features and targets. They default to the
        notebook-level X_train/y_train/X_test/y_test, so existing
        ``ACCscores(model)`` calls keep working, while the explicit
        parameters remove the hidden dependency on global state.

    Returns
    -------
    tuple of (train_accuracy, test_accuracy)
    """
    if X_tr is None:
        X_tr, y_tr = X_train, y_train
    if X_te is None:
        X_te, y_te = X_test, y_test
    train_acc = acc(y_tr, model.predict(X_tr))
    print("Train Accuracy: ", train_acc)
    test_acc = acc(y_te, model.predict(X_te))
    print("Test Accuracy: ", test_acc)
    return (train_acc, test_acc)
In [26]:
# Logistic Regression
# raise max_iter from the default 100: the lbfgs solver frequently hits
# the iteration cap on unscaled features such as Fare and warns about
# non-convergence; a higher cap only changes the result by letting the
# fit actually converge
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
ACCscores(log_reg)
Out[26]:
In [27]:
# get_params is a method -- without the parentheses the cell only
# displays the bound method object instead of the hyper-parameter dict
log_reg.get_params()
Out[27]:
In [28]:
# fitted coefficients, one per feature column of X
log_reg.coef_
Out[28]:
In [29]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
In [30]:
# R-style model formula; Sex was recoded to 0/1 above, so it enters the
# model as a numeric regressor
formula = "Survived~ Pclass + Sex + Age + Fare"
# keep only the columns the formula uses
data = df[['Survived','Pclass','Sex','Age','Fare']]
In [31]:
# logistic regression via statsmodels GLM for inferential output
# (coefficients with standard errors and p-values)
glm_spec = smf.glm(formula=formula, data=data, family=sm.families.Binomial())
model = glm_spec.fit()
model.summary()
Out[31]: