Python Vs R: Data Science Connect

By: Tobias LEONG (GovTech)

tobias@data.gov.sg


In [1]:
# load libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from pandas.compat import StringIO

%matplotlib inline

In [2]:
# write function to load data from URL
def loadTitanicData():
    url = "http://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
    dat = urlopen(url).read().decode('UTF-8')
    df = pd.read_csv(StringIO(dat))
    return df

In [3]:
# load data in
df = loadTitanicData()
df.head()


Out[3]:
Survived Pclass Name Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare
0 0 3 Mr. Owen Harris Braund male 22.0 1 0 7.2500
1 1 1 Mrs. John Bradley (Florence Briggs Thayer) Cum... female 38.0 1 0 71.2833
2 1 3 Miss. Laina Heikkinen female 26.0 0 0 7.9250
3 1 1 Mrs. Jacques Heath (Lily May Peel) Futrelle female 35.0 1 0 53.1000
4 0 3 Mr. William Henry Allen male 35.0 0 0 8.0500

Exploratory Data Analysis

Let's explore the data!


In [4]:
# check structure
print(df.shape)
print(df.dtypes)


(887, 8)
Survived                     int64
Pclass                       int64
Name                        object
Sex                         object
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard      int64
Fare                       float64
dtype: object

In [5]:
# get the summary statistics (only for numeric)
df.describe()


Out[5]:
Survived Pclass Age Siblings/Spouses Aboard Parents/Children Aboard Fare
count 887.000000 887.000000 887.000000 887.000000 887.000000 887.00000
mean 0.385569 2.305524 29.471443 0.525366 0.383315 32.30542
std 0.487004 0.836662 14.121908 1.104669 0.807466 49.78204
min 0.000000 1.000000 0.420000 0.000000 0.000000 0.00000
25% 0.000000 2.000000 20.250000 0.000000 0.000000 7.92500
50% 0.000000 3.000000 28.000000 0.000000 0.000000 14.45420
75% 1.000000 3.000000 38.000000 1.000000 0.000000 31.13750
max 1.000000 3.000000 80.000000 8.000000 6.000000 512.32920

In [6]:
# let's look at missing
df.isnull().sum()


Out[6]:
Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

In [7]:
# look at correlations
tmp = df.select_dtypes(include=["number"]).copy()
tmp_cor = tmp.corr()
plt.figure(figsize=(8,8))
sns.heatmap(tmp_cor, cmap="plasma")


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x112eeb7f0>

In [8]:
# pairwise plot
plt.figure(figsize=(15,15))
sns.pairplot(tmp, hue="Survived", diag_kind="kde")


/Users/tobias/anaconda/lib/python3.6/site-packages/statsmodels/nonparametric/kde.py:494: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X,a,b,gridsize)/(delta*nobs)
/Users/tobias/anaconda/lib/python3.6/site-packages/statsmodels/nonparametric/kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2
Out[8]:
<seaborn.axisgrid.PairGrid at 0x11c9b42b0>
<matplotlib.figure.Figure at 0x11c9b44a8>

In [9]:
# investigate relationship between sex, survival rates and age
plt.figure(figsize=(8,8))
g = sns.swarmplot(x="Survived", y="Age", 
                  hue="Sex", data=df)
g.set_title("Sex, Age and Survival")


Out[9]:
<matplotlib.text.Text at 0x11ef64e48>

In [10]:
# grouping and aggregating
df.groupby("Sex", as_index=False)[['Age','Fare']].agg(['mean'])


Out[10]:
Age Fare
mean mean
Sex
female 27.719745 44.479818
male 30.431361 25.633935

In [32]:
# filtering - 
# I want to find all the female 3rd class passengers who were above 30
# how many survived?

df_filter = df[(df.Pclass == 3) & (df.Sex == 0) & (df.Age > 30)]
df_filter.Survived.value_counts(normalize=True)


Out[32]:
0    0.625
1    0.375
Name: Survived, dtype: float64

In [13]:
# count table of survival by gender
cnt_table = pd.crosstab(df.Survived, df.Sex)
cnt_table.index = ['died','survived']
cnt_table


Out[13]:
Sex female male
died 81 464
survived 233 109

In [14]:
# frequency table of above
freq_table = pd.crosstab(df.Survived, df.Sex, margins=True)
freq_table.index = ['died','survived', 'total']
freq_table = freq_table/freq_table.loc['total']
freq_table


Out[14]:
Sex female male All
died 0.257962 0.809773 0.614431
survived 0.742038 0.190227 0.385569
total 1.000000 1.000000 1.000000

In [15]:
# convert sex to 1 (male) and 0 (female)
# using replace and dict method
cleanup_sex = {"Sex":{"male": 1, "female": 0},
                }
df.replace(cleanup_sex, inplace=True)
df.Sex.value_counts()


Out[15]:
1    573
0    314
Name: Sex, dtype: int64

In [16]:
# look at groupby stats for passenger class and gender
stat = df.groupby(["Sex","Pclass"])['Survived'].agg(['count','sum'])
stat['prop'] = stat['sum']/stat['count']
stat.columns = ['Total','Survived','Proportion']
print(stat)
stat.plot(y='Proportion', kind='line', color='orange')


            Total  Survived  Proportion
Sex Pclass                             
0   1          94        91    0.968085
    2          76        70    0.921053
    3         144        72    0.500000
1   1         122        45    0.368852
    2         108        17    0.157407
    3         343        47    0.137026
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f557b00>

The story is the same for both genders, the lower the passenger class, the lower the survival rate

Modeling

Let's train some models and see how they perform in predicting Survival.


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from sklearn.linear_model import LogisticRegression

In [21]:
# prepare data for modeling

tmp_cor = df.corr()
tmp_cor


Out[21]:
Survived Pclass Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare
Survived 1.000000 -0.336528 -0.542152 -0.059665 -0.037082 0.080097 0.256179
Pclass -0.336528 1.000000 0.129507 -0.391492 0.085026 0.020252 -0.548919
Sex -0.542152 0.129507 1.000000 0.091875 -0.113249 -0.244337 -0.181137
Age -0.059665 -0.391492 0.091875 1.000000 -0.297669 -0.193741 0.112329
Siblings/Spouses Aboard -0.037082 0.085026 -0.113249 -0.297669 1.000000 0.414244 0.158839
Parents/Children Aboard 0.080097 0.020252 -0.244337 -0.193741 0.414244 1.000000 0.215470
Fare 0.256179 -0.548919 -0.181137 0.112329 0.158839 0.215470 1.000000

Appears that siblings/spouses and parents/children aboard isn't that useful.


In [22]:
# drop the two cols mentioned above and name
df_model = df.drop(df.columns[[-2,-3,2]], axis=1)
df_model.head()


Out[22]:
Survived Pclass Sex Age Fare
0 0 3 1 22.0 7.2500
1 1 1 0 38.0 71.2833
2 1 3 0 26.0 7.9250
3 1 1 0 35.0 53.1000
4 0 3 1 35.0 8.0500

In [23]:
# split into data (X) and target (y)
X = df_model.drop(['Survived'], axis=1)
y = df['Survived']
print("X: ", X.shape)
print("y: ", y.shape)


X:  (887, 4)
y:  (887,)

In [24]:
# split into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [25]:
# define custom function to output test and training accuracy
def ACCscores(model):
    train_preds = model.predict(X_train)
    train_acc = acc(y_train, train_preds)
    print("Train Accuracy: ", train_acc)
    test_preds = model.predict(X_test)
    test_acc = acc(y_test, test_preds)
    print("Test Accuracy: ", test_acc)
    return (train_acc,test_acc)

In [26]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
ACCscores(log_reg)


Train Accuracy:  0.788434414669
Test Accuracy:  0.837078651685
Out[26]:
(0.78843441466854725, 0.8370786516853933)

In [27]:
log_reg.get_params


Out[27]:
<bound method BaseEstimator.get_params of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)>

In [28]:
log_reg.coef_


Out[28]:
array([[-0.92869031, -2.19505977, -0.02192635,  0.00232015]])

In [29]:
import statsmodels.formula.api as smf
import statsmodels.api as sm


/Users/tobias/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [30]:
formula = "Survived~ Pclass + Sex + Age + Fare"
data = df[['Survived','Pclass','Sex','Age','Fare']]

In [31]:
model = smf.glm(formula=formula, data=data, family=sm.families.Binomial()).fit()
model.summary()


Out[31]:
Generalized Linear Model Regression Results
Dep. Variable: Survived No. Observations: 887
Model: GLM Df Residuals: 882
Model Family: Binomial Df Model: 4
Link Function: logit Scale: 1.0
Method: IRLS Log-Likelihood: -400.79
Date: Fri, 27 Apr 2018 Deviance: 801.59
Time: 16:39:49 Pearson chi2: 931.
No. Iterations: 5
coef std err z P>|z| [0.025 0.975]
Intercept 4.8397 0.525 9.215 0.000 3.810 5.869
Pclass -1.2200 0.142 -8.610 0.000 -1.498 -0.942
Sex -2.5865 0.188 -13.783 0.000 -2.954 -2.219
Age -0.0342 0.007 -4.771 0.000 -0.048 -0.020
Fare 0.0003 0.002 0.157 0.875 -0.004 0.004