Python Vs R: Data Science Connect

By: Tobias LEONG (GovTech)

tobias@data.gov.sg



In [1]:

    
# load libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import urlopen
from pandas.compat import StringIO

%matplotlib inline



In [2]:

    
# write function to load data from URL
def loadTitanicData():
    url = "http://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
    dat = urlopen(url).read().decode('UTF-8')
    df = pd.read_csv(StringIO(dat))
    return df



In [3]:

    
# load data in
df = loadTitanicData()
df.head()









    Out[3]:







  
    
      
      Survived
      Pclass
      Name
      Sex
      Age
      Siblings/Spouses Aboard
      Parents/Children Aboard
      Fare
    
  
  
    
      0
      0
      3
      Mr. Owen Harris Braund
      male
      22.0
      1
      0
      7.2500
    
    
      1
      1
      1
      Mrs. John Bradley (Florence Briggs Thayer) Cum...
      female
      38.0
      1
      0
      71.2833
    
    
      2
      1
      3
      Miss. Laina Heikkinen
      female
      26.0
      0
      0
      7.9250
    
    
      3
      1
      1
      Mrs. Jacques Heath (Lily May Peel) Futrelle
      female
      35.0
      1
      0
      53.1000
    
    
      4
      0
      3
      Mr. William Henry Allen
      male
      35.0
      0
      0
      8.0500

Exploratory Data Analysis

Let's explore the data!



In [4]:

    
# check structure
print(df.shape)
print(df.dtypes)









    



(887, 8)
Survived                     int64
Pclass                       int64
Name                        object
Sex                         object
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard      int64
Fare                       float64
dtype: object



In [5]:

    
# get the summary statistics (only for numeric)
df.describe()









    Out[5]:







  
    
      
      Survived
      Pclass
      Age
      Siblings/Spouses Aboard
      Parents/Children Aboard
      Fare
    
  
  
    
      count
      887.000000
      887.000000
      887.000000
      887.000000
      887.000000
      887.00000
    
    
      mean
      0.385569
      2.305524
      29.471443
      0.525366
      0.383315
      32.30542
    
    
      std
      0.487004
      0.836662
      14.121908
      1.104669
      0.807466
      49.78204
    
    
      min
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.00000
    
    
      25%
      0.000000
      2.000000
      20.250000
      0.000000
      0.000000
      7.92500
    
    
      50%
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.45420
    
    
      75%
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.13750
    
    
      max
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.32920



In [6]:

    
# let's look at missing
df.isnull().sum()









    Out[6]:





Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64



In [7]:

    
# look at correlations
tmp = df.select_dtypes(include=["number"]).copy()
tmp_cor = tmp.corr()
plt.figure(figsize=(8,8))
sns.heatmap(tmp_cor, cmap="plasma")









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x112eeb7f0>



In [8]:

    
# pairwise plot
plt.figure(figsize=(15,15))
sns.pairplot(tmp, hue="Survived", diag_kind="kde")









    



/Users/tobias/anaconda/lib/python3.6/site-packages/statsmodels/nonparametric/kde.py:494: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X,a,b,gridsize)/(delta*nobs)
/Users/tobias/anaconda/lib/python3.6/site-packages/statsmodels/nonparametric/kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2






    Out[8]:





<seaborn.axisgrid.PairGrid at 0x11c9b42b0>






    





<matplotlib.figure.Figure at 0x11c9b44a8>



In [9]:

    
# investigate relationship between sex, survival rates and age
plt.figure(figsize=(8,8))
g = sns.swarmplot(x="Survived", y="Age", 
                  hue="Sex", data=df)
g.set_title("Sex, Age and Survival")









    Out[9]:





<matplotlib.text.Text at 0x11ef64e48>



In [10]:

    
# grouping and aggregating
df.groupby("Sex", as_index=False)[['Age','Fare']].agg(['mean'])



In [32]:

    
# filtering - 
# I want to find all the female 3rd class passengers who were above 30
# how many survived?

df_filter = df[(df.Pclass == 3) & (df.Sex == 0) & (df.Age > 30)]
df_filter.Survived.value_counts(normalize=True)









    Out[32]:





0    0.625
1    0.375
Name: Survived, dtype: float64



In [13]:

    
# count table of survival by gender
cnt_table = pd.crosstab(df.Survived, df.Sex)
cnt_table.index = ['died','survived']
cnt_table









    Out[13]:







  
    
      Sex
      female
      male
    
  
  
    
      died
      81
      464
    
    
      survived
      233
      109



In [14]:

    
# frequency table of above
freq_table = pd.crosstab(df.Survived, df.Sex, margins=True)
freq_table.index = ['died','survived', 'total']
freq_table = freq_table/freq_table.loc['total']
freq_table



In [15]:

    
# convert sex to 1 (male) and 0 (female)
# using replace and dict method
cleanup_sex = {"Sex":{"male": 1, "female": 0},
                }
df.replace(cleanup_sex, inplace=True)
df.Sex.value_counts()









    Out[15]:





1    573
0    314
Name: Sex, dtype: int64



In [16]:

    
# look at groupby stats for passenger class and gender
stat = df.groupby(["Sex","Pclass"])['Survived'].agg(['count','sum'])
stat['prop'] = stat['sum']/stat['count']
stat.columns = ['Total','Survived','Proportion']
print(stat)
stat.plot(y='Proportion', kind='line', color='orange')









    



            Total  Survived  Proportion
Sex Pclass                             
0   1          94        91    0.968085
    2          76        70    0.921053
    3         144        72    0.500000
1   1         122        45    0.368852
    2         108        17    0.157407
    3         343        47    0.137026






    Out[16]:





<matplotlib.axes._subplots.AxesSubplot at 0x11f557b00>

The story is the same for both genders, the lower the passenger class, the lower the survival rate

Modeling

Let's train some models and see how they perform in predicting Survival.



In [20]:

    
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from sklearn.linear_model import LogisticRegression



In [21]:

    
# prepare data for modeling

tmp_cor = df.corr()
tmp_cor









    Out[21]:







  
    
      
      Survived
      Pclass
      Sex
      Age
      Siblings/Spouses Aboard
      Parents/Children Aboard
      Fare
    
  
  
    
      Survived
      1.000000
      -0.336528
      -0.542152
      -0.059665
      -0.037082
      0.080097
      0.256179
    
    
      Pclass
      -0.336528
      1.000000
      0.129507
      -0.391492
      0.085026
      0.020252
      -0.548919
    
    
      Sex
      -0.542152
      0.129507
      1.000000
      0.091875
      -0.113249
      -0.244337
      -0.181137
    
    
      Age
      -0.059665
      -0.391492
      0.091875
      1.000000
      -0.297669
      -0.193741
      0.112329
    
    
      Siblings/Spouses Aboard
      -0.037082
      0.085026
      -0.113249
      -0.297669
      1.000000
      0.414244
      0.158839
    
    
      Parents/Children Aboard
      0.080097
      0.020252
      -0.244337
      -0.193741
      0.414244
      1.000000
      0.215470
    
    
      Fare
      0.256179
      -0.548919
      -0.181137
      0.112329
      0.158839
      0.215470
      1.000000

Appears that siblings/spouses and parents/children aboard isn't that useful.



In [22]:

    
# drop the two cols mentioned above and name
df_model = df.drop(df.columns[[-2,-3,2]], axis=1)
df_model.head()



In [23]:

    
# split into data (X) and target (y)
X = df_model.drop(['Survived'], axis=1)
y = df['Survived']
print("X: ", X.shape)
print("y: ", y.shape)









    



X:  (887, 4)
y:  (887,)



In [24]:

    
# split into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [25]:

    
# define custom function to output test and training accuracy
def ACCscores(model):
    train_preds = model.predict(X_train)
    train_acc = acc(y_train, train_preds)
    print("Train Accuracy: ", train_acc)
    test_preds = model.predict(X_test)
    test_acc = acc(y_test, test_preds)
    print("Test Accuracy: ", test_acc)
    return (train_acc,test_acc)



In [26]:

    
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
ACCscores(log_reg)









    



Train Accuracy:  0.788434414669
Test Accuracy:  0.837078651685






    Out[26]:





(0.78843441466854725, 0.8370786516853933)



In [27]:

    
log_reg.get_params









    Out[27]:





<bound method BaseEstimator.get_params of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)>



In [28]:

    
log_reg.coef_









    Out[28]:





array([[-0.92869031, -2.19505977, -0.02192635,  0.00232015]])



In [29]:

    
import statsmodels.formula.api as smf
import statsmodels.api as sm









    



/Users/tobias/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools



In [30]:

    
formula = "Survived~ Pclass + Sex + Age + Fare"
data = df[['Survived','Pclass','Sex','Age','Fare']]



In [31]:

    
model = smf.glm(formula=formula, data=data, family=sm.families.Binomial()).fit()
model.summary()









    Out[31]:





Generalized Linear Model Regression Results

  Dep. Variable:       Survived        No. Observations:       887 


  Model:                  GLM          Df Residuals:           882 


  Model Family:        Binomial        Df Model:                 4 


  Link Function:         logit         Scale:                 1.0  


  Method:                IRLS          Log-Likelihood:      -400.79


  Date:            Fri, 27 Apr 2018    Deviance:             801.59


  Time:                16:39:49        Pearson chi2:          931. 


  No. Iterations:          5                                       




               coef      std err       z       P>|z|   [0.025     0.975]  


  Intercept      4.8397      0.525      9.215   0.000      3.810      5.869


  Pclass        -1.2200      0.142     -8.610   0.000     -1.498     -0.942


  Sex           -2.5865      0.188    -13.783   0.000     -2.954     -2.219


  Age           -0.0342      0.007     -4.771   0.000     -0.048     -0.020


  Fare           0.0003      0.002      0.157   0.875     -0.004      0.004

	Survived	Pclass	Name	Sex	Age	Siblings/Spouses Aboard	Fare
0	0	3	Mr. Owen Harris Braund	male	22.0	1	7.2500
1	1	1	Mrs. John Bradley (Florence Briggs Thayer) Cum...	female	38.0	1	71.2833
2	1	3	Miss. Laina Heikkinen	female	26.0	0	7.9250
3	1	1	Mrs. Jacques Heath (Lily May Peel) Futrelle	female	35.0	1	53.1000
4	0	3	Mr. William Henry Allen	male	35.0	0	8.0500

	Survived	Pclass	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
count	887.000000	887.000000	887.000000	887.000000	887.000000	887.00000
mean	0.385569	2.305524	29.471443	0.525366	0.383315	32.30542
std	0.487004	0.836662	14.121908	1.104669	0.807466	49.78204
min	0.000000	1.000000	0.420000	0.000000	0.000000	0.00000
25%	0.000000	2.000000	20.250000	0.000000	0.000000	7.92500
50%	0.000000	3.000000	28.000000	0.000000	0.000000	14.45420
75%	1.000000	3.000000	38.000000	1.000000	0.000000	31.13750
max	1.000000	3.000000	80.000000	8.000000	6.000000	512.32920

Sex	female	male	All
died	0.257962	0.809773	0.614431
survived	0.742038	0.190227	0.385569
total	1.000000	1.000000	1.000000

	Survived	Pclass	Sex	Age	Siblings/Spouses Aboard	Parents/Children Aboard	Fare
Survived	1.000000	-0.336528	-0.542152	-0.059665	-0.037082	0.080097	0.256179
Pclass	-0.336528	1.000000	0.129507	-0.391492	0.085026	0.020252	-0.548919
Sex	-0.542152	0.129507	1.000000	0.091875	-0.113249	-0.244337	-0.181137
Age	-0.059665	-0.391492	0.091875	1.000000	-0.297669	-0.193741	0.112329
Siblings/Spouses Aboard	-0.037082	0.085026	-0.113249	-0.297669	1.000000	0.414244	0.158839
Parents/Children Aboard	0.080097	0.020252	-0.244337	-0.193741	0.414244	1.000000	0.215470
Fare	0.256179	-0.548919	-0.181137	0.112329	0.158839	0.215470	1.000000

Dep. Variable:	Survived	No. Observations:	887
Model:	GLM	Df Residuals:	882
Model Family:	Binomial	Df Model:	4
Link Function:	logit	Scale:	1.0
Method:	IRLS	Log-Likelihood:	-400.79
Date:	Fri, 27 Apr 2018	Deviance:	801.59
Time:	16:39:49	Pearson chi2:	931.
No. Iterations:	5

	coef	std err	z	P>\|z\|	[0.025	0.975]
Intercept	4.8397	0.525	9.215	0.000	3.810	5.869
Pclass	-1.2200	0.142	-8.610	0.000	-1.498	-0.942
Sex	-2.5865	0.188	-13.783	0.000	-2.954	-2.219
Age	-0.0342	0.007	-4.771	0.000	-0.048	-0.020
Fare	0.0003	0.002	0.157	0.875	-0.004	0.004