In [1]:
import pandas as pd
%matplotlib inline
import numpy as np

In [2]:
titanic = pd.read_csv("data/titanic.csv")

In [3]:
titanic.columns


Out[3]:
Index(['row.names', 'pclass', 'survived', 'name', 'age', 'embarked',
       'home.dest', 'room', 'ticket', 'boat', 'sex'],
      dtype='object')

Let's do a simple logistic regression to predict survival based on pclass and sex

First we need to prepare our features. Remember we drop one value in each dummy to avoid the dummy variable trap


In [4]:
titanic['sex_female'] = titanic['sex'].apply(lambda x:1 if x=='female' else 0)

In [5]:
dataset = titanic[['survived']].join([pd.get_dummies(titanic['pclass'],prefix="pclass"),titanic.sex_female])

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
lm = LogisticRegression()

In [8]:
#drop pclass_1st to avoid dummy variable trap
x = np.asarray(dataset[['pclass_2nd','pclass_3rd','sex_female']])
y = np.asarray(dataset['survived'])

In [9]:
lm = lm.fit(x,y)

In [10]:
lm.score(x,y)


Out[10]:
0.81492764661081496

In [ ]:
y.mean()

In [11]:
lm.coef_


Out[11]:
array([[-0.72686459, -1.96690903,  2.35874735]])

In [12]:
lm.intercept_


Out[12]:
array([-0.50818803])

In [14]:
lm.predict([0,0,1])


/home/ec2-user/anaconda3/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[14]:
array([1])

In [16]:
lm.predict([0,0,0])


/home/ec2-user/anaconda3/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[16]:
array([0])

In [17]:
lm.predict([0,1,0])


/home/ec2-user/anaconda3/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[17]:
array([0])

In [19]:
lm.predict_log_proba([0,0,1])


/home/ec2-user/anaconda3/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[19]:
array([[-1.99651875, -0.14595943]])

In [ ]: