In [116]:
# This line configures matplotlib to show figures embedded in the notebook, 
# instead of opening a new window for each figure. 
%matplotlib inline

In [117]:
# Use the pandas library to read in the csv file 
import pandas as pd

# This will create a pandas dataframe and assign it to the titanic variable
titanic = pd.read_csv("titanic_data.csv")

In [118]:
# Print the first 5 rows of the dataframe
titanic.head(5)


Out[118]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 NaN S

In [119]:
titanic.shape


Out[119]:
(891, 12)

In [120]:
# Show data types of the class label and features
titanic.dtypes


Out[120]:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [121]:
titanic.describe()


Out[121]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

In [122]:
# Import pyplot under its conventional alias. Aliasing the top-level matplotlib
# package as `plt` forces every call through `plt.pyplot.*` and only works if
# the pyplot submodule has already been loaded by something else.
import matplotlib.pyplot as plt

# Age has missing values, so drop them explicitly before binning; hist() then
# spans the observed minimum..maximum by default.
ages = titanic['Age'].dropna()

fig, axis = plt.subplots()
axis.hist(ages, bins=10)
axis.set_title('Age Distribution')
axis.set_xlabel('Age')
axis.set_ylabel('# Passengers')
plt.show()



In [123]:
# Get the counts of males and females in the data
titanic.groupby('Sex').count()


Out[123]:
PassengerId Survived Pclass Name Age SibSp Parch Ticket Fare Cabin Embarked
Sex
female 314 314 314 314 261 314 314 314 314 97 312
male 577 577 577 577 453 577 577 577 577 107 577

In [124]:
# Get summary stats grouped by Sex field
titanic.groupby('Sex').describe()


Out[124]:
Age Fare Parch PassengerId Pclass SibSp Survived
Sex
female count 261.000000 314.000000 314.000000 314.000000 314.000000 314.000000 314.000000
mean 27.915709 44.479818 0.649682 431.028662 2.159236 0.694268 0.742038
std 14.110146 57.997698 1.022846 256.846324 0.857290 1.156520 0.438211
min 0.750000 6.750000 0.000000 2.000000 1.000000 0.000000 0.000000
25% 18.000000 12.071875 0.000000 231.750000 1.000000 0.000000 0.000000
50% 27.000000 23.000000 0.000000 414.500000 2.000000 0.000000 1.000000
75% 37.000000 55.000000 1.000000 641.250000 3.000000 1.000000 1.000000
max 63.000000 512.329200 6.000000 889.000000 3.000000 8.000000 1.000000
male count 453.000000 577.000000 577.000000 577.000000 577.000000 577.000000 577.000000
mean 30.726645 25.523893 0.235702 454.147314 2.389948 0.429809 0.188908
std 14.678201 43.138263 0.612294 257.486139 0.813580 1.061811 0.391775
min 0.420000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000
25% 21.000000 7.895800 0.000000 222.000000 2.000000 0.000000 0.000000
50% 29.000000 10.500000 0.000000 464.000000 3.000000 0.000000 0.000000
75% 39.000000 26.550000 0.000000 680.000000 3.000000 0.000000 0.000000
max 80.000000 512.329200 5.000000 891.000000 3.000000 8.000000 1.000000

In [125]:
# Retrieve unique values for the Sex element
titanic['Sex'].unique()


Out[125]:
array(['male', 'female'], dtype=object)

In [126]:
# Cast the 0/1 Survived column to bool, then count passengers per (Sex, Survived) pair
survived_flag = titanic['Survived'].astype(bool)
crosstab = pd.crosstab(titanic['Sex'], survived_flag)

# Stacked bars: one bar per sex, split by the survival flag
crosstab.plot(kind='bar', stacked=True, color=['red','green'], grid=False)


Out[126]:
<matplotlib.axes._subplots.AxesSubplot at 0x117522fd0>

In [127]:
# Start here with post 2 code

In [128]:
import matplotlib.pyplot as plt

# Set the display to a more appealing style. The pandas option
# `display.mpl_style` is deprecated (and removed in later pandas releases);
# matplotlib style sheets are the supported replacement.
plt.style.use('ggplot')

# plot the numeric variables using box plot chart
titanic.boxplot(column=['Age', 'SibSp', 'Parch', 'Fare'])


Out[128]:
{'boxes': [<matplotlib.lines.Line2D at 0x118025090>,
  <matplotlib.lines.Line2D at 0x11803ab10>,
  <matplotlib.lines.Line2D at 0x118145750>,
  <matplotlib.lines.Line2D at 0x118167710>],
 'caps': [<matplotlib.lines.Line2D at 0x1180311d0>,
  <matplotlib.lines.Line2D at 0x118031810>,
  <matplotlib.lines.Line2D at 0x11812edd0>,
  <matplotlib.lines.Line2D at 0x118138450>,
  <matplotlib.lines.Line2D at 0x118150a10>,
  <matplotlib.lines.Line2D at 0x11815b090>,
  <matplotlib.lines.Line2D at 0x118173650>,
  <matplotlib.lines.Line2D at 0x118173c90>],
 'fliers': [<matplotlib.lines.Line2D at 0x11803a4d0>,
  <matplotlib.lines.Line2D at 0x118145110>,
  <matplotlib.lines.Line2D at 0x11815bd10>,
  <matplotlib.lines.Line2D at 0x11817e950>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x118031e50>,
  <matplotlib.lines.Line2D at 0x118138a90>,
  <matplotlib.lines.Line2D at 0x11815b6d0>,
  <matplotlib.lines.Line2D at 0x11817e310>],
 'whiskers': [<matplotlib.lines.Line2D at 0x118025490>,
  <matplotlib.lines.Line2D at 0x118025b50>,
  <matplotlib.lines.Line2D at 0x11812e150>,
  <matplotlib.lines.Line2D at 0x11812e790>,
  <matplotlib.lines.Line2D at 0x118145d50>,
  <matplotlib.lines.Line2D at 0x1181503d0>,
  <matplotlib.lines.Line2D at 0x118167990>,
  <matplotlib.lines.Line2D at 0x118167fd0>]}

In [129]:
titanic.hist(column=['Age', 'SibSp', 'Parch', 'Fare'], figsize=[10,10])


Out[129]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x118193390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1182d5210>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11844de10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1184bd5d0>]], dtype=object)

In [130]:
titanic[titanic['Fare'] > 300][['PassengerId', 'Name', 'Sex', 'Fare']]


Out[130]:
PassengerId Name Sex Fare
258 259 Ward, Miss. Anna female 512.3292
679 680 Cardeza, Mr. Thomas Drake Martinez male 512.3292
737 738 Lesurer, Mr. Gustave J male 512.3292

In [131]:
titanic.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB

In [132]:
titanic['Age'][0:10]


Out[132]:
0    22
1    38
2    26
3    35
4    35
5   NaN
6    54
7     2
8    27
9    14
Name: Age, dtype: float64

In [133]:
titanic['Age'].mean()


Out[133]:
29.69911764705882

In [134]:
titanic['Age'].median()


Out[134]:
28.0

In [135]:
# Create a new column in the data frame and assign it values from existing column
titanic['AgeBackFill'] = titanic['Age']

In [136]:
# Check if the values were copied over
titanic[['Sex', 'Age', 'AgeBackFill']].head(10)


Out[136]:
Sex Age AgeBackFill
0 male 22 22
1 female 38 38
2 female 26 26
3 female 35 35
4 male 35 35
5 male NaN NaN
6 male 54 54
7 male 2 2
8 female 27 27
9 female 14 14

In [137]:
# Fill only the rows whose Age is missing, using the median of the known ages
age_is_missing = titanic['Age'].isnull()
titanic.loc[age_is_missing, 'AgeBackFill'] = titanic['Age'].median()

In [138]:
# Check if the values were backfilled 
titanic[['Sex', 'Age', 'AgeBackFill']].head(10)


Out[138]:
Sex Age AgeBackFill
0 male 22 22
1 female 38 38
2 female 26 26
3 female 35 35
4 male 35 35
5 male NaN 28
6 male 54 54
7 male 2 2
8 female 27 27
9 female 14 14

In [139]:
# Encode Sex as an integer feature: female -> 0, male -> 1
gender_codes = {'female': 0, 'male': 1}
titanic['Gender'] = titanic['Sex'].map(gender_codes).astype(int)

In [140]:
# Check the transformation 
titanic[['Sex', 'Gender']].head(10)


Out[140]:
Sex Gender
0 male 1
1 female 0
2 female 0
3 female 0
4 male 1
5 male 1
6 male 1
7 male 1
8 female 0
9 female 0

In [141]:
titanic[['PassengerId', 'Embarked']].groupby('Embarked').count()


Out[141]:
PassengerId
Embarked
C 168
Q 77
S 644

In [142]:
# Copy Embarked into a new column, replacing the missing entries with 'S' in the same step
titanic['EmbarkedBackFill'] = titanic['Embarked'].fillna('S')

# Show the rows that were originally missing to confirm they now carry 'S'
titanic.loc[titanic['Embarked'].isnull(), ['Embarked', 'EmbarkedBackFill']]


Out[142]:
Embarked EmbarkedBackFill
61 NaN S
829 NaN S

In [143]:
# Replace each port letter with its integer code, one port at a time
for port, code in (('S', 0), ('C', 1), ('Q', 2)):
    titanic.loc[titanic['EmbarkedBackFill'] == port, 'EmbarkedBackFill'] = code

# Every entry is an integer now, so convert the column to an integer dtype
titanic['EmbarkedBackFill'] = titanic['EmbarkedBackFill'].astype(int)

In [144]:
# Combine the SibSp and Parch counts into a single FamilySize feature
titanic['FamilySize'] = titanic['SibSp'].add(titanic['Parch'])

# Interaction feature: backfilled age multiplied by passenger class
titanic['Age*Class'] = titanic['AgeBackFill'].mul(titanic['Pclass'])

In [145]:
titanic.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 17 columns):
PassengerId         891 non-null int64
Survived            891 non-null int64
Pclass              891 non-null int64
Name                891 non-null object
Sex                 891 non-null object
Age                 714 non-null float64
SibSp               891 non-null int64
Parch               891 non-null int64
Ticket              891 non-null object
Fare                891 non-null float64
Cabin               204 non-null object
Embarked            889 non-null object
AgeBackFill         891 non-null float64
Gender              891 non-null int64
EmbarkedBackFill    891 non-null int64
FamilySize          891 non-null int64
Age*Class           891 non-null float64
dtypes: float64(4), int64(8), object(5)
memory usage: 125.3+ KB

In [146]:
# Columns that are free text, or that have been superseded by an encoded / backfilled copy
unused_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age']

# Build a new frame without them; `titanic` itself is left untouched
titanic_clean = titanic.drop(unused_columns, axis=1)

In [147]:
titanic_clean.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId         891 non-null int64
Survived            891 non-null int64
Pclass              891 non-null int64
SibSp               891 non-null int64
Parch               891 non-null int64
Fare                891 non-null float64
AgeBackFill         891 non-null float64
Gender              891 non-null int64
EmbarkedBackFill    891 non-null int64
FamilySize          891 non-null int64
Age*Class           891 non-null float64
dtypes: float64(3), int64(8)
memory usage: 83.5 KB

In [148]:
train_data = titanic_clean.values
train_data


Out[148]:
array([[   1.,    0.,    3., ...,    0.,    1.,   66.],
       [   2.,    1.,    1., ...,    1.,    1.,   38.],
       [   3.,    1.,    3., ...,    0.,    0.,   78.],
       ..., 
       [ 889.,    0.,    3., ...,    0.,    3.,   84.],
       [ 890.,    1.,    1., ...,    1.,    0.,   26.],
       [ 891.,    0.,    3., ...,    2.,    0.,   96.]])

In [149]:
titanic_clean.groupby('Survived').AgeBackFill.hist(alpha=0.4)


Out[149]:
Survived
0    Axes(0.125,0.125;0.775x0.775)
1    Axes(0.125,0.125;0.775x0.775)
Name: AgeBackFill, dtype: object

In [150]:
titanic_clean.groupby('Survived').Fare.hist(alpha=0.4)


Out[150]:
Survived
0    Axes(0.125,0.125;0.775x0.775)
1    Axes(0.125,0.125;0.775x0.775)
Name: Fare, dtype: object

In [151]:
titanic_clean.groupby('Survived').FamilySize.hist(alpha=0.4)


Out[151]:
Survived
0    Axes(0.125,0.125;0.775x0.775)
1    Axes(0.125,0.125;0.775x0.775)
Name: FamilySize, dtype: object

In [152]:
# scatter_matrix lives in pandas.plotting in current pandas releases; fall back
# to the legacy pandas.tools.plotting location when running on an old install.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of the label and the candidate features, with a
# kernel density estimate of each column on the diagonal
scatter_columns = ['Survived', 'Pclass','AgeBackFill', 'Gender', 'EmbarkedBackFill', 'FamilySize', 'Fare']
scatter_matrix(titanic_clean[scatter_columns], alpha=0.2, figsize=(15, 15), diagonal='kde')


Out[152]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x118f66250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11710dcd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1170e8910>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116625610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1160da750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x115756e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1156a7390>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11534f750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1150b5ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1150cae50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11199e9d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ecd99d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10e2021d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11505b110>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1161c8190>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11926be90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11648af10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116439c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1153c7ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116220810>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11545f790>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1191b65d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x106123250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ea293d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10eaac110>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ec0e850>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11717f690>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11930f3d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1193932d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119409fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119477fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1194fad10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119561c90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1195e2ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119584690>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1196d58d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11987f610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1198e4d50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119a68b90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119acd8d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119b507d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119bd4510>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x119c42510>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119cc7250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119e2b1d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119ea0fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119e453d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119f93dd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a917b10>]], dtype=object)

In [153]:
# Start here with post 3 code

In [154]:
# Import function to split the data. Current scikit-learn ships it in
# sklearn.model_selection; sklearn.cross_validation is the legacy location and
# is kept only as a fallback for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Select the columns to predict the target
predictors = ["Pclass","Fare", "AgeBackFill", "Gender", "EmbarkedBackFill", "FamilySize"]

# Specify target column
target = "Survived"

# Set features
X = titanic_clean[predictors]

# Set targets
y = titanic_clean[target]

# Split data into 80% training and 20% test datasets (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size =0.8, random_state=0)

In [155]:
import numpy as np
# Utility method to draw a confusion matrix
def plot_confusion_matrix(cm, target, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix to display; every cell's value is written on top of
        the heat map.
    target : object exposing ``.unique()`` (e.g. a pandas Series)
        Class labels; the distinct values are used as tick labels.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Notes
    -----
    Tick labels are sorted before use. ``sklearn.metrics.confusion_matrix``
    orders its rows and columns by sorted label, whereas ``unique()`` returns
    labels in order of first appearance, so unsorted labels could be attached
    to the wrong row/column.
    """
    # Sorted so that tick i names the class stored in row/column i of `cm`
    labels = sorted(target.unique())
    tick_marks = np.arange(len(labels))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    # Write each count in the centre of its cell (imshow puts column on x, row on y)
    for (row, col), count in np.ndenumerate(cm):
        plt.text(col, row, '%.0f' % count,
                 horizontalalignment='center',
                 verticalalignment='center',
                 color='red',
                 fontsize=20
                 )

    # Run the layout pass last so it accounts for the tick and axis labels set above
    plt.tight_layout()

In [156]:
# Import the linear regression class
from sklearn.linear_model import LinearRegression

# Create the estimator and train it on the training split in one step
# (fit() returns the fitted estimator)
algo = LinearRegression().fit(X_train, y_train)

# Score the held-out test rows
y_pred = algo.predict(X_test)

In [157]:
from sklearn.metrics import confusion_matrix

# The regression output is continuous, so cut it at 0.5 into 1 / 0 labels.
# Both masks are computed up front; the result is the same as thresholding
# in sequence because values set to 1 never satisfy `<= .5`.
predicted_one = y_pred > .5
predicted_zero = y_pred <= .5
y_pred[predicted_one] = 1
y_pred[predicted_zero] = 0

# Open a fresh figure, then tabulate and draw the results
plt.figure()
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, titanic_clean[target])



In [158]:
# Import the logistic regression class
from sklearn.linear_model import LogisticRegression

# Create the seeded estimator and train it on the training split in one step
# (fit() returns the fitted estimator)
algo = LogisticRegression(random_state=0).fit(X_train, y_train)

# Score the held-out test rows
y_pred = algo.predict(X_test)

In [159]:
from sklearn.metrics import confusion_matrix

# Open a fresh figure, then tabulate test labels against predictions and draw them
plt.figure()
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, titanic_clean[target])



In [160]:
# Import the Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Create the seeded estimator and train it on the training split in one step
# (fit() returns the fitted estimator)
algo = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Score the held-out test rows
y_pred = algo.predict(X_test)

In [161]:
# Open a fresh figure, then tabulate test labels against predictions and draw them
plt.figure()
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, titanic_clean[target])



In [162]:
# Import the Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# 100 trees, seeded; create and train on the training split in one step
# (fit() returns the fitted estimator)
algo = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Score the held-out test rows
y_pred = algo.predict(X_test)

In [163]:
# Open a fresh figure, then tabulate test labels against predictions and draw them
plt.figure()
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, titanic_clean[target])



In [ ]:
# Start here with post 4 code

In [164]:
# Use the pandas library to read in the csv file
import pandas as pd

# This will create a pandas dataframe and assign it to the validation variable
validation = pd.read_csv("validation_ob.csv")

In [165]:
# transform
# Apply to the validation frame the same feature engineering used on the training frame.

# Fill missing ages with the median age of the training data. It is computed
# from `titanic` rather than hard-coded, so it cannot drift from the value
# the models were trained with.
validation['AgeBackFill'] = validation['Age'].fillna(titanic['Age'].median())

# Encode Sex as an integer: female -> 0, male -> 1
validation['Gender'] = validation['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

# Missing ports default to 'S'; ports are then encoded S=0, C=1, Q=2 as in training
validation['EmbarkedBackFill'] = validation['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# Derived features, built the same way as for the training frame
validation['FamilySize'] = validation['SibSp'] + validation['Parch']
validation['Age*Class'] = validation['AgeBackFill'] * validation['Pclass']

# Drop the columns that are free text or superseded by an encoded / backfilled copy
data = validation.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis=1)
# transform end

In [166]:
# Feature columns fed to the model, and the label column to compare against
predictors = ["Pclass","Fare", "AgeBackFill", "Gender", "EmbarkedBackFill", "FamilySize"]
target = "Survived"

# Split the transformed validation frame into features (X) and labels (y)
X, y = data[predictors], data[target]

In [167]:
# Predict on the validation features. Every model cell above rebinds the same
# `algo` name, so this uses whichever model was fitted last (the
# RandomForestClassifier, assuming the cells were run in order).
y_pred = algo.predict(X)

# Tabulate validation labels against predictions and draw the matrix on a new figure
cm = confusion_matrix(y, y_pred)
plt.figure()
plot_confusion_matrix(cm, data[target])