In [1]:
import numpy as np
import pandas as pd

#The Machine learning alogorithm
from sklearn.ensemble import RandomForestClassifier

# Test train split
from sklearn.cross_validation import train_test_split

# Just to switch off pandas warning
pd.options.mode.chained_assignment = None

# Used to write our model to a file
from sklearn.externals import joblib

In [2]:
data = pd.read_csv("titanic_train.csv")
data.head()


Out[2]:
Unnamed: 0 row.names pclass survived name age embarked home.dest room ticket boat sex
0 998 999 3rd 1 McCarthy, Miss Katie NaN NaN NaN NaN NaN NaN female
1 179 180 1st 0 Millet, Mr Francis Davis 65 Southampton East Bridgewater, MA NaN NaN (249) male
2 556 557 2nd 0 Sjostedt, Mr Ernst Adolf 59 Southampton Sault St Marie, ON NaN NaN NaN male
3 174 175 1st 0 McCaffry, Mr Thomas Francis 46 Cherbourg Vancouver, BC NaN NaN (292) male
4 1232 1233 3rd 0 Strilic, Mr Ivan NaN NaN NaN NaN NaN NaN male

In [3]:
data.columns


Out[3]:
Index(['Unnamed: 0', 'row.names', 'pclass', 'survived', 'name', 'age',
       'embarked', 'home.dest', 'room', 'ticket', 'boat', 'sex'],
      dtype='object')

In [7]:
median_age = data['age'].median()
print("Median age is {}".format(median_age))


Median age is 29.0

In [8]:
data['age'].fillna(median_age, inplace = True)
data['age'].head()


Out[8]:
0    29
1    65
2    59
3    46
4    29
Name: age, dtype: float64

In [10]:
data_inputs = data[["pclass", "age", "sex"]]
data_inputs.head()


Out[10]:
pclass age sex
0 3rd 29 female
1 1st 65 male
2 2nd 59 male
3 1st 46 male
4 3rd 29 male

In [11]:
expected_output = data[["survived"]]
expected_output.head()


Out[11]:
survived
0 1
1 0
2 0
3 0
4 0

In [13]:
data_inputs["pclass"].replace("3rd", 3, inplace = True)
data_inputs["pclass"].replace("2nd", 2, inplace = True)
data_inputs["pclass"].replace("1st", 1, inplace = True)
data_inputs.head()


Out[13]:
pclass age sex
0 3 29 female
1 1 65 male
2 2 59 male
3 1 46 male
4 3 29 male

In [14]:
data_inputs["sex"] = np.where(data_inputs["sex"] == "female", 0, 1)
data_inputs.head()


Out[14]:
pclass age sex
0 3 29 0
1 1 65 1
2 2 59 1
3 1 46 1
4 3 29 1

In [21]:
inputs_train, inputs_test, expected_output_train, expected_output_test   = train_test_split (data_inputs, expected_output, test_size = 0.33, random_state = 42)

print(inputs_train.head())
print(expected_output_train.head())


     pclass  age  sex
618       3   19    1
169       3   29    1
830       1   54    1
140       3   29    1
173       2   28    1
     survived
618         0
169         0
830         1
140         0
173         0

In [22]:
rf = RandomForestClassifier (n_estimators=100)

In [23]:
rf.fit(inputs_train, expected_output_train)


C:\st\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  if __name__ == '__main__':
Out[23]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
accuracy = rf.score(inputs_test, expected_output_test)
print("Accuracy = {}%".format(accuracy * 100))


Accuracy = 79.60526315789474%

In [20]:
joblib.dump(rf, "titanic_model1", compress=9)


Out[20]:
['titanic_model1']

In [ ]: