In [1]:
import numpy as np
import pandas as pd
# The machine learning algorithm
from sklearn.ensemble import RandomForestClassifier
# Test train split. The function lives in sklearn.model_selection;
# sklearn.cross_validation was removed in scikit-learn 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn
    from sklearn.cross_validation import train_test_split
# Used to write our model to a file. sklearn.externals.joblib was removed in
# scikit-learn 0.23; the standalone joblib package ships as a scikit-learn
# dependency.
try:
    import joblib
except ImportError:  # legacy fallback
    from sklearn.externals import joblib

# Just to switch off pandas warning
pd.options.mode.chained_assignment = None
In [2]:
# Read the Titanic training set from the working directory into a DataFrame
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="titanic_train.csv")
data.head(n=5)
Out[2]:
In [3]:
# List the column labels of the loaded dataset; later cells select
# "pclass", "age", "sex" and "survived" from it.
data.columns
Out[3]:
In [7]:
# Median of the "age" column (missing values are skipped by pandas); it is
# used below as the fill value for passengers with no recorded age.
median_age = data.loc[:, "age"].median()
print("Median age is {}".format(median_age))
In [8]:
# Replace missing ages with the median computed above.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data['age']: that chained in-place form acts on
# an intermediate Series and does not update `data` under pandas
# Copy-on-Write (it already emits a FutureWarning on pandas 2.x).
data["age"] = data["age"].fillna(median_age)
data["age"].head()
Out[8]:
In [10]:
# Feature matrix: passenger class, age and sex.
# Take an explicit copy so that the in-place encoding done on `data_inputs`
# further down operates on an independent frame rather than on a selection
# of `data` (which is what triggers pandas' chained-assignment warning).
data_inputs = data[["pclass", "age", "sex"]].copy()
data_inputs.head()
Out[10]:
In [11]:
# Target: the "survived" column, kept as a one-column DataFrame (list
# selector) rather than a Series.
expected_output = data.loc[:, ["survived"]]
expected_output.head()
Out[11]:
In [13]:
# Encode passenger class as a number: "1st" -> 1, "2nd" -> 2, "3rd" -> 3.
# Values not present in the mapping are left untouched, so re-running the
# cell is harmless.
#
# The column is rebuilt and rebound instead of calling
# data_inputs["pclass"].replace(..., inplace=True): that chained in-place form
# mutates an intermediate Series and does not update `data_inputs` under
# pandas Copy-on-Write. infer_objects() turns the resulting object column of
# Python ints into a proper integer dtype regardless of pandas version.
pclass_codes = {"1st": 1, "2nd": 2, "3rd": 3}
data_inputs = data_inputs.assign(
    pclass=data_inputs["pclass"].replace(pclass_codes).infer_objects()
)
data_inputs.head()
Out[13]:
In [14]:
# Encode sex as an integer flag: "female" -> 0, every other value -> 1
# (this includes missing values, which compare unequal to "female").
# NOTE: running this cell a second time maps every row to 1, because the
# column no longer contains the string "female".
data_inputs["sex"] = (data_inputs["sex"] != "female").astype(int)
data_inputs.head()
Out[14]:
In [21]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# partition the same on every run.
(
    inputs_train,
    inputs_test,
    expected_output_train,
    expected_output_test,
) = train_test_split(
    data_inputs,
    expected_output,
    test_size=0.33,
    random_state=42,
)
# Plain-text preview of the training features followed by the training labels.
print(inputs_train.head(), expected_output_train.head(), sep="\n")
In [22]:
# Random forest with 100 trees. random_state is fixed (same seed as the
# train/test split) so that the fitted model and the reported accuracy are
# reproducible across "Restart & Run All"; without it every run trains a
# different forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
In [23]:
# Train the forest. `expected_output_train` is a one-column DataFrame, i.e. a
# column vector of shape (n_samples, 1); np.ravel flattens it to the 1-D
# (n_samples,) shape scikit-learn expects for y, which avoids the
# DataConversionWarning raised for column-vector targets.
rf.fit(inputs_train, np.ravel(expected_output_train))
Out[23]:
In [24]:
# Mean accuracy of the fitted forest on the held-out rows, reported as a
# percentage.
accuracy = rf.score(X=inputs_test, y=expected_output_test)
print("Accuracy = {}%".format(100 * accuracy))
In [20]:
# Persist the trained classifier to the file "titanic_model1" in the working
# directory, using the strongest compression level (9).
joblib.dump(value=rf, filename="titanic_model1", compress=9)
Out[20]:
In [ ]: