In [1]:
import csv
import numpy as np
import pandas as pd
In [2]:
# We can use the pandas library in Python to read in the CSV file.
# This creates a pandas dataframe and assigns it to the titanic variable.
titanic = pd.read_csv("data/train.csv")
# Print the first 5 rows of the dataframe.
print(titanic.head(5))
In [3]:
print(titanic.describe())
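The describe() output shows that Age has fewer non-null entries than the other numeric columns, so some ages are missing. A quick way to see the missing count for every column (a small check, not part of the original run):

print(titanic.isnull().sum())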
In [13]:
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
In [5]:
print(titanic["Cabin"].unique())
In [7]:
import re

# A function to get the cabin section from a cabin value.
def get_cabin_section(cabin):
    # Use a regular expression to search for the leading letters of the cabin.
    cabin_search = re.search('([A-Za-z]+)', cabin)
    # If a match exists, extract and return it.
    if cabin_search:
        return cabin_search.group(1)
    return "N"
In [8]:
# Get all the cabin sections
titanic = pd.read_csv("data/train.csv")
titanic["Cabin"] = titanic["Cabin"].fillna("")
cabin_section = titanic["Cabin"].apply(get_cabin_section)
unique_cabins = cabin_section.unique()
print(unique_cabins)
In [9]:
# Map each cabin section to an integer code.
section_mapping = {}
for i, c in enumerate(unique_cabins):
    section_mapping[c] = i
print(section_mapping)
for k, v in section_mapping.items():
    cabin_section[cabin_section == k] = v
titanic["CabinSection"] = cabin_section
print(titanic["CabinSection"].unique())
In [10]:
# Find all the unique genders -- the column appears to contain only male and female.
print(titanic["Sex"].unique())
# Replace all the occurrences of male with the number 0.
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
In [11]:
# Find all the unique values for "Embarked".
print(titanic["Embarked"].unique())
titanic["Embarked"] = titanic["Embarked"].fillna("S")
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2
In [16]:
# Note: sklearn.cross_validation was deprecated in scikit-learn 0.18 in favor of sklearn.model_selection.
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm with the default parameters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
In [19]:
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
In [20]:
# Generating a familysize column
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# The .apply method generates a new series
titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))
Extract the title using a regular expression and map each unique title to an integer value.
In [21]:
import re
import pandas

# A function to get the title from a name.
def get_title(name):
    # Use a regular expression to search for a title. Titles always consist of capital and lowercase letters, and end with a period.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
# Get all the titles and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(pandas.value_counts(titles))
# Map each title to an integer. Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    titles[titles == k] = v
# Verify that we converted everything.
print(pandas.value_counts(titles))
# Add in the title column.
titanic["Title"] = titles
We can also generate a feature indicating which family people are in. Because survival likely depended heavily on family and the people around you, this has a good chance of being a useful feature.
To get this, we'll concatenate each passenger's last name with FamilySize to get a unique family id. We'll then be able to assign a code to each person based on their family id.
In [22]:
import operator

# A dictionary mapping family name to id
family_id_mapping = {}

# A function to get the id given a row
def get_family_id(row):
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family id
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    # Look up the id in the mapping
    if family_id not in family_id_mapping:
        if len(family_id_mapping) == 0:
            current_id = 1
        else:
            # If we don't have an id yet, get the maximum id from the mapping and add one to it
            current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)
        family_id_mapping[family_id] = current_id
    return family_id_mapping[family_id]
# Get the family ids with the apply method
family_ids = titanic.apply(get_family_id, axis=1)
# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.
family_ids[titanic["FamilySize"] < 3] = -1
# Print the count of each unique id.
print(pandas.value_counts(family_ids))
titanic["FamilyId"] = family_ids
In [26]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "FamilyId", "CabinSection"]
# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# Get the raw p-values for each feature, and transform from p-values into scores
scores = -np.log10(selector.pvalues_)
# Plot the scores. See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
# Pick only the best features.
predictors = ["Pclass", "Sex", "Fare", "Title", "CabinSection"]
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
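For a non-graphical view of the same scores, each feature can be paired with its score (a small convenience snippet; it restates the full predictor list because predictors was reassigned above):

all_predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "FamilyId", "CabinSection"]
for name, score in sorted(zip(all_predictors, scores), key=lambda pair: pair[1], reverse=True):
    print(name, score)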
In [62]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
import numpy as np
# The algorithms we want to ensemble.
# We're using the more linear predictors for the logistic regression, and everything with the gradient boosting classifier.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title", "FamilyId", "CabinSection"]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked", "CabinSection"]],
    [RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4), ["Pclass", "Sex", "Fare", "Title", "CabinSection"]]
]
# Initialize the cross validation folds
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data.
        alg.fit(titanic[predictors].iloc[train,:], train_target)
        # Select and predict on the test fold.
        # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]
        full_test_predictions.append(test_predictions)
    # Use a weighted ensembling scheme -- the gradient boosting classifier gets the most weight because it generates the best predictions.
    test_predictions = (4*full_test_predictions[0] + 2*full_test_predictions[1] + full_test_predictions[2]) / 7
    # Any value over .5 is treated as a 1 prediction, and .5 or below as a 0 prediction.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)
# Put all the predictions together into one array.
predictions = np.concatenate(predictions, axis=0)
# Compute accuracy as the fraction of predictions that match the training labels.
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
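As a sanity check on the 4/2/1 weighting, here is the arithmetic for one hypothetical passenger whose three models disagree:

p_gbc, p_lr, p_rf = 0.9, 0.6, 0.3  # hypothetical per-model probabilities
print((4 * p_gbc + 2 * p_lr + p_rf) / 7)  # about 0.73, above .5, so the ensemble predicts survival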
In [54]:
import pandas

titanic_test = pandas.read_csv("data/test.csv")
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2
# Get all the cabin sections for the test set.
titanic_test["Cabin"] = titanic_test["Cabin"].fillna("")
cabin_section = titanic_test["Cabin"].apply(get_cabin_section)
# Reuse the section codes built from the training set so the same section maps to the same integer.
for k, v in section_mapping.items():
    cabin_section[cabin_section == k] = v
titanic_test["CabinSection"] = cabin_section
print(section_mapping)
# First, we'll add titles to the test set.
titles = titanic_test["Name"].apply(get_title)
# We're adding the Dona title to the mapping, because it's in the test set, but not the training set
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2, "Dona": 10}
for k, v in title_mapping.items():
    titles[titles == k] = v
titanic_test["Title"] = titles
# Check the counts of each unique title.
print(pandas.value_counts(titanic_test["Title"]))
# Now, we add the family size column.
titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"]
# Now we can add family ids.
# We'll use the same ids that we did earlier.
family_ids = titanic_test.apply(get_family_id, axis=1)
family_ids[titanic_test["FamilySize"] < 3] = -1
titanic_test["FamilyId"] = family_ids
titanic_test["NameLength"] = titanic_test["Name"].apply(lambda x: len(x))
In [63]:
predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title", "FamilyId", "CabinSection"]
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked", "CabinSection"]],
    [RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4), ["Pclass", "Sex", "Fare", "Title", "CabinSection"]]
]
full_predictions = []
for alg, predictors in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(titanic[predictors], titanic["Survived"])
    # Predict using the test dataset. We have to convert all the columns to floats to avoid an error.
    predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:,1]
    full_predictions.append(predictions)
# The gradient boosting classifier generates better predictions, so we weight it higher.
predictions = (full_predictions[0] * 4 + 2 * full_predictions[1] + full_predictions[2]) / 7
In [64]:
predictions[predictions <= .5] = 0
predictions[predictions > .5] = 1
predictions = predictions.astype(int)
submission = pandas.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions
})
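A quick look at the submission frame before writing it out:

print(submission.head())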
In [65]:
# Make a Kaggle submission file from the test set predictions, with PassengerId and Survived columns.
submission.to_csv("kaggle.csv", index=False)
In [58]:
print(submission.shape)