In [1]:
import pandas as pd
import numpy as np
import re
import itertools
In [2]:
# Read the training split and tag every row with the split it came from,
# so the rows can still be told apart once both splits are combined.
train = pd.read_csv("data/train.csv").assign(dataset="train")
train.head()
Out[2]:
In [3]:
# Read the test split and tag every row with the split it came from,
# so the rows can still be told apart once both splits are combined.
test = pd.read_csv("data/test.csv").assign(dataset="test")
test.head()
Out[3]:
In [4]:
# Combine both datasets to predict families.
# NOTE: from here on `train` holds the rows of BOTH splits; the "dataset"
# column tells them apart.
#
# Only rows tagged "train" are kept before concatenating, so re-running this
# cell does not stack a second copy of the test rows onto the frame.
# pd.concat replaces DataFrame.append, which no longer exists in current pandas.
train = pd.concat([train[train["dataset"] == "train"], test])
# Index by PassengerId but keep it as a regular column as well (drop=False).
train = train.set_index("PassengerId", drop=False)
In [5]:
# Splits a raw "Name" value of the form
#   "<surname>, <title>. <names> (<parenthesised part>)"
# into four named groups:
#   surname     - everything before the first comma
#   title       - shortest run of letters/spaces before ". "
#   f_name      - optional letters/spaces/dots after the title; when a
#                 parenthesised part follows, this group keeps the space
#                 that precedes the "("
#   maiden_name - optional parenthesised part, parentheses included
# Adjacent raw literals are concatenated into a single pattern.
name_tokenizer = re.compile(
    r"^(?P<surname>[^,]+), "
    r"(?P<title>[A-Z a-z]+?)\. "
    r"(?P<f_name>[A-Z a-z.]+)?"
    r"(?P<maiden_name>\([A-Za-z .]+\))?"
)
In [6]:
def _name_token(name, token):
    """Return the named group ``token`` of ``name_tokenizer`` for ``name``.

    The captured text is stripped of surrounding whitespace: the ``f_name``
    group otherwise keeps the space that precedes a parenthesised part
    ("John Bradley " instead of "John Bradley"), which defeats later
    comparisons between names.

    Returns None when ``name`` is missing, does not match the pattern, the
    group did not take part in the match, or the captured text is blank.
    """
    if pd.isnull(name):
        return None
    match = name_tokenizer.match(name)
    if match is None:
        return None
    value = match.group(token)
    if value is None:
        return None
    return value.strip() or None


# One new column per named group of the tokenizer, on the combined frame and
# on the stand-alone test frame.
name_tokens = ["surname", "title", "f_name", "maiden_name"]
for name_tk in name_tokens:
    train[name_tk] = train.Name.apply(_name_token, args=(name_tk,))
    test[name_tk] = test.Name.apply(_name_token, args=(name_tk,))
train.head(n=5)
Out[6]:
In [7]:
# Number of passengers for every (title, Sex) combination.
# Written as a call with a single argument so it prints the same thing on
# Python 2 and is no longer a syntax error on Python 3.
print(train.groupby(["title", "Sex"]).size())
It seems we can extract some useful information from the title.
In [8]:
# Encode special titles following this logic: a title is "special" when it is
# not one of the common ones listed below (a missing title also counts as special).
common_titles = ["Mr", "Mrs", "Miss", "Mme", "Mlle", "Master"]
# Bracket assignment is required to create a column: `train.has_special_title = ...`
# only sets a Python attribute on the object and never shows up in the frame.
train["has_special_title"] = ~train["title"].isin(common_titles)
In [14]:
def _clean_f_name(value):
    """Return ``value`` without surrounding whitespace, or None if missing or blank."""
    if value is None or pd.isnull(value):
        return None
    return value.strip() or None


def is_married(couple_rows):
    """Guess whether the two passengers in ``couple_rows`` are husband and wife.

    Parameters
    ----------
    couple_rows : DataFrame
        Exactly the two candidate rows. The columns read are ``Sex``,
        ``f_name``, ``title``, ``Ticket``, ``Pclass``, ``Age`` and ``SibSp``.

    Returns
    -------
    bool
        True when the pair has different ``Sex`` values, the woman's
        ``f_name`` is contained in the man's ``f_name`` and the "legal_age"
        test passes; False otherwise.

    The remaining tests (title, ticket, class, SibSp) never change the
    result. They are consistency checks: when a pair is considered married
    but one of them fails, a warning listing the failed tests is printed
    together with the two rows.
    """
    first = couple_rows.iloc[0]
    second = couple_rows.iloc[1]

    # Two rows with the same Sex value are never treated as a couple.
    if first.Sex == second.Sex:
        return False

    # Get who is the husband and who is the wife
    man = first if first.Sex == "male" else second
    woman = first if first.Sex == "female" else second

    # Names are compared stripped: the tokenizer can leave a trailing space on
    # f_name, and either side may be missing altogether.
    man_f_name = _clean_f_name(man.f_name)
    woman_f_name = _clean_f_name(woman.f_name)

    # Marriage tests (each one stored as a plain bool)
    marriage_tests = {}
    marriage_tests["same_f_name"] = bool(
        man_f_name is not None
        and woman_f_name is not None
        and woman_f_name in man_f_name
    )
    marriage_tests["consistent_title"] = bool(
        woman.title not in ("Miss", "Mlle") and man.title != "Master"
    )
    marriage_tests["same_ticket"] = bool(woman.Ticket == man.Ticket)
    marriage_tests["same_pclass"] = bool(woman.Pclass == man.Pclass)
    # A missing Age compares as False, so a man without a recorded age fails this test.
    marriage_tests["legal_age"] = bool(
        (woman.title in ("Mme", "Mrs") or woman.Age >= 10) and man.Age > 10
    )
    marriage_tests["consistent_SibSp"] = bool(
        (woman.SibSp > 0 and man.SibSp > 0) or (woman.SibSp == man.SibSp)
    )

    are_married = marriage_tests["same_f_name"] and marriage_tests["legal_age"]

    consistency_checks = (
        marriage_tests["consistent_title"]
        and marriage_tests["legal_age"]
        and marriage_tests["same_pclass"]
        and marriage_tests["same_ticket"]
        and marriage_tests["consistent_SibSp"]
    )
    if are_married and not consistency_checks:
        failed_tests = ", ".join(
            "{}:{}".format(x, marriage_tests[x])
            for x in marriage_tests
            if not marriage_tests[x]
        )
        # Single-argument print calls behave the same on Python 2 and Python 3.
        print("WARNING: Sketchy marriage: {}".format(failed_tests))
        print(couple_rows)
        print("")
    return are_married
In [15]:
# Bookkeeping: two independent, initially empty sets that record which
# passengers have already been assigned (as a spouse / as somebody's child).
married_people, people_with_parents = set(), set()
In [16]:
# Maps a couple to the maximum number of kids they can have,
# which is min(husband.Parch, wife.Parch). Starts out empty.
marriages_table = dict()

# The SibSp / Parch counts of every passenger, i.e. the family links
# that still have to be assigned.
links_to_assign = train.loc[:, ["SibSp", "Parch"]]
In [17]:
# Subset only people who have spouses/siblings on the boat.
# .loc with a boolean mask replaces .ix, which no longer exists in current pandas.
train_sibsp = train.loc[train.SibSp > 0]
# People grouped by surname: surname -> index labels of the rows sharing it
surname_groups = train_sibsp.groupby("surname").groups
In [18]:
# Start from a clean slate so the cell can be re-run: otherwise the
# "already married" assertions below fire on the couples recorded by the
# previous run.
married_people.clear()
marriages_table.clear()

for surname in surname_groups:
    surname_rows = surname_groups[surname]
    # Every unordered pair of passengers sharing the surname is a candidate couple.
    for cpl in itertools.combinations(surname_rows, 2):
        # Label-based lookup (.loc) replaces .ix, which no longer exists in current pandas.
        cpl_rows = train_sibsp.loc[list(cpl)]
        if is_married(cpl_rows):
            # Make sure we're not marrying somebody twice :p
            assert cpl[0] not in married_people, "{} is already married :/".format(cpl[0])
            assert cpl[1] not in married_people, "{} is already married :/".format(cpl[1])
            # Add the couple to the married set
            married_people.add(cpl[0])
            married_people.add(cpl[1])
            # Upper bound on the couple's kids: the smaller of the two Parch counts
            marriages_table[cpl] = min(
                links_to_assign.loc[cpl[0], "Parch"],
                links_to_assign.loc[cpl[1], "Parch"],
            )
In [19]:
# Show the detected couples: (index label, index label) -> min Parch of the pair
marriages_table
Out[19]:
In [ ]:
# Look at the rows with index labels 26 and 1066 side by side.
# .loc replaces .ix, which no longer exists in current pandas.
train.loc[[26, 1066]]
In [26]:
# Shape of the subset of passengers with at least one sibling/spouse or
# parent/child link. .loc replaces .ix, which no longer exists in current pandas.
train.loc[(train.SibSp > 0) | (train.Parch > 0)].shape
Out[26]:
In [22]:
# Display the combined frame (rows of both splits, plus the derived name columns)
train
Out[22]:
In [ ]: