In [1]:
import pandas as pd
import numpy as np
import re
import itertools

1. Load train and test data


In [2]:
train = pd.read_csv("data/train.csv")
train["dataset"] = "train"
train.head()


Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked dataset
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S train
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C train
2 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S train
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S train
4 5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 NaN S train

In [3]:
test = pd.read_csv("data/test.csv")
test["dataset"] = "test"
test.head()


Out[3]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked dataset
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q test
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S test
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q test
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S test
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S test

In [4]:
#Combine both datasets to predict families
train = train.append(test)
train.set_index(train["PassengerId"],inplace=True)

2. Tokenize name into (surname, title, first name and maiden name)


In [5]:
name_tokenizer = re.compile(r"^(?P<surname>[^,]+), (?P<title>[A-Z a-z]+?)\. (?P<f_name>[A-Z a-z.]+)?(?P<maiden_name>\([A-Za-z .]+\))?")

In [6]:
name_tokens = ["surname","title","f_name","maiden_name"]
for name_tk in name_tokens:
    train[name_tk] = train.Name.apply(lambda x: name_tokenizer.match(x).group(name_tk))
    test[name_tk] = test.Name.apply(lambda x: name_tokenizer.match(x).group(name_tk))
train.head(n=5)


Out[6]:
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket dataset surname title f_name maiden_name
PassengerId
1 22 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0 A/5 21171 train Braund Mr Owen Harris None
2 38 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1 PC 17599 train Cumings Mrs John Bradley (Florence Briggs Thayer)
3 26 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1 STON/O2. 3101282 train Heikkinen Miss Laina None
4 35 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1 113803 train Futrelle Mrs Jacques Heath (Lily May Peel)
5 35 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0 373450 train Allen Mr William Henry None

2.1 Extract features from Title variable


In [7]:
print train.groupby(["title","Sex"]).size()


title         Sex   
Capt          male        1
Col           male        4
Don           male        1
Dona          female      1
Dr            female      1
              male        7
Jonkheer      male        1
Lady          female      1
Major         male        2
Master        male       61
Miss          female    260
Mlle          female      2
Mme           female      1
Mr            male      757
Mrs           female    197
Ms            female      2
Rev           male        8
Sir           male        1
the Countess  female      1
dtype: int64

It seems we can extract some info from title

  1. Whether a woman is married Mme/Mrs vs Miss/Mlle vs Ms(Undetermined or single :/? )
  2. Master title apparently given to male kids
  3. Nobility vs laypeople : (Dr, Col, Capt, ...) vs (Mr,Master,Mrs,Miss). Ambiguous cases (Mlle,Mme,Ms,Don/Dona?)

In [8]:
#Encode special title following this logic
train.has_special_title = train.title.apply(lambda x: x not in ["Mr","Mrs","Miss","Mme","Mlle","Master"])

3 Examine marriages / sibling relationships


In [14]:
def is_married(couple_rows):
    are_married=False
    if couple_rows.irow(0).Sex != couple_rows.irow(1).Sex:
        #Get who is the husband and whose the wife
        man = couple_rows.irow(0) if couple_rows.irow(0).Sex == "male" else couple_rows.irow(1)
        woman = couple_rows.irow(0) if couple_rows.irow(0).Sex == "female" else couple_rows.irow(1)

        #Marriage tests
        marriage_tests = {}
        marriage_tests["same_f_name"] = woman.f_name is not None and woman.f_name in man.f_name
        marriage_tests["consistent_title"] =  woman.title not in ("Miss","Mlle") and man.title != "Master"
        marriage_tests["same_ticket"] = woman.Ticket == man.Ticket
        marriage_tests["same_pclass"] = woman.Pclass == man.Pclass
        marriage_tests["legal_age"] = (woman.title in ("Mme","Mrs") or woman.Age >= 10) and man.Age > 10
        marriage_tests["consistent_SibSp"] = (woman.SibSp > 0 and man.SibSp > 0) or (woman.SibSp == man.SibSp)

        are_married = marriage_tests["same_f_name"] and marriage_tests["legal_age"] or ( )
        
        consistency_checks = ( marriage_tests["consistent_title"] and 
                               marriage_tests["legal_age"] and 
                              marriage_tests["same_pclass"] and 
                              marriage_tests["same_ticket"] and
                              marriage_tests["consistent_SibSp"])

        if are_married and not consistency_checks:
            failed_tests = ", ".join("{}:{}".format(x,marriage_tests[x]) for x in marriage_tests if not marriage_tests[x])
            print "WARNING: Sketchy marriage: {}".format(failed_tests)
            print couple_rows
            
            print

    return are_married

Initialize data structures for algorithm


In [15]:
#Data structures - sets to keep track which ones have already been assigned
married_people = set()
people_with_parents = set()

In [16]:
links_to_assign = train[["SibSp","Parch"]]
#Matches a couple with the Max amount of kids they can have 
#Which is the min(husband.Parch, wife.Parch)
marriages_table = {}

1. Extract marriages in greedy fashion. Assume is_married has no fp ( might have actually :/ )


In [17]:
#Subset only people who have spouses/siblings on the boat
train_sibsp = train.ix[ train.SibSp > 0]
#People grouped by surname
surname_groups = train_sibsp.groupby("surname").groups

In [18]:
for surname in surname_groups:
    surname_rows = surname_groups[surname]
    couples = itertools.combinations(surname_rows,2)
    for cpl in couples:
        cpl_rows = train_sibsp.ix[list(cpl)]
        if is_married(cpl_rows):
            #Make sure we're not marrying somebody twice :p
            assert cpl[0] not in married_people,"{} is already married :/".format(cpl[0])
            assert cpl[1] not in married_people,"{} is already married :/".format(cpl[1])
            
            #add couples to married set
            married_people.add(cpl[0])
            married_people.add(cpl[1])
            
            marriages_table[cpl] = min(links_to_assign.ix[cpl[0]]["Parch"], links_to_assign.ix[cpl[1]]["Parch"] )
            
            
            #print
#    break

In [19]:
marriages_table


Out[19]:
{(26, 1066): 5, (94, 924): 2, (152, 337): 0, (609, 686): 2, (737, 1059): 2}

In [ ]:
train.ix[list((26,1066))]

In [26]:
train.ix[ (train.SibSp > 0) | (train.Parch > 0) ].shape


Out[26]:
(519, 17)

In [22]:
train


Out[22]:
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket dataset surname title f_name maiden_name
PassengerId
1 22.0 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0 A/5 21171 train Braund Mr Owen Harris None
2 38.0 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1 PC 17599 train Cumings Mrs John Bradley (Florence Briggs Thayer)
3 26.0 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1 STON/O2. 3101282 train Heikkinen Miss Laina None
4 35.0 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1 113803 train Futrelle Mrs Jacques Heath (Lily May Peel)
5 35.0 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0 373450 train Allen Mr William Henry None
6 NaN NaN Q 8.4583 Moran, Mr. James 0 6 3 male 0 0 330877 train Moran Mr James None
7 54.0 E46 S 51.8625 McCarthy, Mr. Timothy J 0 7 1 male 0 0 17463 train McCarthy Mr Timothy J None
8 2.0 NaN S 21.0750 Palsson, Master. Gosta Leonard 1 8 3 male 3 0 349909 train Palsson Master Gosta Leonard None
9 27.0 NaN S 11.1333 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 2 9 3 female 0 1 347742 train Johnson Mrs Oscar W (Elisabeth Vilhelmina Berg)
10 14.0 NaN C 30.0708 Nasser, Mrs. Nicholas (Adele Achem) 0 10 2 female 1 1 237736 train Nasser Mrs Nicholas (Adele Achem)
11 4.0 G6 S 16.7000 Sandstrom, Miss. Marguerite Rut 1 11 3 female 1 1 PP 9549 train Sandstrom Miss Marguerite Rut None
12 58.0 C103 S 26.5500 Bonnell, Miss. Elizabeth 0 12 1 female 0 1 113783 train Bonnell Miss Elizabeth None
13 20.0 NaN S 8.0500 Saundercock, Mr. William Henry 0 13 3 male 0 0 A/5. 2151 train Saundercock Mr William Henry None
14 39.0 NaN S 31.2750 Andersson, Mr. Anders Johan 5 14 3 male 1 0 347082 train Andersson Mr Anders Johan None
15 14.0 NaN S 7.8542 Vestrom, Miss. Hulda Amanda Adolfina 0 15 3 female 0 0 350406 train Vestrom Miss Hulda Amanda Adolfina None
16 55.0 NaN S 16.0000 Hewlett, Mrs. (Mary D Kingcome) 0 16 2 female 0 1 248706 train Hewlett Mrs None (Mary D Kingcome)
17 2.0 NaN Q 29.1250 Rice, Master. Eugene 1 17 3 male 4 0 382652 train Rice Master Eugene None
18 NaN NaN S 13.0000 Williams, Mr. Charles Eugene 0 18 2 male 0 1 244373 train Williams Mr Charles Eugene None
19 31.0 NaN S 18.0000 Vander Planke, Mrs. Julius (Emelia Maria Vande... 0 19 3 female 1 0 345763 train Vander Planke Mrs Julius (Emelia Maria Vandemoortele)
20 NaN NaN C 7.2250 Masselmani, Mrs. Fatima 0 20 3 female 0 1 2649 train Masselmani Mrs Fatima None
21 35.0 NaN S 26.0000 Fynney, Mr. Joseph J 0 21 2 male 0 0 239865 train Fynney Mr Joseph J None
22 34.0 D56 S 13.0000 Beesley, Mr. Lawrence 0 22 2 male 0 1 248698 train Beesley Mr Lawrence None
23 15.0 NaN Q 8.0292 McGowan, Miss. Anna "Annie" 0 23 3 female 0 1 330923 train McGowan Miss Anna None
24 28.0 A6 S 35.5000 Sloper, Mr. William Thompson 0 24 1 male 0 1 113788 train Sloper Mr William Thompson None
25 8.0 NaN S 21.0750 Palsson, Miss. Torborg Danira 1 25 3 female 3 0 349909 train Palsson Miss Torborg Danira None
26 38.0 NaN S 31.3875 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... 5 26 3 female 1 1 347077 train Asplund Mrs Carl Oscar (Selma Augusta Emilia Johansson)
27 NaN NaN C 7.2250 Emir, Mr. Farred Chehab 0 27 3 male 0 0 2631 train Emir Mr Farred Chehab None
28 19.0 C23 C25 C27 S 263.0000 Fortune, Mr. Charles Alexander 2 28 1 male 3 0 19950 train Fortune Mr Charles Alexander None
29 NaN NaN Q 7.8792 O'Dwyer, Miss. Ellen "Nellie" 0 29 3 female 0 1 330959 train O'Dwyer Miss Ellen None
30 NaN NaN S 7.8958 Todoroff, Mr. Lalio 0 30 3 male 0 0 349216 train Todoroff Mr Lalio None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1280 21.0 NaN Q 7.7500 Canavan, Mr. Patrick 0 1280 3 male 0 NaN 364858 test Canavan Mr Patrick None
1281 6.0 NaN S 21.0750 Palsson, Master. Paul Folke 1 1281 3 male 3 NaN 349909 test Palsson Master Paul Folke None
1282 23.0 B24 S 93.5000 Payne, Mr. Vivian Ponsonby 0 1282 1 male 0 NaN 12749 test Payne Mr Vivian Ponsonby None
1283 51.0 D28 S 39.4000 Lines, Mrs. Ernest H (Elizabeth Lindsey James) 1 1283 1 female 0 NaN PC 17592 test Lines Mrs Ernest H (Elizabeth Lindsey James)
1284 13.0 NaN S 20.2500 Abbott, Master. Eugene Joseph 2 1284 3 male 0 NaN C.A. 2673 test Abbott Master Eugene Joseph None
1285 47.0 NaN S 10.5000 Gilbert, Mr. William 0 1285 2 male 0 NaN C.A. 30769 test Gilbert Mr William None
1286 29.0 NaN S 22.0250 Kink-Heilmann, Mr. Anton 1 1286 3 male 3 NaN 315153 test Kink-Heilmann Mr Anton None
1287 18.0 C31 S 60.0000 Smith, Mrs. Lucien Philip (Mary Eloise Hughes) 0 1287 1 female 1 NaN 13695 test Smith Mrs Lucien Philip (Mary Eloise Hughes)
1288 24.0 NaN Q 7.2500 Colbert, Mr. Patrick 0 1288 3 male 0 NaN 371109 test Colbert Mr Patrick None
1289 48.0 B41 C 79.2000 Frolicher-Stehli, Mrs. Maxmillian (Margaretha ... 1 1289 1 female 1 NaN 13567 test Frolicher-Stehli Mrs Maxmillian (Margaretha Emerentia Stehli)
1290 22.0 NaN S 7.7750 Larsson-Rondberg, Mr. Edvard A 0 1290 3 male 0 NaN 347065 test Larsson-Rondberg Mr Edvard A None
1291 31.0 NaN Q 7.7333 Conlon, Mr. Thomas Henry 0 1291 3 male 0 NaN 21332 test Conlon Mr Thomas Henry None
1292 30.0 C7 S 164.8667 Bonnell, Miss. Caroline 0 1292 1 female 0 NaN 36928 test Bonnell Miss Caroline None
1293 38.0 NaN S 21.0000 Gale, Mr. Harry 0 1293 2 male 1 NaN 28664 test Gale Mr Harry None
1294 22.0 NaN C 59.4000 Gibson, Miss. Dorothy Winifred 1 1294 1 female 0 NaN 112378 test Gibson Miss Dorothy Winifred None
1295 17.0 NaN S 47.1000 Carrau, Mr. Jose Pedro 0 1295 1 male 0 NaN 113059 test Carrau Mr Jose Pedro None
1296 43.0 D40 C 27.7208 Frauenthal, Mr. Isaac Gerald 0 1296 1 male 1 NaN 17765 test Frauenthal Mr Isaac Gerald None
1297 20.0 D38 C 13.8625 Nourney, Mr. Alfred (Baron von Drachstedt")" 0 1297 2 male 0 NaN SC/PARIS 2166 test Nourney Mr Alfred None
1298 23.0 NaN S 10.5000 Ware, Mr. William Jeffery 0 1298 2 male 1 NaN 28666 test Ware Mr William Jeffery None
1299 50.0 C80 C 211.5000 Widener, Mr. George Dunton 1 1299 1 male 1 NaN 113503 test Widener Mr George Dunton None
1300 NaN NaN Q 7.7208 Riordan, Miss. Johanna Hannah"" 0 1300 3 female 0 NaN 334915 test Riordan Miss Johanna Hannah None
1301 3.0 NaN S 13.7750 Peacock, Miss. Treasteall 1 1301 3 female 1 NaN SOTON/O.Q. 3101315 test Peacock Miss Treasteall None
1302 NaN NaN Q 7.7500 Naughton, Miss. Hannah 0 1302 3 female 0 NaN 365237 test Naughton Miss Hannah None
1303 37.0 C78 Q 90.0000 Minahan, Mrs. William Edward (Lillian E Thorpe) 0 1303 1 female 1 NaN 19928 test Minahan Mrs William Edward (Lillian E Thorpe)
1304 28.0 NaN S 7.7750 Henriksson, Miss. Jenny Lovisa 0 1304 3 female 0 NaN 347086 test Henriksson Miss Jenny Lovisa None
1305 NaN NaN S 8.0500 Spector, Mr. Woolf 0 1305 3 male 0 NaN A.5. 3236 test Spector Mr Woolf None
1306 39.0 C105 C 108.9000 Oliva y Ocana, Dona. Fermina 0 1306 1 female 0 NaN PC 17758 test Oliva y Ocana Dona Fermina None
1307 38.5 NaN S 7.2500 Saether, Mr. Simon Sivertsen 0 1307 3 male 0 NaN SOTON/O.Q. 3101262 test Saether Mr Simon Sivertsen None
1308 NaN NaN S 8.0500 Ware, Mr. Frederick 0 1308 3 male 0 NaN 359309 test Ware Mr Frederick None
1309 NaN NaN C 22.3583 Peter, Master. Michael J 1 1309 3 male 1 NaN 2668 test Peter Master Michael J None

1309 rows × 17 columns


In [ ]: