Titanic: Machine Learning from Disaster

Olivier RISSER-MAROIX (VieVie31)

In [1]:
import graphlab as gl

In [2]:
data_train = gl.load_sframe("train.csv")
data_test = gl.load_sframe("test.csv")

In [3]:

PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket
1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171
2 1 1 Cumings, Mrs. John
Bradley (Florence Briggs ...
female 38.0 1 0 PC 17599
3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282
Fare Cabin Embarked
7.25 S
71.2833 C85 C
7.925 S
[3 rows x 12 columns]

Cleanning trainning data

In [4]:
data_train["male"] = data_train["Sex"] == "male"
data_train["female"] = data_train["Sex"] == "female"
data_train = data_train.remove_column("Sex")

In [5]:
data_train["no_age"] = data_train["Age"] == None
data_train["Age"] = gl.SArray([0 if v == None else v for v in data_train["Age"]])

In [6]:
data_train["embarked_s"] = data_train["Embarked"] == "S"
data_train["embarked_c"] = data_train["Embarked"] == "C"
data_train["embarked_q"] = data_train["Embarked"] == "Q"
data_train["embarked_none"] = data_train["Embarked"] == None
data_train = data_train.remove_column("Embarked")

In [7]:
data_train["1_class"] = data_train["Pclass"] == 1
data_train["2_class"] = data_train["Pclass"] == 2
data_train["3_class"] = data_train["Pclass"] == 3
data_train = data_train.remove_column("Pclass")

In [8]:
print data_train.head(3)
print data_train["Ticket"]
for v in data_train["Ticket"]:
    print v, " ",

| PassengerId | Survived |              Name             | Age  | SibSp | Parch |
|      1      |    0     |    Braund, Mr. Owen Harris    | 22.0 |   1   |   0   |
|      2      |    1     | Cumings, Mrs. John Bradley... | 38.0 |   1   |   0   |
|      3      |    1     |     Heikkinen, Miss. Laina    | 26.0 |   0   |   0   |
|      Ticket      |   Fare  | Cabin | male | female | no_age | embarked_s | embarked_c |
|    A/5 21171     |   7.25  |       |  1   |   0    |   0    |     1      |     0      |
|     PC 17599     | 71.2833 |  C85  |  0   |   1    |   0    |     0      |     1      |
| STON/O2. 3101282 |  7.925  |       |  0   |   1    |   0    |     1      |     0      |
| embarked_q | embarked_none | 1_class | 2_class | 3_class |
|     0      |       0       |    0    |    0    |    1    |
|     0      |       0       |    1    |    0    |    0    |
|     0      |       0       |    0    |    0    |    1    |
[3 rows x 19 columns]

In [9]:
#processing the tickets numbers
#for try remove the non alpha numerics numbers
def toNumber(string):
    s = "0"
    for v in string:
        if v in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]:
            s += v
    return int(s)

data_train["PC"] = gl.SArray(["PC" in v for v in data_train["Ticket"]])
data_train["CA"] = gl.SArray(["CA" in v for v in data_train["Ticket"]])
data_train["C.A."] = gl.SArray(["C.A." in v for v in data_train["Ticket"]])
data_train["W./C."] = gl.SArray(["W./C." in v for v in data_train["Ticket"]])
data_train["SOTON"] = gl.SArray(["SOTON" in v for v in data_train["Ticket"]])
data_train["number"] = gl.SArray([toNumber(v) for v in data_train["Ticket"]])

PassengerId Survived Name Age SibSp Parch Ticket Fare Cabin male
1 0 Braund, Mr. Owen Harris 22.0 1 0 A/5 21171 7.25 1
2 1 Cumings, Mrs. John
Bradley (Florence Briggs ...
38.0 1 0 PC 17599 71.2833 C85 0
3 1 Heikkinen, Miss. Laina 26.0 0 0 STON/O2. 3101282 7.925 0
female no_age embarked_s embarked_c embarked_q embarked_none 1_class 2_class 3_class PC CA C.A.
0 0 1 0 0 0 0 0 1 0 0 0
1 0 0 1 0 0 1 0 0 1 0 0
1 0 1 0 0 0 0 0 1 0 0 0
W./C. SOTON number
0 0 521171
0 0 17599
0 0 23101282
[3 rows x 25 columns]

In [10]:
import re

civilite_pattern = re.compile(r" ([A-Za-z])+.")

def get_civilite(name):
        return civilite_pattern.search(name).group(0)
        return ""

civilites_lst = set([get_civilite(v) if get_civilite(v)[-1] == '.' else '' for v in data_train["Name"]])
print civilites_lst

for c in civilites_lst:
    data_train[c] = gl.SArray([get_civilite(v) == c for v in data_train["Name"]])


set([' Miss.', ' Rev.', ' Capt.', ' Mlle.', ' Mrs.', ' Master.', ' Col.', ' Jonkheer.', ' Mr.', ' Ms.', ' Mme.', ' Major.', ' Dr.', ' Don.'])
PassengerId Survived Name Age SibSp Parch Ticket Fare Cabin male female no_age
1 0 Braund, Mr. Owen Harris 22.0 1 0 A/5 21171 7.25 1 0 0
embarked_s embarked_c embarked_q embarked_none 1_class 2_class 3_class PC CA C.A. W./C. SOTON
1 0 0 0 0 0 1 0 0 0 0 0
number Miss. Rev. Capt. Mlle. Mrs. Master. Col. Jonkheer. Mr. Ms. Mme. Major. Dr.
521171 0 0 0 0 0 0 0 0 1 0 0 0 0
[1 rows x 39 columns]

In [11]:
def cabin_letter(cabin):
        return cabin[0]
        return ""

cabin_letters = set([cabin_letter(v) for v in data_train["Cabin"]])
print cabin_letters

for c in cabin_letters:
    data_train[c] = gl.SArray([cabin_letter(v) == c for v in data_train["Cabin"]])


set(['A', 'C', 'B', 'E', 'D', 'G', 'F', 'T'])
In [12]:
def cabin_number(cabin):
    return toNumber(cabin)

data_train["cabin_number"] = gl.SArray([cabin_number(v) for v in data_train["Cabin"]])

In [33]:
train_set_1, train_set_2 = data_train.random_split(.8)

In [14]:
print train_set_1.head(1)
features = ["Age", "SibSp", "Parch", "Fare", "male", "female", "no_age", 
            "embarked_s", "embarked_c", "embarked_q", "embarked_none",
            "1_class", "2_class", "3_class", 
            "CA", "C.A.", "W./C.", "SOTON", 
            "cabin_number"] + list(civilites_lst) + list(cabin_letters) #, "number"]

| PassengerId | Survived |           Name          | Age  | SibSp | Parch |
|      1      |    0     | Braund, Mr. Owen Harris | 22.0 |   1   |   0   |
|   Ticket  | Fare | Cabin | male | female | no_age | embarked_s | embarked_c |
| A/5 21171 | 7.25 |       |  1   |   0    |   0    |     1      |     0      |
| embarked_q | embarked_none | 1_class | 2_class | 3_class | PC | ... |
|     0      |       0       |    0    |    0    |    1    | 0  | ... |
[1 rows x 48 columns]

Create logistic model

In [15]:

In [34]:
simple_logistic_classifier = gl.classifier.logistic_classifier.create(train_set_1, target="Survived", 
                                                                      features=features, validation_set=train_set_2)

WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Logistic regression:
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
Number of coefficients    : 42
Starting Newton Method
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
| 1         | 2        | 0.010240     | 0.842975          | 0.824242            |
| 2         | 3        | 0.016024     | 0.842975          | 0.818182            |
| 3         | 4        | 0.022155     | 0.841598          | 0.812121            |
| 4         | 5        | 0.027513     | 0.841598          | 0.812121            |
| 5         | 6        | 0.033767     | 0.841598          | 0.812121            |
| 6         | 7        | 0.038965     | 0.841598          | 0.812121            |
SUCCESS: Optimal solution found.

Create SVM model

In [35]:
simple_svm_classifier = gl.classifier.svm_classifier.create(train_set_1, target="Survived", 
                                                            features=features, validation_set=train_set_2, 

WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
Number of coefficients    : 42
Starting L-BFGS
| Iteration | Passes   | Step size | Elapsed Time | Training-accuracy | Validation-accuracy |
| 1         | 3        | 0.001377  | 0.005733     | 0.793388          | 0.751515            |
| 2         | 5        | 1.000000  | 0.012988     | 0.823691          | 0.751515            |
| 3         | 6        | 1.000000  | 0.016872     | 0.823691          | 0.751515            |
| 4         | 7        | 1.000000  | 0.020784     | 0.754821          | 0.757576            |
| 5         | 9        | 1.000000  | 0.025912     | 0.836088          | 0.818182            |
| 6         | 10       | 1.000000  | 0.029391     | 0.841598          | 0.824242            |
| 11        | 16       | 1.000000  | 0.050618     | 0.851240          | 0.824242            |
| 51        | 66       | 0.500000  | 0.212055     | 0.844353          | 0.824242            |
| 101       | 149      | 0.250000  | 0.430480     | 0.844353          | 0.824242            |
SUCCESS: Optimal solution found.

Create a decision tree model

In [36]:
decision_tree_model = gl.decision_tree_classifier.create(train_set_1, validation_set=train_set_2,
                                                               target="Survived", features=features)

WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Decision tree classifier:
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss | Validation-accuracy | Validation-log_loss |
| 1         | 0.006789     | 0.888430          | 0.543812          | 0.824242            | 0.562930            |

Boosted Tree model

In [37]:
boosted_tree_model = gl.classifier.boosted_trees_classifier.create(train_set_1, validation_set=train_set_2,
                                                                   target="Survived", features=features)

WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Boosted trees classifier:
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss | Validation-accuracy | Validation-log_loss |
| 1         | 0.007663     | 0.888430          | 0.543812          | 0.824242            | 0.562930            |
| 2         | 0.013029     | 0.889807          | 0.455969          | 0.830303            | 0.494265            |
| 3         | 0.020145     | 0.891185          | 0.403053          | 0.836364            | 0.452951            |
| 4         | 0.025106     | 0.900826          | 0.363194          | 0.836364            | 0.434464            |
| 5         | 0.030710     | 0.909091          | 0.327662          | 0.836364            | 0.416233            |
| 6         | 0.036800     | 0.920110          | 0.304839          | 0.836364            | 0.413776            |

Random Forest model

In [38]:
random_forest_model = gl.classifier.random_forest_classifier.create(train_set_1, validation_set=train_set_2,
                                                                    target="Survived", features=features, num_trees=100)

WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Random forest classifier:
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss | Validation-accuracy | Validation-log_loss |
| 1         | 0.006848     | 0.880165          | 0.687526          | 0.836364            | 0.687980            |
| 2         | 0.011477     | 0.882920          | 0.682088          | 0.836364            | 0.683120            |
| 3         | 0.015954     | 0.889807          | 0.676643          | 0.830303            | 0.678081            |
| 4         | 0.022401     | 0.881543          | 0.671554          | 0.836364            | 0.673123            |
| 5         | 0.026616     | 0.884297          | 0.665970          | 0.842424            | 0.668126            |
| 6         | 0.031530     | 0.891185          | 0.660692          | 0.830303            | 0.663256            |
| 11        | 0.053991     | 0.893939          | 0.635446          | 0.818182            | 0.641393            |
| 51        | 0.226360     | 0.898072          | 0.473822          | 0.830303            | 0.503634            |

Cleanning testing data


In [21]:
data_test["male"] = data_test["Sex"] == "male"
data_test["female"] = data_test["Sex"] == "female"
data_test = data_test.remove_column("Sex")

In [22]:
data_test["no_age"] = data_test["Age"] == None
data_test["Age"] = gl.SArray([0 if v == None else v for v in data_test["Age"]])

In [23]:
data_test["embarked_s"] = data_test["Embarked"] == "S"
data_test["embarked_c"] = data_test["Embarked"] == "C"
data_test["embarked_q"] = data_test["Embarked"] == "Q"
data_test["embarked_none"] = data_test["Embarked"] == None
data_test = data_test.remove_column("Embarked")

In [24]:
data_test["1_class"] = data_test["Pclass"] == 1
data_test["2_class"] = data_test["Pclass"] == 2
data_test["3_class"] = data_test["Pclass"] == 3
data_test = data_test.remove_column("Pclass")

In [25]:
data_test["number"] = gl.SArray([toNumber(v) for v in data_test["Ticket"]])
data_test["PC"] = gl.SArray(["PC" in v for v in data_test["Ticket"]])
data_test["CA"] = gl.SArray(["CA" in v for v in data_test["Ticket"]])
data_test["C.A."] = gl.SArray(["C.A." in v for v in data_test["Ticket"]])
data_test["W./C."] = gl.SArray(["W./C." in v for v in data_test["Ticket"]])
data_test["SOTON"] = gl.SArray(["SOTON" in v for v in data_test["Ticket"]])
data_test["number"] = gl.SArray([toNumber(v) for v in data_test["Ticket"]])

In [26]:
for c in civilites_lst:
    data_test[c] = gl.SArray([get_civilite(v) == c for v in data_test["Name"]])

In [27]:
for c in cabin_letters:
    data_test[c] = gl.SArray([cabin_letter(v) == c for v in data_test["Cabin"]])

In [29]:
data_test["cabin_number"] = gl.SArray([cabin_number(v) for v in data_test["Cabin"]])

Making Predictions

In [43]:
data_test["Survived"] = boosted_tree_model.predict(data_test)

In [44]:
submission = gl.SFrame()

In [45]:
submission["PassengerId"] = data_test["PassengerId"]
submission["Survived"] = data_test["Survived"]

In [46]:
submission.save("kaggle.csv", format="csv")

