Titanic: Machine Learning from Disaster

Olivier RISSER-MAROIX (VieVie31)


In [1]:
import graphlab as gl


A newer version of GraphLab Create (v1.8.5) is available! Your current version is v1.8.4.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.

In [2]:
data_train = gl.load_sframe("train.csv")
data_test = gl.load_sframe("test.csv")


2016-03-19 12:56:17,024 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.4 started. Logging: /tmp/graphlab_server_1458388574.log
Finished parsing file /Users/mac/Documents/Programmation/Kaggle/titanic/train.csv
Parsing completed. Parsed 100 lines in 0.033549 secs.
This non-commercial license of GraphLab Create is assigned to orissermaroix@gmail.com and will expire on March 12, 2017. For commercial licensing options, visit https://dato.com/buy/.
------------------------------------------------------
Finished parsing file /Users/mac/Documents/Programmation/Kaggle/titanic/train.csv
Parsing completed. Parsed 891 lines in 0.040443 secs.
Finished parsing file /Users/mac/Documents/Programmation/Kaggle/titanic/test.csv
Parsing completed. Parsed 100 lines in 0.007262 secs.
Inferred types from first line of file as 
column_type_hints=[int,int,int,str,str,float,int,int,str,float,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
------------------------------------------------------
Finished parsing file /Users/mac/Documents/Programmation/Kaggle/titanic/test.csv
Parsing completed. Parsed 418 lines in 0.009007 secs.
Inferred types from first line of file as 
column_type_hints=[int,int,str,str,float,int,int,str,float,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
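
The parser log above suggests passing column_type_hints to read_csv if type inference fails. A minimal sketch of that explicit alternative, reusing the hints printed in the log (SFrame.read_csv is the documented entry point, while load_sframe infers the format from the file extension):

# Hedged sketch: pin the column types instead of relying on inference,
# using the exact hint lists the parser printed above.
data_train = gl.SFrame.read_csv(
    "train.csv",
    column_type_hints=[int, int, int, str, str, float, int, int, str, float, str, str])
data_test = gl.SFrame.read_csv(
    "test.csv",
    column_type_hints=[int, int, str, str, float, int, int, str, float, str, str])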

In [3]:
data_train.head(3)


Out[3]:
+-------------+----------+--------+-------------------------------+--------+------+-------+-------+
| PassengerId | Survived | Pclass |              Name             |  Sex   | Age  | SibSp | Parch |
+-------------+----------+--------+-------------------------------+--------+------+-------+-------+
|      1      |    0     |   3    |    Braund, Mr. Owen Harris    |  male  | 22.0 |   1   |   0   |
|      2      |    1     |   1    | Cumings, Mrs. John Bradley... | female | 38.0 |   1   |   0   |
|      3      |    1     |   3    |     Heikkinen, Miss. Laina    | female | 26.0 |   0   |   0   |
+-------------+----------+--------+-------------------------------+--------+------+-------+-------+
+------------------+---------+-------+----------+
|      Ticket      |   Fare  | Cabin | Embarked |
+------------------+---------+-------+----------+
|    A/5 21171     |   7.25  |       |    S     |
|     PC 17599     | 71.2833 |  C85  |    C     |
| STON/O2. 3101282 |  7.925  |       |    S     |
+------------------+---------+-------+----------+
[3 rows x 12 columns]

Cleaning training data


In [4]:
data_train["male"] = data_train["Sex"] == "male"
data_train["female"] = data_train["Sex"] == "female"
data_train = data_train.remove_column("Sex")

In [5]:
data_train["no_age"] = data_train["Age"] == None
data_train["Age"] = gl.SArray([0 if v == None else v for v in data_train["Age"]])

In [6]:
data_train["embarked_s"] = data_train["Embarked"] == "S"
data_train["embarked_c"] = data_train["Embarked"] == "C"
data_train["embarked_q"] = data_train["Embarked"] == "Q"
data_train["embarked_none"] = data_train["Embarked"] == None
data_train = data_train.remove_column("Embarked")

In [7]:
data_train["1_class"] = data_train["Pclass"] == 1
data_train["2_class"] = data_train["Pclass"] == 2
data_train["3_class"] = data_train["Pclass"] == 3
data_train = data_train.remove_column("Pclass")
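
The three cells above repeat the same dummy-encoding pattern: one 0/1 indicator column per value, then drop the original column. A hedged refactoring sketch (one_hot is a hypothetical helper, not part of GraphLab):

def one_hot(sf, column, values, names):
    # Add one 0/1 indicator column per (value, name) pair, then drop the original.
    for value, name in zip(values, names):
        sf[name] = sf[column] == value
    return sf.remove_column(column)

# e.g. data_train = one_hot(data_train, "Pclass", [1, 2, 3],
#                           ["1_class", "2_class", "3_class"])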

In [8]:
gl.canvas.set_target("ipynb")
# Inspect the remaining raw columns; the SArray repr truncates at 100
# entries, so the loop dumps the ticket strings directly.
print data_train.head(3)
print data_train["Ticket"]
for v in data_train["Ticket"]:
    print v, " ",


+-------------+----------+-------------------------------+------+-------+-------+
| PassengerId | Survived |              Name             | Age  | SibSp | Parch |
+-------------+----------+-------------------------------+------+-------+-------+
|      1      |    0     |    Braund, Mr. Owen Harris    | 22.0 |   1   |   0   |
|      2      |    1     | Cumings, Mrs. John Bradley... | 38.0 |   1   |   0   |
|      3      |    1     |     Heikkinen, Miss. Laina    | 26.0 |   0   |   0   |
+-------------+----------+-------------------------------+------+-------+-------+
+------------------+---------+-------+------+--------+--------+------------+------------+
|      Ticket      |   Fare  | Cabin | male | female | no_age | embarked_s | embarked_c |
+------------------+---------+-------+------+--------+--------+------------+------------+
|    A/5 21171     |   7.25  |       |  1   |   0    |   0    |     1      |     0      |
|     PC 17599     | 71.2833 |  C85  |  0   |   1    |   0    |     0      |     1      |
| STON/O2. 3101282 |  7.925  |       |  0   |   1    |   0    |     1      |     0      |
+------------------+---------+-------+------+--------+--------+------------+------------+
+------------+---------------+---------+---------+---------+
| embarked_q | embarked_none | 1_class | 2_class | 3_class |
+------------+---------------+---------+---------+---------+
|     0      |       0       |    0    |    0    |    1    |
|     0      |       0       |    1    |    0    |    0    |
|     0      |       0       |    0    |    0    |    1    |
+------------+---------------+---------+---------+---------+
[3 rows x 19 columns]

['A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450', '330877', '17463', '349909', '347742', '237736', 'PP 9549', '113783', 'A/5. 2151', '347082', '350406', '248706', '382652', '244373', '345763', '2649', '239865', '248698', '330923', '113788', '349909', '347077', '2631', '19950', '330959', '349216', 'PC 17601', 'PC 17569', '335677', 'C.A. 24579', 'PC 17604', '113789', '2677', 'A./5. 2152', '345764', '2651', '7546', '11668', '349253', 'SC/Paris 2123', '330958', 'S.C./A.4. 23567', '370371', '14311', '2662', '349237', '3101295', 'A/4. 39886', 'PC 17572', '2926', '113509', '19947', 'C.A. 31026', '2697', 'C.A. 34651', 'CA 2144', '2669', '113572', '36973', '347088', 'PC 17605', '2661', 'C.A. 29395', 'S.P. 3464', '3101281', '315151', 'C.A. 33111', 'CA 2144', 'S.O.C. 14879', '2680', '1601', '348123', '349208', '374746', '248738', '364516', '345767', '345779', '330932', '113059', 'SO/C 14885', '3101278', 'W./C. 6608', 'SOTON/OQ 392086', '19950', '343275', '343276', '347466', 'W.E.P. 5734', 'C.A. 2315', '364500', '374910', 'PC 17754', 'PC 17759', '231919', '244367', ... ]
A/5 21171   PC 17599   STON/O2. 3101282   113803   373450   330877   17463   349909   347742   237736   PP 9549   113783   A/5. 2151   347082   350406   248706   ...

In [9]:
# Process the ticket strings: keep a few common alphabetic prefixes as
# indicator columns and strip everything non-numeric to recover the number.
def toNumber(string):
    # Keep only the digit characters; the leading "0" keeps int() safe
    # when the string contains no digits at all (e.g. "LINE").
    s = "0"
    for v in string:
        if v in "0123456789":
            s += v
    return int(s)

data_train["PC"] = gl.SArray(["PC" in v for v in data_train["Ticket"]])
data_train["CA"] = gl.SArray(["CA" in v for v in data_train["Ticket"]])
data_train["C.A."] = gl.SArray(["C.A." in v for v in data_train["Ticket"]])
data_train["W./C."] = gl.SArray(["W./C." in v for v in data_train["Ticket"]])
data_train["SOTON"] = gl.SArray(["SOTON" in v for v in data_train["Ticket"]])
data_train["number"] = gl.SArray([toNumber(v) for v in data_train["Ticket"]])
data_train.head(3)


Out[9]:
+-------------+----------+-------------------------------+------+-------+-------+
| PassengerId | Survived |              Name             | Age  | SibSp | Parch |
+-------------+----------+-------------------------------+------+-------+-------+
|      1      |    0     |    Braund, Mr. Owen Harris    | 22.0 |   1   |   0   |
|      2      |    1     | Cumings, Mrs. John Bradley... | 38.0 |   1   |   0   |
|      3      |    1     |     Heikkinen, Miss. Laina    | 26.0 |   0   |   0   |
+-------------+----------+-------------------------------+------+-------+-------+
+------------------+---------+-------+------+--------+--------+------------+------------+
|      Ticket      |   Fare  | Cabin | male | female | no_age | embarked_s | embarked_c |
+------------------+---------+-------+------+--------+--------+------------+------------+
|    A/5 21171     |   7.25  |       |  1   |   0    |   0    |     1      |     0      |
|     PC 17599     | 71.2833 |  C85  |  0   |   1    |   0    |     0      |     1      |
| STON/O2. 3101282 |  7.925  |       |  0   |   1    |   0    |     1      |     0      |
+------------------+---------+-------+------+--------+--------+------------+------------+
+------------+---------------+---------+---------+---------+----+----+------+-------+-------+----------+
| embarked_q | embarked_none | 1_class | 2_class | 3_class | PC | CA | C.A. | W./C. | SOTON |  number  |
+------------+---------------+---------+---------+---------+----+----+------+-------+-------+----------+
|     0      |       0       |    0    |    0    |    1    | 0  | 0  |  0   |   0   |   0   |  521171  |
|     0      |       0       |    1    |    0    |    0    | 1  | 0  |  0   |   0   |   0   |  17599   |
|     0      |       0       |    0    |    0    |    1    | 0  | 0  |  0   |   0   |   0   | 23101282 |
+------------+---------------+---------+---------+---------+----+----+------+-------+-------+----------+
[3 rows x 25 columns]
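
The number column above exposes toNumber's quirk: digits inside the alphabetic prefix leak into the result. A few illustrative checks against the rows shown (these assertions are editorial sketches, not part of the original run):

assert toNumber("A/5 21171") == 521171           # the "5" of "A/5" leaks in
assert toNumber("PC 17599") == 17599
assert toNumber("STON/O2. 3101282") == 23101282  # the "2" of "O2" leaks in
assert toNumber("LINE") == 0                     # no digits -> the "0" seed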


In [10]:
import re

civilite_pattern = re.compile(r" ([A-Za-z])+.")

def get_civilite(name):
    # "civilite" is French for the honorific embedded in each name,
    # e.g. "Braund, Mr. Owen Harris" -> " Mr." (the leading space is kept).
    try:
        return civilite_pattern.search(name).group(0)
    except AttributeError:  # search() found no match
        return ""

# The pattern's trailing "." is an unescaped wildcard, so stray matches such
# as " Gordon," can occur; keeping only matches that end with a period
# filters them out.
civilites_lst = set([get_civilite(v) if get_civilite(v).endswith('.') else '' for v in data_train["Name"]])
civilites_lst.remove('')
print civilites_lst

for c in civilites_lst:
    data_train[c] = gl.SArray([get_civilite(v) == c for v in data_train["Name"]])

data_train.head(1)


set([' Miss.', ' Rev.', ' Capt.', ' Mlle.', ' Mrs.', ' Master.', ' Col.', ' Jonkheer.', ' Mr.', ' Ms.', ' Mme.', ' Major.', ' Dr.', ' Don.'])
Out[10]:
+-------------+----------+-------------------------+------+-------+-------+-----------+------+-------+------+--------+--------+
| PassengerId | Survived |           Name          | Age  | SibSp | Parch |   Ticket  | Fare | Cabin | male | female | no_age |
+-------------+----------+-------------------------+------+-------+-------+-----------+------+-------+------+--------+--------+
|      1      |    0     | Braund, Mr. Owen Harris | 22.0 |   1   |   0   | A/5 21171 | 7.25 |       |  1   |   0    |   0    |
+-------------+----------+-------------------------+------+-------+-------+-----------+------+-------+------+--------+--------+
+------------+------------+------------+---------------+---------+---------+---------+----+----+------+-------+-------+
| embarked_s | embarked_c | embarked_q | embarked_none | 1_class | 2_class | 3_class | PC | CA | C.A. | W./C. | SOTON |
+------------+------------+------------+---------------+---------+---------+---------+----+----+------+-------+-------+
|     1      |     0      |     0      |       0       |    0    |    0    |    1    | 0  | 0  |  0   |   0   |   0   |
+------------+------------+------------+---------------+---------+---------+---------+----+----+------+-------+-------+
+--------+-------+------+-------+-------+------+---------+------+-----------+-----+-----+------+--------+-----+------+
| number | Miss. | Rev. | Capt. | Mlle. | Mrs. | Master. | Col. | Jonkheer. | Mr. | Ms. | Mme. | Major. | Dr. | Don. |
+--------+-------+------+-------+-------+------+---------+------+-----------+-----+-----+------+--------+-----+------+
| 521171 |   0   |  0   |   0   |   0   |  0   |    0    |  0   |     0     |  1  |  0  |  0   |   0    |  0  |  0   |
+--------+-------+------+-------+-------+------+---------+------+-----------+-----+-----+------+--------+-----+------+
[1 rows x 39 columns]
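
To make the extraction concrete, two hedged examples (the leading space is part of the match, which is why the new column names such as ' Mr.' start with a space):

print get_civilite("Braund, Mr. Owen Harris")  # ' Mr.'
print get_civilite("Heikkinen, Miss. Laina")   # ' Miss.'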

In [11]:
def cabin_letter(cabin):
    # The first character of the cabin code is the deck letter ("C85" -> "C");
    # missing cabins are empty strings, hence the IndexError guard.
    try:
        return cabin[0]
    except IndexError:
        return ""

cabin_letters = set([cabin_letter(v) for v in data_train["Cabin"]])
cabin_letters.remove('')
print cabin_letters

for c in cabin_letters:
    data_train[c] = gl.SArray([cabin_letter(v) == c for v in data_train["Cabin"]])

data_train.column_names()


set(['A', 'C', 'B', 'E', 'D', 'G', 'F', 'T'])
Out[11]:
['PassengerId',
 'Survived',
 'Name',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'male',
 'female',
 'no_age',
 'embarked_s',
 'embarked_c',
 'embarked_q',
 'embarked_none',
 '1_class',
 '2_class',
 '3_class',
 'PC',
 'CA',
 'C.A.',
 'W./C.',
 'SOTON',
 'number',
 ' Miss.',
 ' Rev.',
 ' Capt.',
 ' Mlle.',
 ' Mrs.',
 ' Master.',
 ' Col.',
 ' Jonkheer.',
 ' Mr.',
 ' Ms.',
 ' Mme.',
 ' Major.',
 ' Dr.',
 ' Don.',
 'A',
 'C',
 'B',
 'E',
 'D',
 'G',
 'F',
 'T']

In [12]:
def cabin_number(cabin):
    # Reuse toNumber: strip the deck letter and keep the digits
    # ("C85" -> 85, "" -> 0).
    return toNumber(cabin)

data_train["cabin_number"] = gl.SArray([cabin_number(v) for v in data_train["Cabin"]])

In [33]:
train_set_1, train_set_2 = data_train.random_split(.8)
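
Without a seed, random_split produces a different partition on every run; for reproducible experiments GraphLab's random_split accepts a seed (a hedged variant with an arbitrary seed value):

train_set_1, train_set_2 = data_train.random_split(.8, seed=42)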

In [14]:
print train_set_1.head(1)
features = ["Age", "SibSp", "Parch", "Fare", "male", "female", "no_age", 
            "embarked_s", "embarked_c", "embarked_q", "embarked_none",
            "1_class", "2_class", "3_class", 
            "CA", "C.A.", "W./C.", "SOTON", 
            "cabin_number"] + list(civilites_lst) + list(cabin_letters) #, "number"]


+-------------+----------+-------------------------+------+-------+-------+
| PassengerId | Survived |           Name          | Age  | SibSp | Parch |
+-------------+----------+-------------------------+------+-------+-------+
|      1      |    0     | Braund, Mr. Owen Harris | 22.0 |   1   |   0   |
+-------------+----------+-------------------------+------+-------+-------+
+-----------+------+-------+------+--------+--------+------------+------------+
|   Ticket  | Fare | Cabin | male | female | no_age | embarked_s | embarked_c |
+-----------+------+-------+------+--------+--------+------------+------------+
| A/5 21171 | 7.25 |       |  1   |   0    |   0    |     1      |     0      |
+-----------+------+-------+------+--------+--------+------------+------------+
+------------+---------------+---------+---------+---------+----+-----+
| embarked_q | embarked_none | 1_class | 2_class | 3_class | PC | ... |
+------------+---------------+---------+---------+---------+----+-----+
|     0      |       0       |    0    |    0    |    1    | 0  | ... |
+------------+---------------+---------+---------+---------+----+-----+
[1 rows x 48 columns]

Create a logistic regression model


In [15]:
#help(gl.classifier.logistic_classifier.create)

In [34]:
simple_logistic_classifier = gl.classifier.logistic_classifier.create(train_set_1, target="Survived", 
                                                                      features=features, validation_set=train_set_2)


WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Logistic regression:
--------------------------------------------------------
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
Number of coefficients    : 42
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+--------------+-------------------+---------------------+
| 1         | 2        | 0.010240     | 0.842975          | 0.824242            |
| 2         | 3        | 0.016024     | 0.842975          | 0.818182            |
| 3         | 4        | 0.022155     | 0.841598          | 0.812121            |
| 4         | 5        | 0.027513     | 0.841598          | 0.812121            |
| 5         | 6        | 0.033767     | 0.841598          | 0.812121            |
| 6         | 7        | 0.038965     | 0.841598          | 0.812121            |
+-----------+----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.
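
The warning flags embarked_none as nearly constant (almost every passenger has a known embarkation port). One way to follow the tool's advice, sketched here rather than taken from the original run:

features_without_constant = [f for f in features if f != "embarked_none"]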

Create an SVM model


In [35]:
simple_svm_classifier = gl.classifier.svm_classifier.create(train_set_1, target="Survived", 
                                                            features=features, validation_set=train_set_2, 
                                                            max_iterations=1000)


WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
SVM:
--------------------------------------------------------
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
Number of coefficients    : 42
Starting L-BFGS
--------------------------------------------------------
+-----------+----------+-----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Step size | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+-----------+--------------+-------------------+---------------------+
| 1         | 3        | 0.001377  | 0.005733     | 0.793388          | 0.751515            |
| 2         | 5        | 1.000000  | 0.012988     | 0.823691          | 0.751515            |
| 3         | 6        | 1.000000  | 0.016872     | 0.823691          | 0.751515            |
| 4         | 7        | 1.000000  | 0.020784     | 0.754821          | 0.757576            |
| 5         | 9        | 1.000000  | 0.025912     | 0.836088          | 0.818182            |
| 6         | 10       | 1.000000  | 0.029391     | 0.841598          | 0.824242            |
| 11        | 16       | 1.000000  | 0.050618     | 0.851240          | 0.824242            |
| 51        | 66       | 0.500000  | 0.212055     | 0.844353          | 0.824242            |
| 101       | 149      | 0.250000  | 0.430480     | 0.844353          | 0.824242            |
+-----------+----------+-----------+--------------+-------------------+---------------------+
SUCCESS: Optimal solution found.

Create a decision tree model


In [36]:
decision_tree_model = gl.decision_tree_classifier.create(train_set_1, validation_set=train_set_2,
                                                               target="Survived", features=features)


WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Decision tree classifier:
--------------------------------------------------------
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
+-----------+--------------+-------------------+-------------------+---------------------+---------------------+
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss | Validation-accuracy | Validation-log_loss |
+-----------+--------------+-------------------+-------------------+---------------------+---------------------+
| 1         | 0.006789     | 0.888430          | 0.543812          | 0.824242            | 0.562930            |
+-----------+--------------+-------------------+-------------------+---------------------+---------------------+

Create a boosted trees model


In [37]:
boosted_tree_model = gl.classifier.boosted_trees_classifier.create(train_set_1, validation_set=train_set_2,
                                                                   target="Survived", features=features)


WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Boosted trees classifier:
--------------------------------------------------------
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
+-----------+--------------+-------------------+-------------------+---------------------+---------------------+
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss | Validation-accuracy | Validation-log_loss |
+-----------+--------------+-------------------+-------------------+---------------------+---------------------+
| 1         | 0.007663     | 0.888430          | 0.543812          | 0.824242            | 0.562930            |
| 2         | 0.013029     | 0.889807          | 0.455969          | 0.830303            | 0.494265            |
| 3         | 0.020145     | 0.891185          | 0.403053          | 0.836364            | 0.452951            |
| 4         | 0.025106     | 0.900826          | 0.363194          | 0.836364            | 0.434464            |
| 5         | 0.030710     | 0.909091          | 0.327662          | 0.836364            | 0.416233            |
| 6         | 0.036800     | 0.920110          | 0.304839          | 0.836364            | 0.413776            |
+-----------+--------------+-------------------+-------------------+---------------------+---------------------+

Create a random forest model


In [38]:
random_forest_model = gl.classifier.random_forest_classifier.create(train_set_1, validation_set=train_set_2,
                                                                    target="Survived", features=features, num_trees=100)


WARNING: Detected extremely low variance for feature(s) 'embarked_none' because all entries are nearly the same.
Proceeding with model training using all features. If the model does not provide results of adequate quality, exclude the above mentioned feature(s) from the input dataset.
Random forest classifier:
--------------------------------------------------------
Number of examples          : 726
Number of classes           : 2
Number of feature columns   : 41
Number of unpacked features : 41
+-----------+--------------+-------------------+-------------------+---------------------+---------------------+
| Iteration | Elapsed Time | Training-accuracy | Training-log_loss | Validation-accuracy | Validation-log_loss |
+-----------+--------------+-------------------+-------------------+---------------------+---------------------+
| 1         | 0.006848     | 0.880165          | 0.687526          | 0.836364            | 0.687980            |
| 2         | 0.011477     | 0.882920          | 0.682088          | 0.836364            | 0.683120            |
| 3         | 0.015954     | 0.889807          | 0.676643          | 0.830303            | 0.678081            |
| 4         | 0.022401     | 0.881543          | 0.671554          | 0.836364            | 0.673123            |
| 5         | 0.026616     | 0.884297          | 0.665970          | 0.842424            | 0.668126            |
| 6         | 0.031530     | 0.891185          | 0.660692          | 0.830303            | 0.663256            |
| 11        | 0.053991     | 0.893939          | 0.635446          | 0.818182            | 0.641393            |
| 51        | 0.226360     | 0.898072          | 0.473822          | 0.830303            | 0.503634            |
+-----------+--------------+-------------------+-------------------+---------------------+---------------------+
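
Before choosing a model for the submission, the validation scores above can be compared in one place. A minimal sketch using evaluate(), which GraphLab classifiers expose (the loop itself is not part of the original run):

for name, model in [("logistic", simple_logistic_classifier),
                    ("svm", simple_svm_classifier),
                    ("decision tree", decision_tree_model),
                    ("boosted trees", boosted_tree_model),
                    ("random forest", random_forest_model)]:
    print name, model.evaluate(train_set_2)["accuracy"]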

Cleaning testing data

DO NOT FORGET ANY CLEANING OPERATION THAT WAS APPLIED TO THE TRAINING DATA AND USED BY THE CLASSIFIER!
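
That warning is the motivation for a refactoring sketch: wrapping every transformation in a single function applied to both SFrames makes it impossible to forget a step (clean is hypothetical; the cells below keep the original copy-pasted form):

def clean(sf):
    # Mirror every feature-engineering step from the training section.
    sf["male"] = sf["Sex"] == "male"
    sf["female"] = sf["Sex"] == "female"
    sf = sf.remove_column("Sex")
    # ... repeat the Age, Embarked, Pclass, Ticket, title and cabin steps ...
    return sf

# data_train = clean(data_train); data_test = clean(data_test)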


In [21]:
data_test["male"] = data_test["Sex"] == "male"
data_test["female"] = data_test["Sex"] == "female"
data_test = data_test.remove_column("Sex")

In [22]:
data_test["no_age"] = data_test["Age"] == None
data_test["Age"] = gl.SArray([0 if v == None else v for v in data_test["Age"]])

In [23]:
data_test["embarked_s"] = data_test["Embarked"] == "S"
data_test["embarked_c"] = data_test["Embarked"] == "C"
data_test["embarked_q"] = data_test["Embarked"] == "Q"
data_test["embarked_none"] = data_test["Embarked"] == None
data_test = data_test.remove_column("Embarked")

In [24]:
data_test["1_class"] = data_test["Pclass"] == 1
data_test["2_class"] = data_test["Pclass"] == 2
data_test["3_class"] = data_test["Pclass"] == 3
data_test = data_test.remove_column("Pclass")

In [25]:
data_test["number"] = gl.SArray([toNumber(v) for v in data_test["Ticket"]])
data_test["PC"] = gl.SArray(["PC" in v for v in data_test["Ticket"]])
data_test["CA"] = gl.SArray(["CA" in v for v in data_test["Ticket"]])
data_test["C.A."] = gl.SArray(["C.A." in v for v in data_test["Ticket"]])
data_test["W./C."] = gl.SArray(["W./C." in v for v in data_test["Ticket"]])
data_test["SOTON"] = gl.SArray(["SOTON" in v for v in data_test["Ticket"]])
data_test["number"] = gl.SArray([toNumber(v) for v in data_test["Ticket"]])

In [26]:
for c in civilites_lst:
    data_test[c] = gl.SArray([get_civilite(v) == c for v in data_test["Name"]])

In [27]:
for c in cabin_letters:
    data_test[c] = gl.SArray([cabin_letter(v) == c for v in data_test["Cabin"]])

In [29]:
data_test["cabin_number"] = gl.SArray([cabin_number(v) for v in data_test["Cabin"]])

Making Predictions


In [43]:
data_test["Survived"] = boosted_tree_model.predict(data_test)
#random_forest_model.predict(data_test)
#boosted_tree_model.predict(data_test)
#decision_tree_model.predict(data_test)
#simple_svm_classifier.predict(data_test) 
#simple_logistic_classifier.predict(data_test)

In [44]:
submission = gl.SFrame()

In [45]:
submission["PassengerId"] = data_test["PassengerId"]
submission["Survived"] = data_test["Survived"]

In [46]:
submission.save("kaggle.csv", format="csv")
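
A quick hedged check before uploading (the test file parsed 418 rows, so the submission should have 418 as well):

print submission.head(3)
print submission.num_rows()  # expect 418, one row per test passenger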

In [85]:
#data_train.show()

In [86]:
#data_train.head(10)
