In [1]:
source("C:\\Work\\myRfunctions.R")
fnRunDate()
fnInstallPackages()
In [2]:
# titanic_train and titanic_test come from the titanic package
train <- as_tibble(titanic_train)
test <- as_tibble(titanic_test)
dim(train)
dim(test)
In [3]:
glimpse(train)
In [4]:
fnClassDistribution(Class = train$Survived)
In [5]:
# Combine test and train, tagging each row with its source so we can split them back out later
dataset <- dplyr::full_join(
  dplyr::mutate(train, d_set = "train"),
  dplyr::mutate(test, d_set = "test")
)
In [6]:
glimpse(dataset)
In [7]:
# change blanks to NAs
dataset[dataset == ""] <- NA
In [8]:
Hmisc::describe(dataset, listunique=1)
In [9]:
psych::describe(dataset, check = T)
In [10]:
VIM::aggr(dataset, prop = FALSE, combined = TRUE, numbers = TRUE, sortVars = TRUE, sortCombs = TRUE)
In [11]:
# summarize correlations between input variables
options(warn=-1)
PerformanceAnalytics::chart.Correlation(dplyr::select_if(dataset, is.numeric), histogram=TRUE, pch=".")
In [12]:
tabplot::tableplot(dataset, sortCol = Survived)
In [11]:
# Cross-tabulate Sex against Survived
gmodels::CrossTable(
  x = dataset$Sex,      # categorical or continuous
  y = dataset$Survived  # needs to be categorical
)
In [16]:
# Feature Engineering
# Extract the title from Name (the text between ", " and the first ".") into a new variable
dataset <- dplyr::mutate(
  dataset,
  Title = stringr::str_sub(
    Name,
    stringr::str_locate(Name, ",")[, 1] + 2,
    stringr::str_locate(Name, "\\.")[, 1] - 1
  )
)
fnClassDistribution(Class = dataset$Title)
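The same extraction can be written as a single regex instead of two str_locate calls; a minimal sketch, assuming stringr's default ICU engine (which supports lookbehind):
In [ ]:
# Hypothetical alternative: capture everything between ", " and the first "."
dataset <- dplyr::mutate(dataset, Title = stringr::str_extract(Name, "(?<=, )[^.]+"))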
In [ ]:
# Flag likely mothers: a female honorific plus at least one parent/child aboard
dataset <- dataset %>%
  mutate(Mother = ifelse(Title %in% c("Mrs", "Mme", "the Countess", "Dona", "Lady") &
                           Parch > 0, "Yes", "No"))
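A quick tabulation to confirm the new flag (a sketch):
In [ ]:
table(dataset$Mother, useNA = "ifany")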
In [14]:
# Count the number of characters in Cabin and Name
dataset$Cabin_nchar <- as.numeric(nchar(dataset$Cabin))
dataset$Name_nchar <- as.numeric(nchar(dataset$Name))
In [15]:
# Is the missingness in Cabin itself informative? Flag it as a feature.
dataset$Cabin_isna <- as.numeric(is.na(dataset$Cabin))
head(dataset$Cabin_isna)
In [16]:
# Create character versions of a few variables that could be used as character or integer
dataset$Pclass_c <- as.character(dataset$Pclass)
dataset$SibSp_c <- as.character(dataset$SibSp)
dataset$Parch_c <- as.character(dataset$Parch)
ggplot(data = dataset,
       aes(x = Age,                 # can be continuous or categorical; is supposed to be continuous
           y = as.factor(Title))) + # needs to be discrete
  geom_joy(scale = .8, na.rm = TRUE) +
  theme_joy(center_axis_labels = TRUE)
In [17]:
# Create dummy variables (fullRank = TRUE drops one reference level per factor)
dmy <- caret::dummyVars(
  " ~ Title + Sex + Embarked + Pclass_c + SibSp_c + Parch_c + Pclass:Sex + Embarked:Sex + Embarked:Pclass_c + Pclass:Cabin_isna",
  data = dataset, fullRank = TRUE
)
dataset_d <- data.frame(predict(dmy, newdata = dataset))
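A quick look at what dummyVars produced (a sketch):
In [ ]:
dim(dataset_d)         # rows should equal nrow(dataset)
head(names(dataset_d)) # a few of the generated dummy/interaction column names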
In [18]:
# Recombine the dummy columns with the main data
dataset <- cbind(dataset, dataset_d)
In [19]:
# Split the train and test sets back out
sktrain <- dplyr::filter(dataset, d_set == "train")
sktest <- dplyr::filter(dataset, d_set == "test")
# Drop the sktest target column, which is full of NAs
sktest$Survived <- NULL
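A quick check that the split recovered the original shapes (a sketch):
In [ ]:
nrow(sktrain) # should equal nrow(train)
nrow(sktest)  # should equal nrow(test)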
In [20]:
# Keep only the numeric columns for modelling
sktrain <- dplyr::select_if(sktrain, is.numeric)
sktest <- dplyr::select_if(sktest, is.numeric)
In [21]:
# Remove the id field
sktrain <- dplyr::select(sktrain, -dplyr::one_of("PassengerId"))
# Prefix the target levels so they are syntactically valid names (caret requires this when classProbs = TRUE)
sktrain$Survived <- factor(paste0("class", sktrain$Survived))
dplyr::glimpse(sktrain)
In [22]:
# Split out a validation dataset:
# createDataPartition picks 67% of the rows, stratified by class
validation_index <- createDataPartition(sktrain$Survived, p = 0.67, list = FALSE)
# hold out the remaining 33% for validation
validation <- sktrain[-validation_index, ]
# use the selected 67% for training and testing the models
sktrainval <- sktrain[validation_index, ]
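createDataPartition samples within each class, so the class balance should be roughly the same in both partitions; a quick check (a sketch):
In [ ]:
prop.table(table(sktrainval$Survived)) # class proportions in the modelling partition
prop.table(table(validation$Survived)) # should be close to the proportions above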
In [23]:
# 10-fold cross-validation with 3 repeats.
# Up-sampling is applied within each resample (alternatives: "down", "rose", "smote").
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        sampling = "up",
                        summaryFunction = twoClassSummary,
                        classProbs = TRUE)
metric <- "ROC"
In [24]:
# RF
set.seed(7)
#fit.cforest <- train(Survived~., data = sktrain, method = "rf", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# C5.0
set.seed(7)
fit.c5 <- train(Survived~., data = sktrainval, method = "C5.0", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# GLMNET
set.seed(7)
fit.glmnet <- train(Survived~., data = sktrainval, method = "glmnet", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# KNN
set.seed(7)
#fit.knn <- train(Survived~., data = sktrain, method = "knn", preProcess = c('zv', "center", "scale",'medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
# SVM
set.seed(7)
#fit.svm <- train(Survived~., data = sktrain, method = "svmRadial", preProcess = c('zv','medianImpute','BoxCox'), metric = metric, trControl = control, na.action = na.pass)
#xgb
set.seed(9)
fit.xgb <- train(Survived~., data = sktrainval, method = "xgbTree", preProcess =c('medianImpute','BoxCox','zv'), metric = metric, trControl = control, na.action = na.pass)
In [25]:
# Compare results
results <- resamples(list(
#RF = fit.cforest,
C5 = fit.c5,
GLMNET = fit.glmnet,
#KNN = fit.knn,
xgb = fit.xgb
#SVM = fit.svm
))
summary(results)
bwplot(results)
dotplot(results)
varImp(fit.glmnet)
varImp(fit.c5)
varImp(fit.xgb)
#varImp(fit.cforest)
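The resampled metrics can also be compared formally: caret's diff() runs paired tests across the resamples, which helps judge whether the gaps above are real or noise. A minimal sketch:
In [ ]:
summary(diff(results)) # paired comparisons of resampled ROC/Sens/Spec between models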
In [26]:
# Test predictive power on the unseen validation set
validation$prediction <- predict(fit.c5, newdata = validation, na.action = na.pass)
# Check the accuracy of the C5.0 model
confusionMatrix(validation$prediction, validation$Survived, positive = "class1") # convention: the positive class is the rarer one
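Since the models were tuned on ROC, it is worth checking the validation AUC directly as well. A minimal sketch, assuming the pROC package is installed:
In [ ]:
probs <- predict(fit.c5, newdata = validation, type = "prob", na.action = na.pass)
roc_obj <- pROC::roc(response = validation$Survived, predictor = probs$class1,
                     levels = c("class0", "class1"))
pROC::auc(roc_obj)
plot(roc_obj)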
In [27]:
# Set up k-fold cross-validation and the metric, this time with a random hyperparameter search
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        sampling = "up",
                        search = "random",
                        summaryFunction = twoClassSummary,
                        classProbs = TRUE)
metric <- "ROC"
set.seed(13)
fit.c5s <- train(Survived ~ ., data = sktrainval, method = "C5.0",
                 preProcess = c('zv', 'medianImpute', 'BoxCox'),
                 metric = metric,
                 trControl = control,
                 na.action = na.pass,
                 tuneLength = 20) # with search = "random", evaluate 20 random parameter combinations
In [28]:
fit.c5s
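caret can also plot the random-search results to show which C5.0 parameter combinations scored best; a quick sketch:
In [ ]:
ggplot(fit.c5s) # ROC for each sampled combination of tuning parameters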
In [34]:
# Test predictive power on the unseen validation set
validation$prediction <- predict(fit.c5s, newdata = validation, na.action = na.pass)
# Check the accuracy of the tuned C5.0 model
confusionMatrix(validation$prediction, validation$Survived, positive = "class1") # convention: the positive class is the rarer one
In [30]:
# Build the final model on the full training data
# 10-fold cross-validation with 3 repeats
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                        sampling = "up", # up down rose smote
                        summaryFunction = twoClassSummary,
                        classProbs = TRUE)
metric <- "ROC"
set.seed(7)
fit.c5fin <- train(Survived ~ ., data = sktrain, method = "C5.0",
                   preProcess = c('zv', 'medianImpute', 'BoxCox'),
                   metric = metric, trControl = control, na.action = na.pass)
In [31]:
sktest$prediction <- predict(fit.c5fin, newdata = sktest, na.action = na.pass)
glimpse(sktest)
nrow(data.frame(sktest))
In [32]:
# Convert the predicted class labels back to 0/1 for the submission file
sktest$prediction <- ifelse(sktest$prediction == 'class0', 0, 1)
In [33]:
# Build the submission file: PassengerId plus the 0/1 prediction
my_solution <- dplyr::select(sktest,
                             PassengerId,
                             Survived = prediction)
readr::write_csv(x = data.frame(my_solution), path = "C:\\Work\\my_solution.csv")
head(my_solution, n = 20)
tail(my_solution, n = 20)
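A quick sanity check before uploading (a sketch; Kaggle expects one prediction per test passenger):
In [ ]:
stopifnot(nrow(my_solution) == nrow(test)) # one row per test passenger
stopifnot(!anyNA(my_solution$Survived))    # no missing predictions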
Right now this works better when trained on less data: when I split out a validation set and train on the remainder, the final submission scores better than a model trained on the full training data. That may mean the full-data model overfits, or it may just be noise in the leaderboard split.