First the data is loaded into Pandas data frames


In [1]:
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Read the input datasets
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')

# Fill missing numeric values with median for that column
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)

print(train_data.info())
print(test_data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None

Next select a subset of our train_data to use for training the model


In [2]:
# Encode sex as int 0=female, 1=male
train_data['Sex'] = train_data['Sex'].apply(lambda x: int(x == 'male'))

# Extract the features we want to use
X = train_data[['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch']].as_matrix()
print(np.shape(X))

# Extract survival target
y = train_data[['Survived']].values.ravel()
print(np.shape(y))


(891, 6)
(891,)

Now train the SVM classifier and get validation accuracy using K-Folds cross validation


In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler

# Build the classifier
kf = KFold(n_splits=3)
model = SVC(kernel='rbf', C=300)

scores = []
for train, test in kf.split(X):
    # Normalize training and test data using train data norm parameters
    normalizer = MinMaxScaler().fit(X[train])
    X_train = normalizer.transform(X[train])
    X_test = normalizer.transform(X[test])
    
    scores.append(model.fit(X_train, y[train]).score(X_test, y[test]))
    
print("Mean 3-fold cross validation accuracy: %s" % np.mean(scores))


Mean 3-fold cross validation accuracy: 0.822671156004

Make predictions on the test data and output the results


In [4]:
# Create model with all training data
normalizer = MinMaxScaler().fit(X)
X = normalizer.transform(X)
classifier = model.fit(X, y)

# Encode sex as int 0=female, 1=male
test_data['Sex'] = test_data['Sex'].apply(lambda x: int(x == 'male'))

# Extract desired features
X_ = test_data[['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch']].as_matrix()
X_ = normalizer.transform(X_)

# Predict if passengers survived using model
y_ = classifier.predict(X_)

# Append the survived attribute to the test data
test_data['Survived'] = y_
predictions = test_data[['PassengerId', 'Survived']]
print(predictions)

# Save the output for submission
predictions.to_csv('submission.csv', index=False)


     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
5            897         0
6            898         1
7            899         0
8            900         1
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         0
18           910         1
19           911         1
20           912         0
21           913         0
22           914         1
23           915         0
24           916         1
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         0
391         1283         1
392         1284         0
393         1285         0
394         1286         0
395         1287         1
396         1288         0
397         1289         1
398         1290         0
399         1291         0
400         1292         1
401         1293         0
402         1294         1
403         1295         0
404         1296         0
405         1297         0
406         1298         0
407         1299         0
408         1300         1
409         1301         1
410         1302         1
411         1303         1
412         1304         1
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]