We import the original train.csv and test.csv files and use PassengerID as the index column.
The clean_data function then performs the following:
The clean_data function: drops the Name, Ticket and Cabin columns, which we are currently not using; adjusts the Fare column to indicate difference from the median fare paid by class; cleans the Age column; and cleans the SibSp and Parch columns. The cleaned data is saved to cl_train.csv and cl_test.csv.
In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Load the cleaned training data, indexed by passenger id.
train = pd.read_csv('cl_train.csv', index_col='PassengerId')

# One-hot encode the categorical columns.
train = pd.get_dummies(train, columns=['Sex', 'Pclass', 'Embarked'])

# Hold out a cross-validation split.
y = train['Survived']
X = train.drop('Survived', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=53)

# Standardize features; the scaler is fit on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Degree-3 polynomial features feeding an L2-regularized logistic model.
pipeline = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=3, include_bias=True)),
    ('logistic_regression', LogisticRegression(C=0.005)),
])

# Fit, then report accuracy on the train and hold-out splits.
pipeline.fit(X_train, y_train)
print('Logistic Regression Train Score: %s' % pipeline.score(X_train, y_train))
print('Logistic Regression CV Score: %s' % pipeline.score(X_test, y_test))
In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load the cleaned training data, indexed by passenger id.
train = pd.read_csv('cl_train.csv', index_col='PassengerId')

# Impute missing 'Embarked' values with 'S' (most common).
# Plain assignment instead of fillna(..., inplace=True): inplace on a column
# selection is the chained-assignment pattern deprecated in pandas 2.x.
train['Embarked'] = train['Embarked'].fillna('S')

# Encode categorical variables as integer codes.
le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex'])
train['Embarked'] = le.fit_transform(train['Embarked'])

# Create a cross-validation split.
X = train.drop('Survived', axis=1)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=134)

# Random forest: 300 trees, depth capped at 6.
clf = RandomForestClassifier(n_estimators=300, max_depth=6)

# Fit, then report accuracy and per-feature importances.
clf.fit(X_train, y_train)
print('Random Forest Train Score: %s' % clf.score(X_train, y_train))
print('Random Forest CV Score: %s' % clf.score(X_test, y_test))
print('Feature Importance:\n%s' % pd.Series(clf.feature_importances_,
                                            index=X_train.columns))
In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pandas as pd

# Load the cleaned training data, indexed by passenger id.
train = pd.read_csv('cl_train.csv', index_col='PassengerId')

# One-hot encode the categorical columns.
train = pd.get_dummies(train, columns=['Sex', 'Pclass', 'Embarked'])

# Hold out a cross-validation split.
y = train['Survived']
X = train.drop('Survived', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=116)

# Standardize features; the scaler is fit on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# RBF support vector machine with C=5.
clf = SVC(C=5, gamma='auto')

# Fit, then report accuracy on the train and hold-out splits.
clf.fit(X_train, y_train)
print('SVC Train Score: %s' % clf.score(X_train, y_train))
print('SVC CV Score: %s' % clf.score(X_test, y_test))
The model is trained on cl_train.csv and predictions are generated for cl_test.csv. Hyperparameter format: degree / C.
In [254]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Load the cleaned training and test data, indexed by passenger id.
train = pd.read_csv('cl_train.csv', index_col='PassengerId')
test = pd.read_csv('cl_test.csv', index_col='PassengerId')

# Create training set X and y.
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

# Combine X train and test so one-hot encoding yields identical columns
# for both (a category absent from one set would otherwise misalign them).
tr_len = len(X_train)
df = pd.concat(objs=[X_train, test], axis=0)

# Create dummy variables on the combined train/test frame.
df = pd.get_dummies(df, columns=['Sex', 'Pclass', 'Embarked'])

# Split back into X train and test by row position (explicit .iloc).
X_train = df.iloc[:tr_len]
test = df.iloc[tr_len:]

# Feature scaling: fit on the training rows only, then apply to test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(test)

# L2 logistic polynomial regression, degree 3, C = 0.005.
# (Previous comment claimed C = 1, contradicting the code below.)
polynomial_features = PolynomialFeatures(degree=3, include_bias=True)
logistic_regression = LogisticRegression(C=0.005)
pipeline = Pipeline([('polynomial_features', polynomial_features),
                     ('logistic_regression', logistic_regression)])

# Fit and predict.
pipeline.fit(X_train, y_train)
prediction = pipeline.predict(X_test)

# Save survival predictions to a CSV file.
predicted = np.column_stack((test.index.values, prediction))
np.savetxt("pr_logistic.csv", predicted.astype(int), fmt='%d', delimiter=",",
           header="PassengerId,Survived", comments='')
The model is trained on cl_train.csv and predictions are generated for cl_test.csv. Hyperparameter format: n_estimators / max_depth.
In [234]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Load the cleaned training and test data, indexed by passenger id.
train = pd.read_csv('cl_train.csv', index_col='PassengerId')
test = pd.read_csv('cl_test.csv', index_col='PassengerId')

# Create training set X and y.
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

# Combine X train and test so the label encoders see every category.
tr_len = len(X_train)
df = pd.concat(objs=[X_train, test], axis=0)

# Impute missing 'Embarked' values with 'S' (most common).
# Plain assignment instead of fillna(..., inplace=True): inplace on a column
# selection is the chained-assignment pattern deprecated in pandas 2.x.
df['Embarked'] = df['Embarked'].fillna('S')

# Encode categorical variables as integer codes.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Embarked'] = le.fit_transform(df['Embarked'])

# Split back into X train and test by row position (explicit .iloc).
X_train = df.iloc[:tr_len]
test = df.iloc[tr_len:]

# Random forest: 300 trees, depth capped at 6.
# (Previous comment claimed 200 estimators / max depth 10, contradicting the code.)
clf = RandomForestClassifier(n_estimators=300, max_depth=6)

# Fit and predict.
clf.fit(X_train, y_train)
prediction = clf.predict(test)

# Save survival predictions to a CSV file.
predicted = np.column_stack((test.index.values, prediction))
np.savetxt("pr_forest.csv", predicted.astype(int), fmt='%d', delimiter=",",
           header="PassengerId,Survived", comments='')
The model is trained on cl_train.csv and predictions are generated for cl_test.csv. Hyperparameter format: gamma / C.
In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pandas as pd
import numpy as np

# Load the cleaned training and test data, indexed by passenger id.
train = pd.read_csv('cl_train.csv', index_col='PassengerId')
test = pd.read_csv('cl_test.csv', index_col='PassengerId')

# Separate features from the target.
y_train = train['Survived']
X_train = train.drop('Survived', axis=1)

# Stack train and test rows so one-hot encoding produces matching columns.
tr_len = len(X_train)
df = pd.concat(objs=[X_train, test], axis=0)
df = pd.get_dummies(df, columns=['Sex', 'Pclass', 'Embarked'])

# Undo the stacking: first tr_len rows are train, the remainder is test.
X_train = df[:tr_len]
test = df[tr_len:]

# Standardize features; the scaler is fit on the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(test)

# RBF support vector machine with C=3.
clf = SVC(C=3, gamma='auto')
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)

# Write PassengerId,Survived rows out as the submission CSV.
predicted = np.column_stack((test.index.values, prediction))
np.savetxt("pr_SVM.csv", predicted.astype(int), fmt='%d', delimiter=",",
           header="PassengerId,Survived", comments='')