In [13]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score
In [2]:
# Read the train and test CSVs (paths are relative to the notebook location).
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Stack both splits into a single frame. pd.concat is used instead of
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0;
# the result is the same (original row labels are kept).
all_df = pd.concat([train_df, test_df])

# Rows whose Survived value is missing are flagged as test rows
# (presumably test.csv has no Survived column -- confirm against the data).
all_df['is_test'] = all_df.Survived.isnull()

# Move the target out of the columns and into the index, so it cannot be
# picked up as a feature; get_X_y reads the labels back from the index.
all_df.index = all_df.Survived
del all_df['Survived']
In [3]:
# Preview the first rows of all_df.
all_df.head()
Out[3]:
In [4]:
# Descriptive statistics of train_df.
train_df.describe()
Out[4]:
The target variable is `Survived`.
Your score is the percentage of passengers whose outcome you predict correctly; in other words, accuracy.
In [36]:
def select_features(df):
    """Return the column names usable as model features.

    A column qualifies when its dtype is not 'object' and it is not the
    'is_test' bookkeeping flag. The result is a plain list, in column order.
    """
    excluded = ['is_test']
    dtypes = df.dtypes
    candidates = dtypes[dtypes != 'object'].index
    return [name for name in candidates if name not in excluded]
def get_X_y(df):
    """Split a frame into a feature matrix and a label vector.

    Features are the columns chosen by select_features(df); labels are the
    frame's index values cast to int.

    Returns
    -------
    (X, y) : tuple of numpy arrays
    """
    feature_names = select_features(df)
    feature_matrix = df[feature_names].values
    # The target is stored in the index, not in a column.
    labels = df.index.values.astype(int)
    return feature_matrix, labels
def check_quality(model, X, y, n_folds=5, random_state=0, shuffle=False):
    """Cross-validate `model` and summarise its accuracy.

    Parameters
    ----------
    model : object exposing fit(X, y) and predict(X)
        The same instance is re-fitted on every fold.
    X, y : indexable by the index arrays the splitter yields
        (presumably NumPy arrays -- confirm against callers).
    n_folds, random_state, shuffle
        Forwarded unchanged to StratifiedKFold.

    Returns
    -------
    (mean, std) of the per-fold accuracy scores.
    """
    # StratifiedKFold is called with y in the constructor and iterated
    # directly, yielding (train indices, test indices) pairs.
    folds = StratifiedKFold(y, n_folds=n_folds, random_state=random_state, shuffle=shuffle)

    def _fold_accuracy(fit_idx, eval_idx):
        # Fit on the training part of the fold, score on the held-out part.
        model.fit(X[fit_idx], y[fit_idx])
        return accuracy_score(y[eval_idx], model.predict(X[eval_idx]))

    fold_scores = [_fold_accuracy(fit_idx, eval_idx) for fit_idx, eval_idx in folds]
    return np.mean(fold_scores), np.std(fold_scores)
def train_and_verify(all_df, model):
    """Evaluate `model` on the rows of `all_df` where is_test is False.

    Those rows are converted with get_X_y and passed to check_quality with
    its default settings; check_quality's result is returned unchanged.
    """
    train_rows = all_df.loc[all_df.is_test == False]
    return check_quality(model, *get_X_y(train_rows))
In [5]:
class SingleVariableModel(BaseEstimator, ClassifierMixin):
    """Constant baseline classifier: predicts class 0 for every sample.

    Parameters
    ----------
    seed : int, default=1
        Value used to seed NumPy's global random generator at construction.
    """

    def __init__(self, seed=1):
        # Keep the constructor argument as an attribute of the same name:
        # BaseEstimator.get_params() and sklearn.base.clone() look it up by
        # name, and the original never stored it.
        self.seed = seed
        # Seeds the *global* NumPy RNG. Kept from the original on purpose,
        # since code executed afterwards may depend on that state.
        np.random.seed(seed)

    def fit(self, X, y):
        """Do nothing (no parameters are learned) and return self."""
        return self

    def predict(self, X):
        """Return a list containing len(X) zeros."""
        return [0] * len(X)

    def __repr__(self):
        return 'SingleVariableModel'
In [37]:
# Cross-validated accuracy (mean, std) of the always-predict-0 baseline.
train_and_verify(all_df, SingleVariableModel())
Out[37]:
What do you think about this result?
In [7]:
# Replace every missing value with the sentinel -1
# (presumably so the estimator fitted in the next cell accepts the data -- confirm).
all_df = all_df.fillna(-1)
In [38]:
# Cross-validated accuracy (mean, std) of a random forest with default settings.
# NOTE(review): no random_state is passed, so the score presumably depends on
# the state of NumPy's global RNG when this cell runs -- confirm.
train_and_verify(all_df, RandomForestClassifier())
Out[38]:
The random forest beats the all-zeros baseline: accuracy rises from 0.616 to 0.683.
Let's improve it further by using the non-numeric features that `select_features` currently skips: ['Cabin', 'Embarked', 'Name', 'Sex', 'Ticket'].
In [ ]:
In [ ]: