In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              GradientBoostingClassifier, AdaBoostClassifier)
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# GridSearchCV, RandomizedSearchCV and cross_val_score live in
# sklearn.model_selection (sklearn.grid_search and sklearn.cross_validation
# were removed in scikit-learn 0.20)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
%matplotlib inline
In [ ]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
In [13]:
# train = pd.read_csv('s3a://aws-s3-data/kaggle/titanic/train.csv')
# test = pd.read_csv('s3a://aws-s3-data/kaggle/titanic/test.csv')
In [14]:
test.info()
In [15]:
train.describe()
Out[15]:
In [ ]:
# train.Cabin.str.split().str.get(-1).str[0]
# train.Cabin.str.split(expand=True)
In [16]:
# train.Ticket.str.split().str.get(0).str.extract
train.Ticket.str.split().str[0].head()
Out[16]:
In [20]:
print(train[train['Survived']==1]["Age"].mean(),
      train[train['Survived']==0]["Age"].mean(),
      test.Age.mean())
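The same comparison reads more cleanly as a single groupby (equivalent to the filtered means above):

train.groupby('Survived')['Age'].mean()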
In [21]:
def clean_data(titanic):
    titanic = titanic.copy()
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    # Keep only the deck letter of the last listed cabin
    titanic['Cabin'] = titanic['Cabin'].str.split().str.get(-1).str[0]
    # Keep only the ticket prefix
    titanic['Ticket'] = titanic.Ticket.str.split().str[0]
    titanic.loc[titanic["Sex"] == "male", "Sex"] = -10
    titanic.loc[titanic["Sex"] == "female", "Sex"] = 10
    titanic["Embarked"] = titanic["Embarked"].fillna("S")
    # Extract the title ("Mr.", "Mrs.", ...) from the name
    titanic['Title'] = titanic['Name'].apply(lambda x: x.split(',')[1].split()[0])
    # d = {'Mr.':'Mr', 'Mrs.':'Mrs', 'Miss.':'Miss', 'Master.':'Master', 'Don.':'Mr', 'Rev.':'Mr', 'Dr.':'Dr', 'Mme.':'Mrs',
    #      'Ms.':'Miss', 'Major.':'Mr', 'Lady.':'Miss', 'Sir.':'Mr', 'Mlle.':'Miss', 'Col.':'Mr', 'Capt.':'Mr', 'the':'Mr',
    #      'Jonkheer.':'Mr', 'Dona.':'Mrs'}
    # Map each title to a hand-picked numeric code
    d = {'Mr.':28, 'Mrs.':80, 'Miss.':50, 'Master.':28, 'Don.':40, 'Rev.':60, 'Dr.':60, 'Mme.':80,
         'Ms.':50, 'Major.':60, 'Lady.':70, 'Sir.':40, 'Mlle.':50, 'Col.':60, 'Capt.':60, 'the':28,
         'Jonkheer.':28, 'Dona.':70}
    titanic['Title'].replace(d, inplace=True)
    # Encode the remaining string columns as integer category codes
    colnames = ['Embarked', 'Cabin', 'Ticket']
    for colname in colnames:
        titanic[colname] = pd.Categorical(titanic[colname]).codes
    # # Grab all the features that can be included in a Random Forest Regressor
    # age_titanic = titanic[['Age','Fare','Ticket','Pclass','Cabin','Title']]
    # # Split into sets with known and unknown Age values
    # knownAge = age_titanic.loc[titanic.Age.notnull()]
    # unknownAge = age_titanic.loc[titanic.Age.isnull()]
    # # All age values are stored in a target array
    # y = knownAge.pop('Age').values
    # # All the other values are stored in the feature array
    # X = knownAge.values
    # # Create and fit a model
    # rtr = RandomForestRegressor(20)
    # rtr.fit(X, y)
    # # Use the fitted model to predict the missing values
    # predictedAges = rtr.predict(unknownAge.values[:, 1:])
    # # Assign those predictions to the full data set
    # titanic.loc[titanic.Age.isnull(), 'Age'] = predictedAges
    # # StandardScaler would subtract the mean from each value, then scale to unit variance
    # scaler = StandardScaler()
    # titanic['Age_scaled'] = scaler.fit_transform(titanic['Age'])
    # titanic['Fare_scaled'] = scaler.fit_transform(titanic['Fare'])
    # Scale Age and Fare to [0, 1]
    titanic.Age = titanic.Age / titanic.Age.max()
    titanic.Fare = titanic.Fare / titanic.Fare.max()
    # Interaction features
    titanic['AgeSex'] = titanic.Age * titanic.Sex
    titanic['AgeSexFare'] = titanic.Age * titanic.Sex * titanic.Fare
    # titanic['TitlePclass'] = titanic.Title * titanic.Pclass
    # titanic['CabinPclass'] = titanic.Cabin * titanic.Pclass
    # titanic['PclassSq'] = titanic.Pclass ** 2
    # titanic['SexFare'] = titanic.Sex * titanic.Fare
    # titanic["FamilySize"] = titanic['Parch'] + titanic['SibSp']
    # titanic.loc[titanic["Sex"] == "female", "Age"] = \
    #     titanic.loc[titanic["Sex"] == "female", "Age"].fillna(28.34)
    # titanic.loc[titanic["Sex"] == "male", "Age"] = \
    #     titanic.loc[titanic["Sex"] == "male", "Age"].fillna(30.62)
    # (titanic[titanic['Survived']==0]["Age"].mean())
    # titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 1
    # titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 2
    # titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 3
    # Drop columns not used for modeling
    titanic.drop(['Name',
                  # 'Ticket',
                  # 'Cabin',
                  # 'Age',
                  # 'Sex',
                  # 'Fare',
                  'SibSp',
                  'Parch',
                  # 'Title',
                  # 'Pclass',
                  ], axis=1, inplace=True)
    return titanic
In [24]:
df = clean_data(train)
df_train = df.copy()
df_train.drop('PassengerId', axis=1, inplace=True)
df_test = clean_data(test)
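As an aside, the commented-out StandardScaler lines inside clean_data would standardize Age and Fare instead of max-scaling them. A minimal sketch of that alternative on the cleaned frame (current scikit-learn expects 2-D input, hence the double brackets; the *_scaled column names follow the commented-out code):

# Standardize Age and Fare into new columns, leaving the max-scaled originals intact
scaled = StandardScaler().fit_transform(df[['Age', 'Fare']])
df['Age_scaled'], df['Fare_scaled'] = scaled[:, 0], scaled[:, 1]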
In [28]:
df.describe().T
Out[28]:
In [26]:
df_train.info()
In [27]:
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
train[train['Survived']==1]["Age"].hist(bins=20, label='survived')
plt.title('Survived')
plt.subplot(1,2,2)
train[train['Survived']==0]["Age"].hist(bins=20)
plt.title('Did not survive')
Out[27]:
In [29]:
df.head()
Out[29]:
In [30]:
y = df_train.pop('Survived').values
X = df_train.values
X_test = df_test.values
In [31]:
rf = RandomForestClassifier(40, n_jobs=-1)
rf.fit(X,y)
Out[31]:
In [32]:
feat_rank = np.argsort(rf.feature_importances_)[::-1]
feat_rank
Out[32]:
In [33]:
df_train.columns[feat_rank]
Out[33]:
In [34]:
df_features = pd.DataFrame(rf.feature_importances_, index=df_train.columns, columns=['feature_value'])
In [35]:
df_features.sort_values('feature_value', ascending=False)
Out[35]:
In [36]:
scores = np.zeros((feat_rank.shape[0],2))
for i in range(1, feat_rank.shape[0] + 1):
    # Cross-validate using the top-i features by importance
    features = [df_train.columns[feat_rank][x] for x in range(i)]
    scores[i-1] = (i, cross_val_score(rf, df[features], df['Survived'], cv=10).mean())
scores
Out[36]:
In [37]:
plt.plot(scores[:, 0], scores[:, 1])
Out[37]:
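A simple way to choose the feature count from the sweep above is the i with the highest mean CV score (10-fold CV is noisy, so treat this as a rough heuristic):

best_i = int(scores[np.argmax(scores[:, 1]), 0])
best_i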
In [39]:
cross_val_score(rf, df[features], df['Survived'], cv=10).mean()
Out[39]:
In [43]:
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure(figsize=(12,5))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), df_train.columns[indices])
plt.xlim([-1, X.shape[1]])
plt.show()
In [38]:
features = [df_train.columns[feat_rank][x] for x in range(9)]
features
Out[38]:
In [51]:
# features = [df_train.columns[indices][x] for x in range(9)]
# features
In [45]:
X = df_train[features].values
X
Out[45]:
In [46]:
def create_submission(model, train, test, features, filename):
    # Fit on the full training set, then write test-set predictions to a CSV
    model.fit(train[features], train['Survived'])
    predictions = model.predict(test[features])
    submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv(filename, index=False)
In [47]:
from time import time
from scipy.stats import randint as sp_randint
# build a classifier
clf = RandomForestClassifier()
# Utility function to report best scores from a fitted search's cv_results_
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        for candidate in np.flatnonzero(results['rank_test_score'] == i):
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
# specify parameters and distributions to sample from
# (min_samples_split must be at least 2 in current scikit-learn)
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 6),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "n_estimators": [10, 40, 50, 60],
              "criterion": ["gini", "entropy"]}
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, n_jobs=-1)
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidate parameter settings."
      % ((time() - start), n_iter_search))
report(random_search.cv_results_)
In [50]:
# use a full grid over all parameters
# (min_samples_split=1 is no longer valid, so the grid starts at 2)
param_grid = {'max_depth': [1, 2, 4, None],
              'max_features': ['sqrt', 'log2', None],
              'min_samples_split': [2, 6, 8, 10],
              'min_samples_leaf': [1, 2, 4, 6],
              'bootstrap': [True, False],
              'n_estimators': [30, 40, 50, 60, 100],
              'criterion': ['gini', 'entropy']}
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1)
start = time()
grid_search.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)
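The fitted search also exposes the best cross-validated score and its parameters directly:

grid_search.best_score_, grid_search.best_params_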
In [49]:
grid_search.best_estimator_
Out[49]:
In [52]:
create_submission(grid_search.best_estimator_,
                  df, df_test, features, "../submissions/rf_submission.csv")

Log of earlier submissions (public leaderboard score, features used, and the call that produced it):

0.79426 with features ['AgeSex', 'AgeSexFare', 'Fare', 'Sex', 'Pclass', 'Age']:
create_submission(RandomForestClassifier(bootstrap=True,
                                         min_samples_leaf=3,
                                         n_estimators=20,
                                         min_samples_split=9,
                                         criterion='entropy',
                                         max_features=4,
                                         max_depth=None),
                  df, df_test, features, "submission.csv")

0.78469 with features ['AgeSex', 'AgeSexFare', 'Fare', 'Age', 'Pclass', 'Sex']:
create_submission(RandomForestClassifier(50, min_samples_split=4, min_samples_leaf=2),
                  df, df_test, predictors, "submission.csv")

0.76555 with features ['AgeSex', 'AgeSexFare', 'Fare', 'Age']:
create_submission(RandomForestClassifier(50, min_samples_split=4, min_samples_leaf=2),
                  df, df_test, features, "submission.csv")
In [ ]:
trees_accuracy = []
for i in range(1, X.shape[1]):
    rf = RandomForestClassifier(50, max_features=i, min_samples_split=4, min_samples_leaf=2)
    rf.fit(X, y)
    # Training-set accuracy, not cross-validated
    trees_accuracy.append(rf.score(X, y))
In [ ]:
plt.plot(range(1, X.shape[1]), trees_accuracy, '-o')
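rf.score(X, y) above is training accuracy and saturates quickly; a cross-validated variant of the same max_features sweep (same hyperparameters, 10-fold CV as in earlier cells) gives a fairer comparison:

# Mean 10-fold CV accuracy for each max_features setting
cv_accuracy = []
for i in range(1, X.shape[1]):
    rf_i = RandomForestClassifier(50, max_features=i, min_samples_split=4, min_samples_leaf=2)
    cv_accuracy.append(cross_val_score(rf_i, X, y, cv=10).mean())
plt.plot(range(1, X.shape[1]), cv_accuracy, '-o')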
In [ ]:
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('svc', SVC(kernel='linear'))])
pipeline.fit(X, y)
In [ ]:
parameters = {'kernel': ['linear', 'rbf'],
              'C': np.linspace(.001, 10, 5), 'degree': np.linspace(0, 10, 5)}
gsCV = GridSearchCV(estimator=pipeline.steps[1][1],
                    param_grid=parameters, scoring='accuracy', cv=5)
In [ ]:
X = pipeline.steps[0][1].fit_transform(X)
In [ ]:
gsCV.fit(X,y)
In [ ]:
gsCV.cv_results_, gsCV.best_params_
Best result recorded from this search: mean: 0.78151, std: 0.03323, params: {'C': 25.00075, 'degree': 0.0}
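Note that scaling X before running the search lets every CV fold see statistics computed from the whole training set. A sketch that avoids this by searching over the pipeline itself (step-prefixed parameter names are the standard scikit-learn convention; the feature list is the one selected above):

# Scaling happens inside each CV fold because the pipeline is the estimator
pipe_params = {'svc__kernel': ['linear', 'rbf'],
               'svc__C': np.linspace(.001, 10, 5)}
gsCV_pipe = GridSearchCV(pipeline, param_grid=pipe_params, scoring='accuracy', cv=5)
gsCV_pipe.fit(df_train[features].values, y)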
In [ ]:
def svm_submission(model, train, test, features, filename):
    # Identical to create_submission above; kept for the SVM experiments
    model.fit(train[features], train['Survived'])
    predictions = model.predict(test[features])
    submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv(filename, index=False)
In [ ]:
svm_features = [df_train.columns[feat_rank][x] for x in range(8)]
svm_features
In [ ]:
create_submission(Pipeline([('scaler', StandardScaler()),
                            ('svc', SVC(kernel='rbf', C=2.5, degree=2.5))]),
                  df, df_test, svm_features, "../submissions/svm_submission.csv")
In [ ]:
X = df_train
X.head()
In [ ]:
gdb = GradientBoostingClassifier(n_estimators=3000,
                                 learning_rate=0.01,
                                 max_depth=4,
                                 max_features=0.1,
                                 min_samples_leaf=17)
gdb.fit(X, y)
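A quick 10-fold cross-validated check of this configuration, matching the scoring used in the earlier random-forest cells (slow with 3000 estimators):

cross_val_score(gdb, X, y, cv=10).mean()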
In [ ]:
feat_rank = np.argsort(gdb.feature_importances_)[::-1]
df_train.columns[feat_rank]
In [ ]:
boost_features = [df_train.columns[feat_rank][x] for x in range(8)]
boost_features
In [ ]:
df_train[boost_features].head()
In [ ]:
X = df_train[boost_features]
X.head()
In [ ]:
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6],
              'min_samples_leaf': [3, 5, 9, 17],
              'max_features': [1.0, 0.3, 0.1]}
gdb_grid = GradientBoostingClassifier(n_estimators=6000)
gs_cv = GridSearchCV(gdb_grid, param_grid).fit(X,y)
gs_cv.best_params_
In [ ]:
gs_cv.cv_results_
Best parameters recorded from this search:
{'learning_rate': 0.01,
 'max_depth': 4,
 'max_features': 0.1,
 'min_samples_leaf': 17}
In [ ]:
create_submission(GradientBoostingClassifier(n_estimators=3000,
                                             learning_rate=0.01,
                                             max_depth=4,
                                             max_features=0.1,
                                             min_samples_leaf=9),
                  df, df_test, boost_features, "../submissions/gdboost_submission.csv")
In [ ]:
X = df_train
X.head()
In [ ]:
ada = AdaBoostClassifier(n_estimators=3000,
                         learning_rate=0.01)
ada.fit(X, y)
In [ ]:
feat_rank = np.argsort(ada.feature_importances_)[::-1]
ada_features = [df_train.columns[feat_rank][x] for x in range(6)]
ada_features
In [ ]:
X = df_train[ada_features]
X.head()
In [ ]:
param_grid = {'learning_rate': [1, 0.1, 0.05, 0.02, 0.01]}
ada_grid = AdaBoostClassifier(n_estimators=6000)
ada_cv = GridSearchCV(ada_grid, param_grid).fit(X,y)
ada_cv.best_params_
In [ ]:
create_submission(AdaBoostClassifier(n_estimators=3000,
                                     learning_rate=0.01),
                  df, df_test, ada_features, "../submissions/adaboost_submission.csv")
In [ ]: