In [1]:
import warnings
warnings.filterwarnings('ignore')
# SKLearn Model Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression , Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
# SKLearn ensemble classifiers
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier , BaggingClassifier
from sklearn.ensemble import VotingClassifier , AdaBoostClassifier
# SKLearn Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
# Handle table-like data and matrices
import numpy as np
import pandas as pd
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# plot functions
import plotFunctions as pfunc
import dataFunctions as dfunc
# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6
Best score: 0.8406285072951739 Best parameters: { 'bootstrap': True, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 10}
In [2]:
full = dfunc.loadData()
In [3]:
full.keys()
Out[3]:
In [4]:
#full["Embarked"]
In [5]:
full=full.drop(['Name','Ticket'], axis=1)
#full=full.drop(['PassengerId','Name','Ticket'], axis=1)
In [6]:
# Feature-preparation steps from the project's dataFunctions module. Each call
# receives the current frame and its return value replaces `full`, so the cells
# must run once, in this order.
# NOTE(review): what each helper does is inferred from its name only — confirm
# against dataFunctions before relying on the comments below.
# Presumably fills missing Embarked values.
full = dfunc.fillEmbarked(full)
In [7]:
# Presumably adds Embarked indicator columns (the "C"/"Q" predictors used later) — TODO confirm.
full = dfunc.fillEmbarkedDummies(full)
In [8]:
# Presumably fills missing Fare values.
full = dfunc.fillFare(full)
In [9]:
# Presumably encodes Sex numerically; the meaning of the True flag is not visible here.
full = dfunc.convertSexToNum(full,True)
In [10]:
# NOTE(review): if this helper removes rows with a missing Age, the positional
# 891-row split used further down no longer lines up with the labelled rows — verify.
full = dfunc.fillMissingAgeDrop(full)
In [11]:
# Per-column count of values that are still missing.
print(full.isnull().sum())
In [12]:
full.drop("Cabin",axis=1,inplace=True)
In [13]:
full = dfunc.fillFamily(full)
In [14]:
full = dfunc.fillPerson(full)
In [15]:
full.drop("Person",axis=1,inplace=True)
In [16]:
full = dfunc.fillPclass(full)
In [17]:
full.drop("Pclass",axis=1,inplace=True)
In [ ]:
In [18]:
# Positional split of the combined frame at row 891.
# Naming caveat: `test` holds the first 891 rows, which the later cells use
# for fitting and scoring against "Survived"; `titanic` holds the remaining
# rows, which are only predicted and written to the submission file.
test = full.iloc[:891]
titanic = full.iloc[891:]
In [ ]:
In [19]:
# Hyper-parameters for the random forest that is cross-validated below.
parameters = {
    'bootstrap': True,
    'max_depth': 6,
    # 'sqrt' is exactly what 'auto' meant for classifiers; 'auto' itself was
    # deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 10,
    'n_estimators': 100,
    'n_jobs': -1,
}
# The seed is passed separately so `parameters` keeps the same keys as before;
# it makes the forest, and therefore the cross-validation scores, repeatable.
rf = RandomForestClassifier(random_state=1, **parameters)
In [ ]:
In [20]:
# Feature columns handed to every model in this notebook (order matters for
# the column order of the design matrix).
predictors = [
    "Age", "Fare",
    "C", "Q",
    "Sex", "Family",
    "Child", "Female",
    "Class_1", "Class_2",
]
In [21]:
# Same positional split as cell In [18], repeated: the first 891 rows become
# `test` (the rows the models are fitted on) and the rest become `titanic`
# (the rows that are predicted for the submission).
test = full.iloc[:891]
titanic = full.iloc[891:]
In [22]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
In [23]:
rf.fit(test[predictors],test["Survived"])
kf = KFold(test.shape[0], n_folds=10, random_state=1)
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=50)
In [24]:
predictions = cross_validation.cross_val_predict(rf, test[predictors],test["Survived"],cv=kf)
predictions = pd.Series(predictions)
scores = cross_val_score(rf, test[predictors], test["Survived"],scoring='accuracy', cv=kf)
In [25]:
print(scores)
print(scores.mean())
In [26]:
from sklearn.grid_search import GridSearchCV
In [27]:
# Disabled hyper-parameter grid search, kept for reference; nothing in this
# cell executes. Its two print formats match the "Best score ... Best
# parameters ..." text saved near the top of the notebook.
# NOTE(review) before re-enabling:
#   - assigning to `cross_validation` would shadow the `cross_validation`
#     module imported earlier in the notebook;
#   - `n_jobs` only controls parallelism, so listing it in the grid doubles
#     the number of fits without exploring any model setting.
#parameter_grid = {
# 'max_depth' : [4, 6, 8],
# 'n_estimators': [150, 100, 50, 10],
# 'max_features': ['sqrt', 'auto', 'log2'],
# 'min_samples_split': [2, 3, 5, 10],
# 'min_samples_leaf': [1, 3, 5, 10],
# 'bootstrap': [True, False],
# 'n_jobs': [-1, 6]
# }
#forest = RandomForestClassifier()
#cross_validation = StratifiedKFold(test["Survived"], n_folds=10, random_state=1)
#grid_search = GridSearchCV(forest, scoring='accuracy', param_grid=parameter_grid, cv=cross_validation)
#grid_search.fit(test[predictors],test["Survived"])
#model = grid_search
#parameters = grid_search.best_params_
#print('Best score: {}'.format(grid_search.best_score_))
#print('Best parameters: {}'.format(grid_search.best_params_))
In [31]:
# Final model: fit on the labelled rows (`test`) and predict the remaining
# rows (`titanic`). A fixed seed makes the forest, and therefore the
# submission file written below, reproducible from run to run.
random_forest = RandomForestClassifier(n_estimators=100, random_state=1)
random_forest.fit(test[predictors], test["Survived"])
# Cast the predicted labels to int for the submission file.
Y_pred = random_forest.predict(titanic[predictors]).astype(int)
# Displayed value: accuracy on the very rows the model was fitted on. This is
# an optimistic figure; the cross-validated scores computed earlier are the
# realistic estimate.
random_forest.score(test[predictors], test["Survived"])
Out[31]:
In [32]:
titanic.keys()
Out[32]:
In [33]:
submission = pd.DataFrame({
"PassengerId": titanic["PassengerId"],
"Survived": Y_pred
})
submission.to_csv('titanic.csv', index=False)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: