In [1]:
import warnings
warnings.filterwarnings('ignore')
# SKLearn Model Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression , Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
# SKLearn ensemble classifiers
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier , BaggingClassifier
from sklearn.ensemble import VotingClassifier , AdaBoostClassifier
# SKLearn Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
# Handle table-like data and matrices
import numpy as np
import pandas as pd
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# plot functions
import plotFunctions as pfunc
import dataFunctions as dfunc
# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6
Best score: 0.8406285072951739 Best parameters: { 'bootstrap': True, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 10}
Best score: 0.8417508417508418 Best parameters: {'bootstrap': True, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10, 'n_jobs': 6}
In [2]:
full = dfunc.loadData()
In [ ]:
In [3]:
full.keys()
Out[3]:
In [ ]:
In [4]:
full=dfunc.featureProcessing(full)
In [5]:
#full = dfunc.fillAge(full)
full["Title"]
#full["Pclass"]
#full["Sex"]
Out[5]:
In [6]:
print(full.isnull().sum())
In [7]:
full.info()
In [8]:
#full["Title"].head(50)
In [9]:
test = full.head(891)
titanic = full.iloc[891:]
In [10]:
parameters = {'bootstrap': True,
'max_depth': 6,
'max_features': 'auto',
'min_samples_leaf': 1,
'min_samples_split': 10,
'n_estimators': 100,
'n_jobs': -1
}
rf = RandomForestClassifier(**parameters)
In [ ]:
In [11]:
predictors = ["Age",
"Embarked",
"Fare",
"Pclass",
"Sex",
"SibSp",
"Parch",
"Deck",
"Family",
"FsizeD",
"NameLength",
"NlengthD",
"Title",
"TicketNumber",
"Master",
"Miss",
"Mr",
"Mrs",
"Officer",
"Royalty",
"Shared_ticket",
"Ticket_group",
"Fare_cat",
"Fare_eff",
"Fare_eff_cat",
"Child",
#"Young",
"Adult"
]
In [12]:
test = full.head(891)
titanic = full.iloc[891:]
In [13]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
In [14]:
rf.fit(test[predictors],test["Survived"])
kf = KFold(test.shape[0], n_folds=10, random_state=1)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=50)
In [15]:
predictions = cross_validation.cross_val_predict(rf, test[predictors],test["Survived"],cv=kf)
predictions = pd.Series(predictions)
scores = cross_val_score(rf, test[predictors], test["Survived"],scoring='accuracy', cv=kf)
In [16]:
print(scores)
print(scores.mean())
In [17]:
from sklearn.grid_search import GridSearchCV
In [18]:
parameter_grid = {
'max_depth' : [4, 6, 8],
'n_estimators': [150, 100, 50, 10],
'max_features': ['sqrt', 'auto', 'log2'],
'min_samples_split': [2, 3, 5, 10],
'min_samples_leaf': [1, 3, 5, 10],
'bootstrap': [True, False],
'n_jobs': [-1, 6]
}
forest = RandomForestClassifier()
cross_validation = StratifiedKFold(test["Survived"], n_folds=10, random_state=1)
grid_search = GridSearchCV(forest, scoring='accuracy', param_grid=parameter_grid, cv=cross_validation)
grid_search.fit(test[predictors],test["Survived"])
model = grid_search
parameters = grid_search.best_params_
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: