In [1]:
import warnings
warnings.filterwarnings('ignore')
# SKLearn Model Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
# SKLearn ensemble classifiers
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
# SKLearn Modelling Helpers
# (Imputer and sklearn.cross_validation were removed from scikit-learn;
# the replacements live in sklearn.impute and sklearn.model_selection.)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer, scale
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import RFECV
# Handle table-like data and matrices
import numpy as np
import pandas as pd
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# plot functions
import plotFunctions as pfunc
import dataFunctions as dfunc
# Configure visualisations
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8, 6
In [2]:
full = dfunc.loadData()
In [3]:
full.keys()
Out[3]:
In [4]:
full = dfunc.featureProcessing(full)
In [5]:
#full = dfunc.fillAge(full)
full["Title"]
#full["Pclass"]
#full["Sex"]
Out[5]:
In [6]:
print(full.isnull().sum())
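The null counts printed above show which columns still carry gaps after featureProcessing. If any numeric columns remain incomplete, a minimal imputation sketch looks like this (the column list is an assumption; substitute whatever the printout flags):

# Hypothetical sketch: median-impute numeric columns that still contain NaNs.
# "Age" and "Fare" are placeholders for whatever isnull().sum() reports above.
num_cols = ["Age", "Fare"]
imp = SimpleImputer(strategy="median")
full[num_cols] = imp.fit_transform(full[num_cols])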
In [7]:
full.info()
In [8]:
#full["Title"].head(50)
In [9]:
# The first 891 rows of `full` are the labelled training rows; the rest
# form the unlabelled test set that needs predictions.
train = full.head(891)
test = full.iloc[891:]
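A quick sanity check guards the split (891 is the size of the Kaggle Titanic training set, and only those rows should carry a label, assuming Survived is left empty for the test rows of `full`); a minimal sketch:

# Sketch: the training split should be fully labelled, the test split not at all.
assert train["Survived"].notnull().all()
assert test["Survived"].isnull().all()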
In [10]:
parameters = {'bootstrap': True,
              'max_depth': 6,
              # 'auto' was removed in scikit-learn 1.3; for classifiers it
              # meant 'sqrt', so this keeps the original behaviour.
              'max_features': 'sqrt',
              'min_samples_leaf': 1,
              'min_samples_split': 10,
              'n_estimators': 100,
              'n_jobs': -1}
rf = RandomForestClassifier(**parameters)
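The hyperparameters above are fixed by hand. As a sketch of how values such as max_depth and min_samples_split could be tuned instead, a small grid search over illustrative ranges (not the search that produced the numbers above) might look like:

from sklearn.model_selection import GridSearchCV

# Illustrative grid only; fit once train and predictors are defined below.
param_grid = {'max_depth': [4, 6, 8],
              'min_samples_split': [2, 10, 20],
              'n_estimators': [100, 300]}
search = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=1),
                      param_grid, scoring='accuracy', cv=5)
# search.fit(train[predictors], train["Survived"])
# print(search.best_params_)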
In [11]:
predictors = ["Age",
"Embarked",
"Fare",
"Pclass",
"Sex",
"SibSp",
"Parch",
"Deck",
"Family",
"FsizeD",
"NameLength",
"NlengthD",
"Title",
"TicketNumber",
"Master",
"Miss",
"Mr",
"Mrs",
"Officer",
"Royalty",
"Shared_ticket",
"Ticket_group",
"Fare_cat",
"Fare_eff",
"Fare_eff_cat",
"Child",
#"Young",
"Adult"
]
In [13]:
# Everything that used to live in sklearn.cross_validation is now in
# sklearn.model_selection.
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score, ShuffleSplit
In [14]:
rf.fit(train[predictors], train["Survived"])
# KFold now takes the number of splits (not the sample count) and needs
# shuffle=True for random_state to matter.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=50)
In [15]:
predictions = cross_val_predict(rf, train[predictors], train["Survived"], cv=kf)
predictions = pd.Series(predictions)
scores = cross_val_score(rf, train[predictors], train["Survived"], scoring='accuracy', cv=kf)
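The ShuffleSplit object `cv` built in the previous cell is never actually used; as a quick robustness check, the same scoring can be repeated against it (a sketch; `shuffle_scores` is a new name):

# Sketch: re-estimate accuracy with the ShuffleSplit splitter to check that
# the k-fold figure is not an artefact of one particular partitioning.
shuffle_scores = cross_val_score(rf, train[predictors], train["Survived"],
                                 scoring='accuracy', cv=cv)
print(shuffle_scores.mean())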
In [16]:
print(scores)
print(scores.mean())
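Because bootstrap=True, the forest can also produce an out-of-bag accuracy estimate, a cheap cross-check on the CV score. A sketch (oob_score is not enabled in the parameters dict above; `rf_oob` is a new name):

# Sketch: refit with out-of-bag scoring enabled and compare to the CV mean.
rf_oob = RandomForestClassifier(oob_score=True, **parameters)
rf_oob.fit(train[predictors], train["Survived"])
print(rf_oob.oob_score_)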
In [17]:
importances = rf.feature_importances_
# Per-feature spread of importance across the individual trees; iterate over
# the trees themselves, not the forest's aggregate importances.
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
sorted_important_features = []
for i in indices:
    sorted_important_features.append(predictors[i])
# predictors = train.columns
plt.figure()
plt.title("Feature Importances By Random Forest Model")
plt.bar(range(np.size(predictors)), importances[indices],
        color="b", yerr=std[indices], align="center")
plt.xticks(range(np.size(predictors)), sorted_important_features, rotation='vertical')
plt.xlim([-1, np.size(predictors)]);
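RFECV is imported at the top of the notebook but never used. A minimal sketch of letting it prune the predictor list instead of reading the bar chart by eye (`selector` and `kept` are new names):

# Sketch: recursive feature elimination with cross-validation; keeps the
# subset of predictors that maximises CV accuracy.
selector = RFECV(RandomForestClassifier(**parameters), step=1, cv=5, scoring='accuracy')
selector.fit(train[predictors], train["Survived"])
kept = [p for p, keep in zip(predictors, selector.support_) if keep]
print(kept)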
In [18]:
output = rf.predict(test[predictors]).astype(int)
df_out = pd.DataFrame()
In [19]:
df_out['PassengerId'] = test['PassengerId']
In [20]:
df_out['Survived'] = output
In [21]:
#df_out[['PassengerId','Survived']].to_csv('./output/output-20171015.csv',index=False)
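Once the model is final, the commented line above writes the submission; a generic variant (the filename is a placeholder):

# Sketch: write the two-column Kaggle submission file.
df_out[['PassengerId', 'Survived']].to_csv('./output/submission.csv', index=False)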