In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.grid_search import GridSearchCV
# SKLearn Model Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression , Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
# SKLearn ensemble classifiers
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier , BaggingClassifier
from sklearn.ensemble import VotingClassifier , AdaBoostClassifier
# SKLearn Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
# Handle table-like data and matrices
import numpy as np
import pandas as pd
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# plot functions
import plotFunctions as pfunc
import dataFunctions as dfunc
# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6
In [2]:
# Load the dataset via the project helper -- presumably the concatenated
# Titanic train+test tables; verify against dataFunctions.loadData.
full = dfunc.loadData()
In [3]:
# Inspect missing-value counts per column before any imputation.
full.isnull().sum()
Out[3]:
In [4]:
# Feature-engineering pipeline: each dfunc helper takes the DataFrame and
# returns it with one feature filled/added. The helpers are project-local,
# so the exact imputation logic lives in dataFunctions.py.
full = dfunc.fillEmbarked(full)
In [ ]:
full = dfunc.fillFare(full)
In [ ]:
full = dfunc.fillDeck(full)
In [ ]:
full = dfunc.fillFamilySize(full)
In [ ]:
full = dfunc.fillNameSize(full)
In [ ]:
full = dfunc.fillTitles(full)
In [ ]:
# Sanity-check the distribution of the newly created 'Title' feature.
full['Title'].value_counts()
In [ ]:
full = dfunc.fillTicket(full)
In [ ]:
# How many passengers share each ticket (group size per 'Ticket' value,
# broadcast back to every row).
full.groupby('Ticket')[ 'Name' ].transform('count')
In [ ]:
# Encode categorical columns numerically so sklearn models accept them.
full = dfunc.categoricalToNum(full)
In [ ]:
full = dfunc.fillMissingAge(full)
In [ ]:
full = dfunc.featureScale(full)
In [ ]:
# Confirm the pipeline left no missing values behind.
print(full.isnull().sum())
In [ ]:
full.info()
In [ ]:
# Random-forest hyper-parameters, collected in one place for easy tuning.
parameters = dict(
    bootstrap=True,        # bootstrap-sample rows for each tree
    max_depth=6,           # cap tree depth to limit overfitting
    max_features='auto',   # sqrt(n_features) candidates per split
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=100,      # number of trees in the forest
    n_jobs=-1,             # use every available CPU core
)
rf = RandomForestClassifier(**parameters)
In [ ]:
# Engineered feature columns fed to the model (built by the dfunc pipeline).
predictors = [
    "Age", "Embarked", "Fare", "Pclass", "Sex", "SibSp", "Parch",
    "Deck", "Family", "FsizeD", "NameLength", "NlengthD", "Title",
    "TicketNumber",
]
In [ ]:
# Split the combined frame back into its two original parts by row count.
# NOTE(review): the names look swapped -- the first 891 rows are the
# *labelled training* data (they carry 'Survived' and are fitted on below),
# yet they are bound to `test`; the remaining rows are the unlabelled
# evaluation set bound to `titanic`. Later cells fit on `test` and predict
# on `titanic`, so behavior is consistent, but the naming is misleading.
test = full.head(891)
titanic = full.iloc[891:]
In [ ]:
# NOTE(review): sklearn.cross_validation was deprecated in 0.18 and removed
# in 0.20, so this cell only runs on older sklearn versions. The
# model_selection imports below are the modern equivalents of the first
# three lines; this KFold import shadows the one from model_selection.
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
In [ ]:
# Fit on the labelled rows. The cross_val_* calls below refit clones
# internally; this fitted `rf` is what the feature-importance plot and the
# final predict use.
rf.fit(test[predictors],test["Survived"])
# Legacy cross_validation.KFold signature (sample count first, n_folds=);
# the model_selection KFold takes only n_splits.
kf = KFold(test.shape[0], n_folds=10, random_state=1)
# NOTE(review): `cv` is created but never used in any visible cell.
cv = ShuffleSplit(n_splits=20, test_size=0.3, random_state=50)
In [ ]:
# Out-of-fold predictions over the 10 folds (legacy cross_validation API),
# then per-fold F1 scores for the same folds.
predictions = cross_validation.cross_val_predict(rf, test[predictors],test["Survived"],cv=kf)
predictions = pd.Series(predictions)
scores = cross_val_score(rf, test[predictors], test["Survived"],scoring='f1', cv=kf)
In [ ]:
# Report the individual fold scores and their mean.
print(scores)
print(scores.mean())
In [ ]:
# Plot per-feature importances of the fitted forest, with error bars taken
# from the spread across individual trees.
importances = rf.feature_importances_
# BUG FIX: the original computed np.std over `rf.feature_importances_`
# repeated once per tree (the loop variable `tree` was never used), so the
# std -- and every yerr error bar -- was exactly zero. Use each individual
# tree's importances instead.
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]  # most important feature first
# Feature names reordered to match the sorted importances.
sorted_important_features = [predictors[i] for i in indices]
plt.figure()
plt.title("Feature Importances By Random Forest Model")
plt.bar(range(np.size(predictors)), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(np.size(predictors)), sorted_important_features, rotation='vertical')
plt.xlim([-1, np.size(predictors)]);
In [ ]:
output = rf.predict(titanic[predictors]).astype(int)
In [ ]:
df_out = pd.DataFrame()
In [ ]:
print(output)
In [ ]:
In [ ]:
df_out['PassengerId']=titanic['PassengerId']
In [ ]:
df_out['Survived'] = output
In [ ]:
print(df_out)
In [ ]:
df_out[['PassengerId','Survived']].to_csv('./output/second_test.csv',index=False)
In [ ]: