In [1]:
import warnings
warnings.filterwarnings('ignore')


# SKLearn model algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC

# SKLearn ensemble classifiers
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier

# SKLearn modelling helpers
# (grid_search and cross_validation were deprecated in 0.18; model_selection replaces both)
from sklearn.preprocessing import Imputer, Normalizer, scale
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_selection import RFECV

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# plot functions
import plotFunctions as pfunc
import dataFunctions as dfunc

# Configure visualisations
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 8, 6


In [2]:
full = dfunc.loadData()
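
dfunc.loadData comes from the external dataFunctions module, so its body is not shown here. A plausible sketch, assuming it simply concatenates the Kaggle train and test CSVs into one frame (the file paths are hypothetical):

def loadData_sketch():
    # Hypothetical stand-in for dataFunctions.loadData
    train = pd.read_csv('./input/train.csv')   # 891 rows, Survived present
    test = pd.read_csv('./input/test.csv')     # 418 rows, Survived missing
    return pd.concat([train, test], ignore_index=True)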

In [3]:
full.isnull().sum()


Out[3]:
Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

In [4]:
full = dfunc.fillEmbarked(full)


Process  Embarked  : Concluded!
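
fillEmbarked is also defined in dataFunctions; a minimal sketch of what such a step usually does, assuming the two missing ports are replaced with the most frequent value:

def fillEmbarked_sketch(df):
    # Hypothetical stand-in: fill the two missing embarkation ports with the mode.
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    print('Process  Embarked  : Concluded!')
    return df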

In [ ]:
full = dfunc.fillFare(full)
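
Again only a guess at the helper's body: the single missing Fare is commonly replaced with the median fare.

def fillFare_sketch(df):
    # Hypothetical stand-in for dataFunctions.fillFare
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    return df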

In [ ]:
full = dfunc.fillDeck(full)
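
A hedged sketch of the deck feature, assuming it is the first letter of Cabin with missing cabins mapped to 'U' (unknown):

def fillDeck_sketch(df):
    # Hypothetical stand-in: 'C85' -> 'C', NaN -> 'U'
    df['Deck'] = df['Cabin'].fillna('U').str[0]
    return df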

In [ ]:
full = dfunc.fillFamilySize(full)
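
The predictors list further down uses both Family and FsizeD, so this helper presumably builds the raw family size and a discretised copy; a sketch with illustrative bin edges:

def fillFamilySize_sketch(df):
    # Family size = passenger + siblings/spouses + parents/children
    df['Family'] = df['SibSp'] + df['Parch'] + 1
    # FsizeD: discretised family size (bin edges are an assumption)
    df['FsizeD'] = pd.cut(df['Family'], bins=[0, 1, 4, 11],
                          labels=['singleton', 'small', 'large'])
    return df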

In [ ]:
full = dfunc.fillNameSize(full)
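
NameLength and NlengthD both appear in the predictors list, so a plausible sketch is the raw name length plus a binned copy:

def fillNameSize_sketch(df):
    # Hypothetical stand-in for dataFunctions.fillNameSize
    df['NameLength'] = df['Name'].str.len()
    df['NlengthD'] = pd.qcut(df['NameLength'], 4, labels=False)
    return df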

In [ ]:
full = dfunc.fillTitles(full)
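
Title extraction is a standard Titanic step; a sketch assuming the title is the word before the period in Name, with rare spellings merged:

def fillTitles_sketch(df):
    # 'Braund, Mr. Owen Harris' -> 'Mr'
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    return df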

In [ ]:
full['Title'].value_counts()

In [ ]:
full = dfunc.fillTicket(full)
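
TicketNumber is used as a predictor later, so the helper presumably strips the ticket down to its digits; a hedged sketch:

def fillTicket_sketch(df):
    # Keep the trailing digits; tickets with no digits (e.g. 'LINE') become 0.
    df['TicketNumber'] = (df['Ticket'].str.extract(r'(\d+)$', expand=False)
                          .fillna(0).astype(int))
    return df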

In [ ]:
# Number of passengers sharing each ticket (displayed for inspection only)
full.groupby('Ticket')['Name'].transform('count')

In [ ]:
full = dfunc.categoricalToNum(full)
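
Presumably this maps the remaining string columns to integer codes so scikit-learn can consume them; a sketch using pandas category codes (the column list is an assumption):

def categoricalToNum_sketch(df):
    for col in ['Sex', 'Embarked', 'Deck', 'Title', 'FsizeD']:
        df[col] = df[col].astype('category').cat.codes
    return df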

In [ ]:
full = dfunc.fillMissingAge(full)
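
Filling the 263 missing ages needs the other features to already be numeric, which is why this runs after categoricalToNum. A common approach, and only a guess at what the helper actually does, is to predict Age with a regressor:

from sklearn.ensemble import RandomForestRegressor

def fillMissingAge_sketch(df):
    # Hypothetical stand-in: predict missing ages from a few numeric features.
    features = ['Fare', 'Parch', 'SibSp', 'Pclass', 'Sex', 'Title']
    known = df[df['Age'].notnull()]
    model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    model.fit(known[features], known['Age'])
    df.loc[df['Age'].isnull(), 'Age'] = model.predict(df.loc[df['Age'].isnull(), features])
    return df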

In [ ]:
full = dfunc.featureScale(full)
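
A sketch of the scaling step, assuming the continuous columns are standardised to zero mean and unit variance (the column list is illustrative):

from sklearn.preprocessing import StandardScaler

def featureScale_sketch(df):
    cols = ['Age', 'Fare', 'NameLength', 'TicketNumber']
    df[cols] = StandardScaler().fit_transform(df[cols])
    return df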

In [ ]:
print(full.isnull().sum())

In [ ]:
full.info()

In [ ]:
parameters = {'bootstrap': True,
              'max_depth': 6, 
              'max_features': 'auto', 
              'min_samples_leaf': 1, 
              'min_samples_split': 10, 
              'n_estimators': 100,
              'n_jobs': -1
             }
rf = RandomForestClassifier(**parameters)

In [ ]:
predictors = ["Age","Embarked","Fare",
             "Pclass","Sex","SibSp",
             "Parch","Deck","Family",
             "FsizeD","NameLength","NlengthD","Title",
             "TicketNumber"]

In [ ]:
# First 891 rows are the original training set; the remaining 418 rows are the Kaggle test set.
train = full.head(891)
test = full.iloc[891:]
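
The random-forest hyperparameters above were fixed by hand. As a hedged illustration only, GridSearchCV from the first cell could search a small grid on the training rows (the grid values here are not the ones the author used):

param_grid = {'max_depth': [4, 6, 8],
              'min_samples_split': [2, 10, 20],
              'n_estimators': [100, 300]}
grid = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid,
                    scoring='f1', cv=5)
grid.fit(train[predictors], train['Survived'])
print(grid.best_params_, grid.best_score_)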

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.model_selection import cross_val_predict, cross_val_score

In [ ]:
rf.fit(train[predictors], train["Survived"])

kf = KFold(n_splits=10, shuffle=True, random_state=1)
cv = ShuffleSplit(n_splits=20, test_size=0.3, random_state=50)

In [ ]:
predictions = cross_val_predict(rf, train[predictors], train["Survived"], cv=kf)
predictions = pd.Series(predictions)
scores = cross_val_score(rf, train[predictors], train["Survived"], scoring='f1', cv=kf)

In [ ]:
print(scores)
print(scores.mean())

In [ ]:
importances = rf.feature_importances_
# Per-tree spread of the importances supplies the error bars.
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
sorted_important_features = [predictors[i] for i in indices]

plt.figure()
plt.title("Feature Importances By Random Forest Model")
plt.bar(range(np.size(predictors)), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(np.size(predictors)), sorted_important_features, rotation='vertical')

plt.xlim([-1, np.size(predictors)]);

In [ ]:
output = rf.predict(test[predictors]).astype(int)

In [ ]:
df_out = pd.DataFrame()

In [ ]:
print(output)

In [ ]:
df_out['PassengerId'] = test['PassengerId']

In [ ]:
df_out['Survived'] = output

In [ ]:
print(df_out)

In [ ]:
df_out[['PassengerId','Survived']].to_csv('./output/second_test.csv',index=False)
