notebook.community

Edit and run



In [1]:

    
import warnings
warnings.filterwarnings('ignore')



# SKLearn Model Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression , Perceptron

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC

# SKLearn ensemble classifiers
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier , BaggingClassifier
from sklearn.ensemble import VotingClassifier , AdaBoostClassifier

# SKLearn Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# plot functions
import plotFunctions as pfunc
import dataFunctions as dfunc

# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6









    



C:\Users\lvarr\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Best score: 0.8406285072951739 Best parameters: { 'bootstrap': True, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 10}



In [2]:

    
full = dfunc.loadData()



In [3]:

    
full.keys()









    Out[3]:





Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket'],
      dtype='object')



In [4]:

    
#full["Embarked"]



In [5]:

    
full=full.drop(['Name','Ticket'], axis=1)
#full=full.drop(['PassengerId','Name','Ticket'], axis=1)



In [6]:

    
full = dfunc.fillEmbarked(full)









    



Process  Embarked  : Concluded!



In [7]:

    
full = dfunc.fillEmbarkedDummies(full)



In [8]:

    
full = dfunc.fillFare(full)









    



Process  Converting Fare  : Concluded!



In [9]:

    
full = dfunc.convertSexToNum(full,True)









    



Process  Converting Sex to num  : Concluded!



In [10]:

    
full = dfunc.fillMissingAgeDrop(full)



In [11]:

    
print(full.isnull().sum())









    



Age               0
Cabin          1014
Embarked          0
Fare              0
Parch             0
PassengerId       0
Pclass            0
SibSp             0
Survived        418
C                 0
Q                 0
Sex               0
dtype: int64



In [12]:

    
full.drop("Cabin",axis=1,inplace=True)



In [13]:

    
full = dfunc.fillFamily(full)









    



Process  Family  : Concluded!



In [14]:

    
full = dfunc.fillPerson(full)









    



Process  Person  : Concluded!



In [15]:

    
full.drop("Person",axis=1,inplace=True)



In [16]:

    
full = dfunc.fillPclass(full)



In [17]:

    
full.drop("Pclass",axis=1,inplace=True)



In [ ]:



In [18]:

    
test = full.head(891)
titanic = full.iloc[891:]



In [ ]:



In [19]:

    
parameters = {'bootstrap': True,
              'max_depth': 6, 
              'max_features': 'auto', 
              'min_samples_leaf': 1, 
              'min_samples_split': 10, 
              'n_estimators': 100,
              'n_jobs': -1
             }
rf = RandomForestClassifier(**parameters)



In [ ]:



In [20]:

    
predictors = ["Age",
            "Fare",
            "C",
            "Q",
            "Sex",
            "Family",
            "Child",
            "Female",
            "Class_1",
            "Class_2"
             ]



In [21]:

    
test = full.head(891)
titanic = full.iloc[891:]



In [22]:

    
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import ShuffleSplit



In [23]:

    
rf.fit(test[predictors],test["Survived"])

kf = KFold(test.shape[0], n_folds=10, random_state=1)
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=50)



In [24]:

    
predictions = cross_validation.cross_val_predict(rf, test[predictors],test["Survived"],cv=kf)
predictions = pd.Series(predictions)
scores = cross_val_score(rf, test[predictors], test["Survived"],scoring='accuracy', cv=kf)



In [25]:

    
print(scores)
print(scores.mean())









    



[ 0.81111111  0.88764045  0.79775281  0.86516854  0.82022472  0.80898876
  0.80898876  0.7752809   0.87640449  0.84269663]
0.829425717853



In [26]:

    
from sklearn.grid_search import GridSearchCV









    



C:\Users\lvarr\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\sklearn\grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)



In [27]:

    
#parameter_grid = {
#    'max_depth' : [4, 6, 8],
#    'n_estimators': [150, 100, 50, 10],
#    'max_features': ['sqrt', 'auto', 'log2'],
#    'min_samples_split': [2, 3, 5, 10],
#    'min_samples_leaf': [1, 3, 5, 10],
#    'bootstrap': [True, False],
#    'n_jobs': [-1, 6]
#    }
#forest = RandomForestClassifier()
#cross_validation = StratifiedKFold(test["Survived"], n_folds=10, random_state=1)
#grid_search = GridSearchCV(forest, scoring='accuracy', param_grid=parameter_grid, cv=cross_validation)
#grid_search.fit(test[predictors],test["Survived"])
#model = grid_search
#parameters = grid_search.best_params_
#print('Best score: {}'.format(grid_search.best_score_))
#print('Best parameters: {}'.format(grid_search.best_params_))



In [31]:

    
random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(test[predictors],test["Survived"])

Y_pred = random_forest.predict(titanic[predictors])
Y_pred = Y_pred.astype(int)
random_forest.score(test[predictors],test["Survived"])









    Out[31]:





0.9640852974186308



In [32]:

    
titanic.keys()









    Out[32]:





Index(['Age', 'Embarked', 'Fare', 'PassengerId', 'Survived', 'C', 'Q', 'Sex',
       'Family', 'Child', 'Female', 'Class_1', 'Class_2'],
      dtype='object')



In [33]:

    
submission = pd.DataFrame({
        "PassengerId": titanic["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('titanic.csv', index=False)



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]: