In [47]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.grid_search import GridSearchCV
# SKLearn Model Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression , Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
# SKLearn ensemble classifiers
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier , BaggingClassifier
from sklearn.ensemble import VotingClassifier , AdaBoostClassifier
# SKLearn Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV
# Handle table-like data and matrices
import numpy as np
import pandas as pd
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# plot functions
import plotFunctions as pfunc
import dataFunctions as dfunc
# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6
In [2]:
# Load the raw data into a single DataFrame `full` (project helper;
# presumably concatenates the Kaggle train and test CSVs — confirm in dataFunctions).
full = dfunc.loadData()
In [3]:
# Encode 'Sex' numerically. The meaning of the boolean flag is not visible
# here — TODO confirm against dataFunctions.convertSexToNum.
full = dfunc.convertSexToNum(full, True)
In [4]:
# Derive passenger titles from the Name column (project helper).
full = dfunc.fillTitles(full)
In [5]:
# Impute missing Age values (project helper).
# NOTE(review): a commented-out alternative call to dfunc.fillMissingAge was
# removed here — fillAge is the variant actually in use.
full = dfunc.fillAge(full)
In [6]:
# Impute missing Fare value(s) (project helper).
full = dfunc.fillMissingFare(full)
In [7]:
# Build engineered features (project helper; exact features defined in dataFunctions).
full = dfunc.featureEng(full)
In [8]:
# Impute missing Embarked values (project helper).
full = dfunc.fillEmbarked(full)
In [9]:
# NOTE(review): fillTitles was already applied in an earlier cell — confirm this
# second call is intentional (presumably idempotent, but verify in dataFunctions).
full = dfunc.fillTitles(full)
In [10]:
# Derive ticket-based features (project helper).
full = dfunc.fillTicket(full)
In [11]:
# Derive cabin-deck features (project helper).
full = dfunc.fillDeck(full)
In [13]:
# Drop raw/intermediate text columns and the target column from the feature
# matrix, in one call. The original used the positional axis argument
# (full.drop('Cabin', 1)), which was deprecated in pandas 1.x and removed
# in pandas 2.0 — drop(columns=...) is the supported spelling.
full = full.drop(columns=['Cabin', 'Name', 'Title', 'TicketType', 'Deck', 'Survived'])
In [14]:
# Split the engineered frame into train features, Kaggle-test features and the
# target vector (project helper), then build the baseline model.
train_X, test_X, target_y = pfunc.prepareTrainTestTarget(full)
# random_state pins the forest's bootstrap/feature sampling so feature
# importances and scores are reproducible under Restart-&-Run-All.
model = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)
In [44]:
# NOTE(review): this dumps the entire target vector into the output — prefer
# target_y.head() / target_y.shape to keep the notebook readable.
print(target_y)
In [15]:
# Fit the baseline random forest on the full training set.
model.fit( train_X , target_y )
Out[15]:
In [16]:
# Rank features by the fitted forest's impurity-based importances.
# Built as one chained expression instead of incremental column assignment
# plus inplace=True (inplace kills chaining and causes hidden-state bugs
# when cells are re-run out of order).
features = (
    pd.DataFrame({
        'feature': train_X.columns,
        'importance': model.feature_importances_,
    })
    .sort_values(by='importance', ascending=True)
    .set_index('feature')
)
In [17]:
# Horizontal bar chart of feature importances (most important at the top,
# since the frame was sorted ascending before plotting).
features.plot(kind='barh', figsize=(20, 20))
Out[17]:
In [67]:
# Tuned hyper-parameters — presumably from an earlier grid search (the
# GridSearchCV import above is otherwise unused); TODO confirm provenance.
parameters = {'bootstrap': True,
              'max_depth': 6,
              # 'auto' was deprecated in sklearn 1.1 and removed in 1.3 for
              # forests; 'sqrt' is the exact classifier equivalent.
              'max_features': 'sqrt',
              'min_samples_leaf': 1,
              'min_samples_split': 10,
              'n_estimators': 100,
              'n_jobs': -1,
              # pin randomness so CV scores below are reproducible
              'random_state': 42,
              }
model = RandomForestClassifier(**parameters)
In [62]:
# Convert frames/series to plain numpy arrays for positional KFold indexing.
# np.asarray is idempotent, so re-running this cell is safe — the original
# `train_X = train_X.values` raised AttributeError on a second run because
# an ndarray has no .values attribute.
train_X = np.asarray(train_X)
test_X = np.asarray(test_X)
target_y = np.asarray(target_y)
In [68]:
# 10-fold cross-validation of the tuned model. Each fold refits `model`
# in place, so after the loop `model` holds the fit from the *last* fold —
# that is the model the prediction cell below uses.
# (Removed: a commented-out StratifiedKFold line and a dangling
# `type(train_X)` expression that had no effect.)
crossv = KFold(n_splits=10)
print(crossv)
for train_index, test_index in crossv.split(train_X):
    X_train = train_X[train_index]
    X_test = train_X[test_index]
    y_train = target_y[train_index]
    y_test = target_y[test_index]
    model.fit(X_train, y_train)
    # project helper — presumably reports the fold accuracy; confirm in plotFunctions
    pfunc.calcScore(model, X_test, y_test, scoring='accuracy')
Out[68]:
In [69]:
# Predict on the held-out Kaggle test set with the last-fold model, casting
# the 0/1 predictions to int for the submission file.
output = model.predict(test_X).astype(int)
In [70]:
# NOTE(review): prints the whole prediction array — output[:10] would suffice.
print(output)
In [71]:
# Submission frame, filled in over the next cells.
df_out = pd.DataFrame()
In [72]:
# NOTE(review): dumps the entire feature matrix — prefer test_X[:5] or .shape.
print(test_X)
In [73]:
# Reload the raw test CSV to recover the PassengerId column for the
# submission file (presumably not retained in the engineered features).
aux = pd.read_csv('./input/test.csv')
In [75]:
# Attach passenger ids from the raw test file.
df_out['PassengerId']=aux['PassengerId']
In [79]:
# Attach the predictions. Assign the int array directly — the original wrapped
# it in pd.DataFrame(output.tolist()), which only worked through incidental
# RangeIndex alignment and added needless indirection.
df_out['Survived'] = output
In [81]:
# Sanity-check the submission frame before writing it out
# (NOTE(review): prefer df_out.head() over printing all rows).
print(df_out)
In [82]:
# Write the Kaggle submission file (columns PassengerId,Survived; no index column).
df_out[['PassengerId','Survived']].to_csv('./output/output.csv',index=False)
In [ ]: