In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import sklearn as sk
import matplotlib.pyplot as plt
%matplotlib inline
from __future__ import division
from collections import Counter
In [2]:
# Read the training data
raw = pd.read_csv('../input/train.csv')
In [3]:
raw.head()
Out[3]:
In [4]:
# Check which columns have missing values
raw.apply(lambda x: sum(x.isnull())/len(x))
Out[4]:
In [5]:
raw.info()
In [6]:
# Look at estimating the missing ages: group Age by the passenger's title
# (the second token of Name, e.g. 'Mr.', 'Mrs.', 'Miss.')
ageData = raw.groupby([x.split()[1] for x in raw.Name]).Age
# raw[raw.Age.isnull()]
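In [ ]:
# A minimal sketch (not part of the original flow) of how the title-grouped
# ages above could fill the missing values: take the median age per title,
# falling back to the overall median for titles with no known ages.
titles = raw['Name'].str.split().str[1]
age_by_title = raw['Age'].groupby(titles).transform('median')
age_filled = raw['Age'].fillna(age_by_title).fillna(raw['Age'].median())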
In [7]:
def print_cats(df):
    # Print the value counts of the main categorical columns
    for col in ['Pclass', 'Sex', 'Embarked']:
        print(col)
        print(Counter(df[col]).most_common())
In [8]:
print_cats(raw)
In [9]:
def clean(raw):
    cleaned = raw.copy()
    # Fill missing Ages with the overall median age
    cleaned.loc[cleaned['Age'].isnull(), ['Age']] = np.nanmedian(raw['Age'])
    # Fill missing Embarked with the most common port
    cleaned.loc[cleaned['Embarked'].isnull(), ['Embarked']] = Counter(raw['Embarked']).most_common(1)[0][0]
    # Take the group median (by passenger class) for missing Fares
    data = cleaned.groupby('Pclass')['Fare']
    cleaned['Fare'] = data.transform(lambda x: x.fillna(x.median()))
    # Binary indicator features
    cleaned['IsChild'] = 1.0*(cleaned['Age'] < 20)
    cleaned['IsFemale'] = 1.0*(cleaned['Sex'] == 'female')
    cleaned['IsUpperClass'] = 1.0*(cleaned['Pclass'] == 1)
    cleaned['HighFareBucket'] = 1.0*(cleaned['Fare'] >= np.percentile(cleaned['Fare'], 50))
    return cleaned
In [10]:
cleaned = clean(raw)
cleaned.apply(lambda x: sum(x.isnull())/len(x))
Out[10]:
In [14]:
# Age histograms for non-survivors (Survived=0) and survivors (Survived=1)
plt.hist([cleaned['Age'][cleaned['Survived']==0],cleaned['Age'][cleaned['Survived']==1]])
Out[14]:
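In [ ]:
# A follow-up sketch (assumption, not in the original notebook): survival rate
# per age bin, which can be easier to read than the overlaid histograms above.
age_bins = pd.cut(cleaned['Age'], bins=[0, 10, 20, 30, 40, 50, 60, 80])
cleaned.groupby(age_bins)['Survived'].mean()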
In [15]:
print_cats(cleaned)
In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
In [17]:
nTrain = 891  # number of rows in the training set
features = ['IsFemale', 'IsChild', 'Pclass', 'Fare']
In [18]:
cleaned.groupby(features)['Survived'].mean()
Out[18]:
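In [ ]:
# A variant of the table above (assumption): grouping on the binary
# HighFareBucket instead of the continuous Fare keeps the number of groups small.
cleaned.groupby(['IsFemale', 'IsChild', 'Pclass', 'HighFareBucket'])['Survived'].mean()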
In [19]:
# clf = MultinomialNB()
clf = RandomForestClassifier()
clf.fit(cleaned[features].values, cleaned['Survived'].values)
Out[19]:
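In [ ]:
# A quick look (not in the original) at which features the fitted forest weighs
# most, using the standard feature_importances_ attribute.
sorted(zip(features, clf.feature_importances_), key=lambda t: -t[1])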
In [20]:
confusion_matrix(cleaned.Survived.values,clf.predict(cleaned[features].values))
Out[20]:
In [21]:
accuracy_score(cleaned.Survived.values,clf.predict(cleaned[features].values), normalize=True)
Out[21]:
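In [ ]:
# Training-set accuracy above is optimistic for a random forest; a minimal
# cross-validation sketch (assumes sklearn >= 0.18 for model_selection).
from sklearn.model_selection import cross_val_score
scores = cross_val_score(RandomForestClassifier(), cleaned[features].values,
                         cleaned['Survived'].values, cv=5)
print(scores.mean(), scores.std())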
In [22]:
# Predict the test set
test = pd.read_csv('../input/test.csv')
cleaned_test = clean(test)
cleaned_test['Survived'] = clf.predict(cleaned_test[features].values )
In [23]:
cleaned_test[['PassengerId', 'Survived']].describe()
Out[23]:
In [24]:
cleaned_test.to_csv('random_forest_prediction_with_pclass.csv', columns=['PassengerId', 'Survived'], index=False)
In [ ]: