In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
In [84]:
# Read the training and test sets; both CSV paths are relative to the
# notebook's working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [4]:
# Dimensions of the training frame as a (rows, columns) tuple.
train.shape
Out[4]:
In [6]:
# Preview the first rows of the training frame to see which columns are available.
train.head()
Out[6]:
In [23]:
# Histogram of the Age column. The axes are labelled so the figure can be read
# on its own, and the trailing semicolon keeps the return value of ax.set()
# from being echoed as the cell's output.
ax = train['Age'].hist()
ax.set(title='Distribution of Age in train', xlabel='Age', ylabel='Count');
Out[23]:
In [29]:
# Summary statistics for the Age column.
train['Age'].describe()
Out[29]:
In [32]:
# Rows with Age above 60, restricted to the four columns of interest.
# The row mask and the column list are applied in a single .loc call.
train.loc[train['Age'] > 60, ['Sex', 'Pclass', 'Age', 'Survived']]
Out[32]:
Now I'm starting to see a pattern here. Let's see how many females survived.
In [43]:
# Narrow to rows whose Sex is 'female', then to those with Survived == 1.
# The first entry of the displayed shape is the number of such rows.
females = train.loc[train['Sex'].eq('female')]
females_who_survived = females.loc[females['Survived'].eq(1)]
females_who_survived.shape
Out[43]:
In [42]:
# Narrow to rows whose Sex is 'male', then to those with Survived == 1.
# The first entry of the displayed shape is the number of such rows.
males = train.loc[train['Sex'].eq('male')]
males_who_survived = males.loc[males['Survived'].eq(1)]
males_who_survived.shape
Out[42]:
It looks like the majority of the people who survived were female.
In [44]:
import pylab as pl
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the cell working on installs that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
In [85]:
# Preview the first rows of the test frame for comparison with the training frame.
test.head()
Out[85]:
In [67]:
# Feature columns and target, both restricted to rows where Age is present.
# The same row mask is used for features and target, so they stay aligned.
cols = ['Age', 'Pclass']
notnull_age = train.loc[train['Age'].notnull(), cols]
notnull_survived = train.loc[train['Age'].notnull(), 'Survived']
notnull_age.head()
Out[67]:
In [69]:
# Fit a random forest on the feature frame and target built from rows with a
# known Age. random_state is pinned so that re-running the notebook from a
# fresh kernel grows the same trees and produces the same predictions.
# max_features=2 lets every split consider both columns of notnull_age.
clf = RandomForestClassifier(
    n_estimators=20,
    max_features=2,
    min_samples_split=5,
    random_state=0,
)
clf.fit(notnull_age, notnull_survived)
Out[69]:
In [79]:
# Same feature columns as used for training, keeping only the test rows
# where Age is present.
notnull_test = test.loc[test['Age'].notnull(), cols]
In [82]:
# Predict for the filtered test rows. NOTE: notnull_test excludes test rows
# whose Age is missing, so this yields one prediction per remaining row,
# not one per row of `test`.
clf.predict(notnull_test)
Out[82]:
In [ ]: