In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('Desktop/titanic/train.csv')

In [2]:
df.head(10)


Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C

In [3]:
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB

In [5]:
df = df.dropna()

In [6]:
df['Sex'].unique()


Out[6]:
array(['male', 'female'], dtype=object)

In [7]:
df['Gender'] = df['Sex'].map({'female': 0, 'male':1}).astype(int)

In [8]:
df['Embarked'].unique()


Out[8]:
array(['S', 'C', 'Q'], dtype=object)

In [9]:
df['Port'] = df['Embarked'].map({'C':1, 'S':2, 'Q':3}).astype(int)

In [10]:
df = df.drop(['Sex', 'Embarked'], axis=1)

In [11]:
cols = df.columns.tolist()
print(cols)


['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Gender', 'Port']

In [12]:
cols = [cols[1]] + cols[0:1] + cols[2:]
df = df[cols]

In [13]:
df.head(10)


Out[13]:
Survived PassengerId Pclass Age SibSp Parch Fare Gender Port
0 0 1 3 22.0 1 0 7.2500 1 2
1 1 2 1 38.0 1 0 71.2833 0 1
2 1 3 3 26.0 0 0 7.9250 0 2
3 1 4 1 35.0 1 0 53.1000 0 2
4 0 5 3 35.0 0 0 8.0500 1 2
6 0 7 1 54.0 0 0 51.8625 1 2
7 0 8 3 2.0 3 1 21.0750 1 2
8 1 9 3 27.0 0 2 11.1333 0 2
9 1 10 2 14.0 1 0 30.0708 0 1
10 1 11 3 4.0 1 1 16.7000 0 2

In [14]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 9 columns):
Survived       712 non-null int64
PassengerId    712 non-null int64
Pclass         712 non-null int64
Age            712 non-null float64
SibSp          712 non-null int64
Parch          712 non-null int64
Fare           712 non-null float64
Gender         712 non-null int64
Port           712 non-null int64
dtypes: float64(2), int64(7)
memory usage: 55.6 KB

In [15]:
train_data = df.values

In [27]:
#Using Random Forest to predict data
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 100)
print np.shape(train_data)


(712, 9)

In [17]:
model = model.fit(train_data[0:,2:],train_data[0:,0])

In [28]:
df_test = pd.read_csv('Desktop/titanic/test.csv')

In [19]:
df_test.head(10)


Out[19]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
5 897 3 Svensson, Mr. Johan Cervin male 14.0 0 0 7538 9.2250 NaN S
6 898 3 Connolly, Miss. Kate female 30.0 0 0 330972 7.6292 NaN Q
7 899 2 Caldwell, Mr. Albert Francis male 26.0 1 1 248738 29.0000 NaN S
8 900 3 Abrahim, Mrs. Joseph (Sophie Halaut Easu) female 18.0 0 0 2657 7.2292 NaN C
9 901 3 Davies, Mr. John Samuel male 21.0 2 0 A/4 48871 24.1500 NaN S

In [20]:
df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

df_test = df_test.dropna()

df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male':1})
df_test['Port'] = df_test['Embarked'].map({'C':1, 'S':2, 'Q':3})

df_test = df_test.drop(['Sex', 'Embarked'], axis=1)

test_data = df_test.values

In [21]:
df_test.head(10)


Out[21]:
PassengerId Pclass Age SibSp Parch Fare Gender Port
0 892 3 34.5 0 0 7.8292 1 3
1 893 3 47.0 1 0 7.0000 0 2
2 894 2 62.0 0 0 9.6875 1 3
3 895 3 27.0 0 0 8.6625 1 2
4 896 3 22.0 1 1 12.2875 0 2
5 897 3 14.0 0 0 9.2250 1 2
6 898 3 30.0 0 0 7.6292 0 3
7 899 2 26.0 1 1 29.0000 1 2
8 900 3 18.0 0 0 7.2292 0 1
9 901 3 21.0 2 0 24.1500 1 2

In [22]:
output = model.predict(test_data[:,1:])

In [23]:
result = np.c_[test_data[:,0].astype(int), output.astype(int)]
df_result = pd.DataFrame(result[:,0:2], columns=['PassengerId', 'Survived'])

In [24]:
df_result.head(10)


Out[24]:
PassengerId Survived
0 892 0
1 893 0
2 894 0
3 895 1
4 896 0
5 897 0
6 898 0
7 899 0
8 900 1
9 901 0

In [25]:
df_result.to_csv('Desktop/titanic/RandomForestresults.csv', index=False)

In [26]:
df_result.shape


Out[26]:
(331, 2)

In [ ]: