In [1]:
import pandas as pd
import numpy as np
# Load the training CSV. The path is relative, so it is resolved against the
# kernel's current working directory.
df = pd.read_csv('Desktop/titanic/train.csv')
In [2]:
# Show the first 10 rows of the freshly loaded frame.
df.head(10)
Out[2]:
In [3]:
# Remove the Name, Ticket and Cabin columns (axis=1 selects columns).
unused_columns = ['Name', 'Ticket', 'Cabin']
df = df.drop(unused_columns, axis=1)
In [4]:
# Print column dtypes and non-null counts for the remaining columns.
df.info()
In [5]:
# Keep only the rows that have no missing value in any remaining column.
df = df.dropna(axis=0, how='any')
In [6]:
# List the distinct values of Sex; the mapping in the next cell encodes them.
df['Sex'].unique()
Out[6]:
In [7]:
# Encode Sex as an integer Gender column: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
df['Gender'] = df['Sex'].map(sex_codes).astype(int)
In [8]:
# List the distinct values of Embarked; the mapping in the next cell encodes them.
df['Embarked'].unique()
Out[8]:
In [9]:
# Encode Embarked as an integer Port column: C -> 1, S -> 2, Q -> 3.
port_codes = {'C': 1, 'S': 2, 'Q': 3}
df['Port'] = df['Embarked'].map(port_codes).astype(int)
In [10]:
# Sex and Embarked are now represented by Gender and Port, so drop the
# original text columns.
encoded_sources = ['Sex', 'Embarked']
df = df.drop(encoded_sources, axis=1)
In [11]:
# Capture the current column order as a plain list and print it.
cols = list(df.columns)
print(cols)
In [12]:
# Move the target column to the front so it sits at index 0 of the array
# built from this frame. Selecting it by name instead of swapping positions
# 0 and 1 of `cols` keeps the cell idempotent: the positional swap undid
# itself every time the cell was re-run, silently changing which column the
# model is trained on.
# NOTE(review): assumes the target column is named 'Survived' (the column at
# position 1 in the list printed by the previous cell) — confirm.
cols = ['Survived'] + [name for name in df.columns if name != 'Survived']
df = df[cols]
In [13]:
# Show the first 10 rows after cleaning, encoding and reordering.
df.head(10)
Out[13]:
In [14]:
# Print dtypes and non-null counts of the cleaned frame.
df.info()
In [15]:
# Convert the frame to a NumPy array; the fitting cell reads column 0 as the
# target and columns 2 onward as the features.
train_data = df.values
In [27]:
# Use a random forest classifier to predict the target.
from sklearn.ensemble import RandomForestClassifier

# 100 trees. random_state is pinned so that a Restart & Run All rebuilds the
# same forest and therefore the same predictions.
model = RandomForestClassifier(n_estimators=100, random_state=0)

# print() as a function call works on both Python 2 and Python 3; the bare
# `print expr` statement is a SyntaxError on Python 3.
print(np.shape(train_data))
In [17]:
# Fit the forest: column 0 holds the labels and columns 2 onward hold the
# features (column 1 is skipped).
X_train = train_data[:, 2:]
y_train = train_data[:, 0]
model = model.fit(X_train, y_train)
In [28]:
# Load the test CSV. The path is relative, so it is resolved against the
# kernel's current working directory.
df_test = pd.read_csv('Desktop/titanic/test.csv')
In [19]:
# Show the first 10 rows of the freshly loaded test frame.
df_test.head(10)
Out[19]:
In [20]:
# Prepare the test set with the same steps used for the training set:
# drop the unused text columns, drop incomplete rows, encode Sex and Embarked.
df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)
# NOTE(review): dropna() removes every test row that has a missing value, so
# those passengers receive no prediction in the results file — confirm this
# is intended, or impute instead.
df_test = df_test.dropna()
# astype(int) mirrors the training cells and makes an unmapped category fail
# here (NaN cannot be cast to int) instead of reaching model.predict.
df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)
df_test['Port'] = df_test['Embarked'].map({'C': 1, 'S': 2, 'Q': 3}).astype(int)
df_test = df_test.drop(['Sex', 'Embarked'], axis=1)
# Column 0 of this array is later used as the id; the rest are the features.
test_data = df_test.values
In [21]:
# Show the first 10 rows of the prepared test frame.
df_test.head(10)
Out[21]:
In [22]:
# Predict from every column except column 0, which is kept aside as the id.
X_test = test_data[:, 1:]
output = model.predict(X_test)
In [23]:
# Pair each id (column 0 of the test array) with its prediction, both cast to
# int, and wrap the two columns in a frame.
passenger_ids = test_data[:, 0].astype(int)
predictions = output.astype(int)
result = np.column_stack((passenger_ids, predictions))
df_result = pd.DataFrame(result[:, 0:2], columns=['PassengerId', 'Survived'])
In [24]:
# Show the first 10 id/prediction pairs.
df_result.head(10)
Out[24]:
In [25]:
# Write the predictions to CSV without the frame's index column. The path is
# relative, so it is resolved against the kernel's current working directory.
df_result.to_csv('Desktop/titanic/RandomForestresults.csv', index=False)
In [26]:
# Report (rows, columns) of the result frame as a final sanity check.
df_result.shape
Out[26]:
In [ ]: