In [34]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
In [35]:
# Read the training set from the current working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
In [36]:
# Preview the first five rows of the freshly loaded training frame.
train_data.head(n=5)
Out[36]:
In [37]:
# Average age per sex, computed only over passengers whose age is recorded.
# These two values are used by the next cell to fill in the missing ages.
mean_male_age = train_data.loc[
    train_data['Age'].notnull() & (train_data['Sex'] == 'male'), 'Age'
].mean()
mean_female_age = train_data.loc[
    train_data['Age'].notnull() & (train_data['Sex'] == 'female'), 'Age'
].mean()
In [38]:
# Fill missing ages with the mean age of the passenger's sex.
# The write goes through a single train_data.loc[rows, 'Age'] call. The earlier
# train_data['Age'].loc[rows] = ... form is chained assignment: pandas warns about
# it and, with Copy-on-Write enabled, it leaves train_data unchanged.
train_data.loc[
    train_data['Age'].isnull() & (train_data['Sex'] == 'male'), 'Age'
] = mean_male_age
train_data.loc[
    train_data['Age'].isnull() & (train_data['Sex'] == 'female'), 'Age'
] = mean_female_age
In [39]:
# Number of passengers per embarkation port (non-null PassengerId per group).
max_pass_embarked = train_data.groupby('Embarked').count()['PassengerId']
# Fill missing ports with the port that has the most passengers, read from the
# counts above instead of a hard-coded letter. The assignment uses
# train_data.loc[rows, 'Embarked'] so it modifies train_data itself rather than
# a temporary Series (chained assignment).
train_data.loc[train_data['Embarked'].isnull(), 'Embarked'] = max_pass_embarked.idxmax()
In [40]:
# Split the training frame into the label column and the model inputs.
train_target = train_data['Survived']
# .copy() makes train_features an independent frame: later cells overwrite its
# 'Sex' and 'Embarked' columns, which on a plain column selection triggers
# SettingWithCopyWarning.
train_features = train_data[
    ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
].copy()
In [41]:
# Preview the selected feature columns before encoding.
train_features.head(n=5)
Out[41]:
In [42]:
# Turn the two text columns into integer codes for the classifier.
# The codes are computed from the untouched string columns of train_data, and the
# feature frame is rebuilt with .assign(). This avoids writing into a slice
# (SettingWithCopyWarning) and keeps the cell safe to re-run, because it never
# re-encodes values that are already encoded.
label = LabelEncoder()
sex_codes = label.fit(train_data['Sex']).transform(train_data['Sex'])
# As before, `label` ends up fitted on 'Embarked'; later cells reuse this name.
embarked_codes = label.fit(train_data['Embarked']).transform(train_data['Embarked'])
train_features = train_features.assign(Sex=sex_codes, Embarked=embarked_codes)
train_features.head()
Out[42]:
In [43]:
# Fit a decision tree on the encoded training features.
# random_state pins the estimator's internal randomness so that re-running the
# notebook produces the same tree and therefore the same predictions.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(train_features, train_target)
Out[43]:
In [72]:
# Read the evaluation set from the current working directory.
test_data = pd.read_csv(filepath_or_buffer='test.csv')
In [73]:
# Average age per sex in the test file, over passengers whose age is recorded.
# These two values are used by the next cell to fill in the missing test ages.
test_mean_male_age = test_data.loc[
    test_data['Age'].notnull() & (test_data['Sex'] == 'male'), 'Age'
].mean()
test_mean_female_age = test_data.loc[
    test_data['Age'].notnull() & (test_data['Sex'] == 'female'), 'Age'
].mean()
In [74]:
# Fill missing test ages with the per-sex mean computed in the previous cell.
# test_data.loc[rows, 'Age'] writes into test_data directly. The earlier
# test_data['Age'].loc[rows] = ... form is chained assignment, which pandas warns
# about and which has no effect on test_data under Copy-on-Write.
test_data.loc[
    test_data['Age'].isnull() & (test_data['Sex'] == 'male'), 'Age'
] = test_mean_male_age
test_data.loc[
    test_data['Age'].isnull() & (test_data['Sex'] == 'female'), 'Age'
] = test_mean_female_age
In [75]:
# Fill missing test ports with the busiest port in the training data, taken from
# the per-port counts in max_pass_embarked rather than a hard-coded letter.
# A single test_data.loc[rows, 'Embarked'] assignment is used so the write lands
# in test_data itself and not in a temporary Series (chained assignment).
test_data.loc[test_data['Embarked'].isnull(), 'Embarked'] = max_pass_embarked.idxmax()
In [77]:
# Select the same feature columns, in the same order, that the model was trained on.
# .copy() gives an independent frame: later cells overwrite and fill columns of
# test_features, which on a plain column selection triggers SettingWithCopyWarning.
test_features = test_data[
    ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
].copy()
In [78]:
# Encode the test text columns with the mapping learned from the TRAINING columns.
# Fitting the encoder on the test values instead can assign different integer codes
# whenever the two files do not contain exactly the same categories, and the tree
# would then misinterpret the feature. With the training fit, an unseen test
# category raises an error instead of being silently mis-encoded.
# train_data still holds the original strings; only train_features was encoded.
test_features = test_features.assign(
    Sex=label.fit(train_data['Sex']).transform(test_data['Sex']),
    Embarked=label.fit(train_data['Embarked']).transform(test_data['Embarked']),
)
In [89]:
# Replace missing fares, then predict survival for every test passenger.
# NOTE(review): this constant looks like a precomputed mean fare — confirm its origin.
fare_fill_value = 35.627188489208635
# fillna with a {column: value} mapping touches only 'Fare' and returns a new frame.
# The earlier test_features['Fare'][mask] = ... form is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
test_features = test_features.fillna({'Fare': fare_fill_value})
prediction = dtc.predict(test_features)
In [92]:
# Build the submission table: passenger id plus the predicted label, then write
# it to disk without the index column.
result = test_data[['PassengerId']].assign(Survived=prediction)
result.to_csv('RESULT.csv', index=False)