In [34]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [35]:
train_data = pd.read_csv('train.csv')

In [36]:
train_data.head()


Out[36]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [37]:
mean_male_age = train_data[(train_data['Age'].notnull()) & (train_data['Sex'] == 'male')]['Age'].mean()
mean_female_age = train_data[(train_data['Age'].notnull()) & (train_data['Sex'] == 'female')]['Age'].mean()

In [38]:
train_data['Age'].loc[(train_data['Age'].isnull()) & (train_data['Sex'] == 'male')] = mean_male_age
train_data['Age'].loc[(train_data['Age'].isnull()) & (train_data['Sex'] == 'female')] = mean_female_age

In [39]:
max_pass_embarked = train_data.groupby('Embarked').count()['PassengerId']
train_data['Embarked'].loc[train_data['Embarked'].isnull()] = 'S'

In [40]:
train_target = train_data['Survived']
train_features = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [41]:
train_features.head()


Out[41]:
Pclass Sex Age SibSp Parch Fare Embarked
0 3 male 22.0 1 0 7.2500 S
1 1 female 38.0 1 0 71.2833 C
2 3 female 26.0 0 0 7.9250 S
3 1 female 35.0 1 0 53.1000 S
4 3 male 35.0 0 0 8.0500 S

In [42]:
label = LabelEncoder()

label.fit(train_features['Sex'].drop_duplicates())
train_features['Sex'] = label.transform(train_features['Sex'])

label.fit(train_features['Embarked'].drop_duplicates())
train_features['Embarked'] = label.transform(train_features['Embarked'])

train_features.head()


/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
Out[42]:
Pclass Sex Age SibSp Parch Fare Embarked
0 3 1 22.0 1 0 7.2500 2
1 1 0 38.0 1 0 71.2833 0
2 3 0 26.0 0 0 7.9250 2
3 1 0 35.0 1 0 53.1000 2
4 3 1 35.0 0 0 8.0500 2

In [43]:
dtc = DecisionTreeClassifier()
dtc.fit(train_features, train_target)


Out[43]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [72]:
test_data = pd.read_csv('test.csv')

In [73]:
test_mean_male_age = test_data[(test_data['Age'].notnull()) & (test_data['Sex'] == 'male')]['Age'].mean()
test_mean_female_age = test_data[(test_data['Age'].notnull()) & (test_data['Sex'] == 'female')]['Age'].mean()

In [74]:
test_data['Age'].loc[(test_data['Age'].isnull()) & (test_data['Sex'] == 'male')] = test_mean_male_age
test_data['Age'].loc[(test_data['Age'].isnull()) & (test_data['Sex'] == 'female')] = test_mean_female_age

In [75]:
test_data['Embarked'].loc[test_data['Embarked'].isnull()] = 'S'

In [77]:
test_features = test_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [78]:
label.fit(test_features['Sex'].drop_duplicates())
test_features['Sex'] = label.transform(test_features['Sex'])

label.fit(test_features['Embarked'].drop_duplicates())
test_features['Embarked'] = label.transform(test_features['Embarked'])


/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """

In [89]:
test_features['Fare'][test_features['Fare'].isnull()] = 35.627188489208635

prediction = dtc.predict(test_features)


/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

In [92]:
result = pd.DataFrame(test_data['PassengerId'])
result['Survived'] = prediction

result.to_csv('RESULT.csv', index=False)