This example is a pandas-ml
translation of Kaggle's "Getting Started With Python II" tutorial. To run this Jupyter Notebook, you must first download the data into the titanic_data
directory.
https://www.kaggle.com/c/titanic/details/getting-started-with-python-ii
In [1]:
import os
import numpy as np
import pandas as pd
# Keep rendered frames short: cap the number of rows pandas displays at 6.
pd.options.display.max_rows = 6
In [2]:
import pandas_ml as pdml
In [3]:
# Read the two CSV files from the data directory; the first row is the
# header and the first column becomes the index.
data_dir = 'titanic_data'

# Wrap each frame in a pdml.ModelFrame. Only the training frame declares a
# target column ('Survived'); the test frame is wrapped without one.
train_df = pdml.ModelFrame(
    pd.read_csv(os.path.join(data_dir, 'train.csv'), header=0, index_col=0),
    target='Survived',
)
test_df = pdml.ModelFrame(
    pd.read_csv(os.path.join(data_dir, 'test.csv'), header=0, index_col=0)
)
train_df
Out[3]:
In [4]:
# Display the target of the training ModelFrame
# (declared as 'Survived' when the frame was constructed).
train_df.target
Out[4]:
In [5]:
# Display the data columns of the training ModelFrame.
# NOTE(review): presumably every column except the target — confirm against pandas-ml docs.
train_df.data
Out[5]:
In [6]:
# Derive an integer 'Gender' column from 'Sex': female -> 0, male -> 1.
sex_to_code = {'female': 0, 'male': 1}
train_df['Gender'] = train_df['Sex'].replace(sex_to_code)
train_df['Gender']
Out[6]:
In [7]:
# Fill missing embarkation ports with the most frequent port label.
# idxmax() returns the index *label* of the largest count. argmax() returns an
# integer position on pandas >= 1.0, which would fill the gaps with a number
# instead of a port.
if train_df['Embarked'].isnull().any():
    most_common_port = train_df['Embarked'].value_counts().idxmax()
    train_df['Embarked'] = train_df['Embarked'].fillna(most_common_port)

# Build the port label -> integer code lookup; the test-set preparation
# applies this same mapping so both frames share one encoding.
ports_factor, ports = train_df['Embarked'].factorize()
ports_dict = {port: code for code, port in enumerate(ports)}
ports_dict
Out[7]:
In [8]:
# Swap each port label in 'Embarked' for its integer code from ports_dict.
train_df['Embarked'] = train_df['Embarked'].replace(ports_dict)
In [9]:
# Impute missing ages with the median of the 'Age' column.
age_missing = train_df['Age'].isnull()
if age_missing.any():
    train_df.loc[age_missing, 'Age'] = train_df['Age'].median()

# Drop the text columns that are not used as model inputs
# ('Sex' has already been encoded into 'Gender').
unused_columns = ['Name', 'Sex', 'Ticket', 'Cabin']
train_df = train_df.drop(unused_columns, axis=1)
train_df
Out[9]:
In [10]:
# Apply the same feature preparation to the test frame.

# Derive an integer 'Gender' column from 'Sex': female -> 0, male -> 1.
test_df['Gender'] = test_df['Sex'].replace({'female': 0, 'male': 1})

# Fill missing ports with the most frequent label in the test frame.
# idxmax() returns the label itself; argmax() returns an integer position on
# pandas >= 1.0 and would fill the gaps with a number.
if test_df['Embarked'].isnull().any():
    most_common_port = test_df['Embarked'].value_counts().idxmax()
    test_df['Embarked'] = test_df['Embarked'].fillna(most_common_port)
# Reuse the label -> code mapping built from the training frame so both
# frames share one encoding.
test_df['Embarked'] = test_df['Embarked'].replace(ports_dict)

# Impute missing ages with the median age of the test frame.
if test_df['Age'].isnull().any():
    test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

# Impute each missing fare with the median fare of the passenger's class
# (Pclass 1, 2 or 3).
fare_missing = test_df['Fare'].isnull()
if fare_missing.any():
    for pclass in (1, 2, 3):
        in_class = test_df['Pclass'] == pclass
        # The median is taken before this class is filled, so it reflects
        # only the fares that were actually recorded.
        class_median = test_df.loc[in_class, 'Fare'].median()
        test_df.loc[fare_missing & in_class, 'Fare'] = class_median

# Drop the text columns that are not used as model inputs
# ('Sex' has already been encoded into 'Gender').
test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin'], axis=1)
test_df
Out[10]:
In [11]:
# Training...
# Build a 100-tree random forest through the frame's `ensemble` accessor.
# A fixed random_state makes the fitted forest, and therefore the predictions
# below, repeatable across "Restart & Run All".
forest = train_df.ensemble.RandomForestClassifier(n_estimators=100, random_state=0)
# NOTE(review): the rebinding assumes ModelFrame.fit returns the fitted
# estimator — confirm against the pandas-ml documentation.
forest = train_df.fit(forest)
forest
Out[11]:
In [12]:
# Predicting...
# Run the fitted forest on the prepared test frame and keep the result.
predicted = test_df.predict(forest)
predicted
Out[12]:
In [13]:
# Show the predictions with the index moved into an ordinary column.
# NOTE(review): presumably the index is the one carried over from test_df
# (first column of test.csv via index_col=0) — confirm.
predicted.reset_index()
Out[13]:
In [14]:
# Count how many rows received each predicted value.
predicted.value_counts()
Out[14]: