SkData provides a data class for structuring and organizing data preprocessing.
In [1]:
from skdata.data import (
SkDataFrame as DataFrame,
SkDataSeries as Series
)
In [2]:
import pandas as pd
In [3]:
# Read the training CSV, using the 'PassengerId' column as the index,
# then wrap the resulting pandas frame in SkData's DataFrame class.
raw_train = pd.read_csv('../data/train.csv', index_col='PassengerId')
df_train = DataFrame(raw_train)
In [4]:
# Quick look at the freshly loaded training frame.
df_train.head()
Out[4]:
In [5]:
# Show the frame's summary before any column is modified.
df_train.summary()
Out[5]:
In [6]:
# Mappings from the raw codes found in the CSV to readable labels.
sex_labels = {'male': 'Male', 'female': 'Female'}
embarked_labels = {
    'C': 'Cherbourg',
    'Q': 'Queenstown',
    'S': 'Southampton',
}

# Apply both mappings in place on the selected columns.
# NOTE(review): on a plain pandas DataFrame, an in-place replace on a selected
# column may not write back to the parent under copy-on-write -- confirm that
# SkDataFrame/SkDataSeries propagate this change as intended.
df_train['Sex'].replace(sex_labels, inplace=True)
df_train['Embarked'].replace(embarked_labels, inplace=True)
In [7]:
# Show the summary again, now that the label replacements have been applied.
df_train.summary()
Out[7]:
In [8]:
# Convert the two relabelled columns to the 'category' dtype, one at a time.
for column in ('Sex', 'Embarked'):
    df_train[column] = df_train[column].astype('category')

# Re-display the summary after the conversion.
df_train.summary()
Out[8]:
In [9]:
# Readable labels for the integer codes of the 'Survived' column.
survived_dict = {
    0: 'Died',
    1: 'Survived',
}

# Readable labels for the integer codes of the 'Pclass' column.
pclass_dict = {
    1: 'Upper Class',
    2: 'Middle Class',
    3: 'Lower Class',
}

# The calls below are disabled in the original notebook; the dictionaries
# above are therefore defined but not yet applied in this cell.
# df_train['Pclass'].categorize(categories=pclass_dict)
# df_train['Survived'].categorize(categories=survived_dict)
In [10]:
# Print a heading, then leave the frame's `steps` attribute as the last
# expression so the notebook renders it as the cell output.
print('STEPS:')
df_train.steps
Out[10]: