SkData - Data Specification

SkData provides a data class to structure and organize the preprocessing of data.


In [1]:
from skdata.data import (
    SkDataFrame as DataFrame,
    SkDataSeries as Series
)

In [2]:
import pandas as pd

Importing data


In [3]:
# Read the training CSV, indexed by passenger id, and wrap the result in the
# SkData frame class (imported above as DataFrame).
df_train = DataFrame(
    pd.read_csv(
        filepath_or_buffer='../data/train.csv',
        index_col='PassengerId',
    )
)

In [4]:
# Preview the first rows of the loaded frame to sanity-check columns and index.
df_train.head()


Out[4]:
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [5]:
# Per-column overview; the recorded output lists type, distinct values and
# their count, number of observations, and number of NaN entries.
df_train.summary()


Out[5]:
Types Set Values Count Set # Observations # NaN
Survived int64 [0, 1] 2 891 0
Pclass int64 [1, 2, 3] 3 891 0
Name object ['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ... 891 891 0
Sex object ['female', 'male'] 2 891 0
Age float64 [0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ... 88 714 177
SibSp int64 [0, 1, 2, 3, 4, 5, 8] 7 891 0
Parch int64 [0, 1, 2, 3, 4, 5, 6] 7 891 0
Ticket object ['110152', '110413', '110465', '110564', '1108... 681 891 0
Fare float64 [0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495... 248 891 0
Cabin object ['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A2... 147 204 687
Embarked object ['C', 'Q', 'S'] 3 889 2

Data preparation and cleaning


In [6]:
# Relabel the coded values of Sex with capitalized names. The call is made
# in place on the column selected from df_train.
# NOTE(review): this relies on the selected column propagating the in-place
# change back to df_train; plain pandas does not guarantee that for a column
# selection -- confirm that SkDataFrame handles it.
df_train['Sex'].replace({
    'male': 'Male', 'female': 'Female'
}, inplace=True)

# Expand the single-letter Embarked codes to full names, also in place.
df_train['Embarked'].replace({
    'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'
}, inplace=True)

In [7]:
# Re-run the summary to check that Sex and Embarked now hold the new labels.
df_train.summary()


Out[7]:
Types Set Values Count Set # Observations # NaN
Survived int64 [0, 1] 2 891 0
Pclass int64 [1, 2, 3] 3 891 0
Name object ['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ... 891 891 0
Sex object ['Female', 'Male'] 2 891 0
Age float64 [0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ... 88 714 177
SibSp int64 [0, 1, 2, 3, 4, 5, 8] 7 891 0
Parch int64 [0, 1, 2, 3, 4, 5, 6] 7 891 0
Ticket object ['110152', '110413', '110465', '110564', '1108... 681 891 0
Fare float64 [0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495... 248 891 0
Cabin object ['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A2... 147 204 687
Embarked object ['Cherbourg', 'Queenstown', 'Southampton'] 3 889 2

In [8]:
# Convert the two label columns to the pandas 'category' dtype.
df_train['Sex'] = df_train['Sex'].astype('category')
df_train['Embarked'] = df_train['Embarked'].astype('category')

# NOTE(review): the recorded summary output still reports both columns as
# 'object' after the conversion -- confirm whether summary() is expected to
# show 'category' here.
df_train.summary()


Out[8]:
Types Set Values Count Set # Observations # NaN
Survived int64 [0, 1] 2 891 0
Pclass int64 [1, 2, 3] 3 891 0
Name object ['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ... 891 891 0
Sex object ['Female', 'Male'] 2 891 0
Age float64 [0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ... 88 714 177
SibSp int64 [0, 1, 2, 3, 4, 5, 8] 7 891 0
Parch int64 [0, 1, 2, 3, 4, 5, 6] 7 891 0
Ticket object ['110152', '110413', '110465', '110564', '1108... 681 891 0
Fare float64 [0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495... 248 891 0
Cabin object ['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A2... 147 204 687
Embarked object ['Cherbourg', 'Queenstown', 'Southampton'] 3 889 2

In [9]:
# Readable labels for the integer codes of the Survived and Pclass columns.
survived_dict = {
    0: 'Died',
    1: 'Survived',
}
pclass_dict = {
    1: 'Upper Class',
    2: 'Middle Class',
    3: 'Lower Class',
}

# TODO(review): the mappings are not applied yet; the calls below are disabled.
# df_train['Pclass'].categorize(categories=pclass_dict)
# df_train['Survived'].categorize(categories=survived_dict)

In [10]:
# Show the operations recorded on the frame so far; in the recorded output
# each entry is an (identifier, operation description) pair, one per
# replace/astype call made above.
print('STEPS:')
df_train.steps


STEPS:
Out[10]:
[('42ee5948b8a747eebf1a876eefe97b9f',
  "Sex.replace(*({'male': 'Male', 'female': 'Female'},), **{'inplace': True})"),
 ('479e4e88c309488aba106acd2c002169',
  "Embarked.replace(*({'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'},), **{'inplace': True})"),
 ('0dd537ec90d84b45beea403d5e9a6e7b',
  "Sex = Sex.astype(*('category',), **{})"),
 ('f83f89843289450f90d68a49902000ab',
  "Embarked = Embarked.astype(*('category',), **{})")]