SkData Steps

The JSON format that OpenRefine uses to describe its operations (steps) was used as a reference:

[
  {
    "op": "core/text-transform",
    "description": "Text transform on cells in column Sex using expression grel:if(value == 'male', 1, 0)",
    "engineConfig": {
      "mode": "row-based",
      "facets": []
    },
    "columnName": "Sex",
    "expression": "grel:if(value == 'male', 1, 0)",
    "onError": "keep-original",
    "repeat": false,
    "repeatCount": 10
  }
]

Proposed SkData steps format (pseudo-JSON: the `//` comments are annotations only and are not part of the format):

[
  {
    'data-set': 'data_set_name', // required
    'operation': 'categorize|text-transform|fill-na|drop-na|drop-unique', // required
    'column': 'column_name',  // optional
    'new-column': 'new_column_name', // optional
    'expression': 'dict|inline-if' // required
  }
]

In [1]:
from IPython.display import display
from functools import reduce
# local
from skdata.cleaning import *
from skdata.data import summary

import json
import pandas as pd


/home/xmn/miniconda3/envs/skdata/lib/python3.6/site-packages/odo/backends/pandas.py:102: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access NaTType as type(pandas.NaT)
  @convert.register((pd.Timestamp, pd.Timedelta), (pd.tslib.NaTType, type(None)))

In [2]:
def replace(value: str, replace_dict: dict):
    """
    Replace, inside ``value``, every occurrence of each key of
    ``replace_dict`` by the text mapped to that key.

    The substitutions are applied one after another, following the
    iteration order of ``replace_dict``, so the output of one substitution
    is visible to the next one.

    Parameters
    ----------
    value : str
        Text to transform. Any non-string value (e.g. ``None`` or ``NaN``)
        is returned untouched.
    replace_dict : dict
        Mapping ``{old_text: new_text}``.

    Returns
    -------
    str
        The transformed text, or ``value`` itself when it is not a string.
    """
    if isinstance(value, str):
        result = value
        for old_text, new_text in replace_dict.items():
            result = result.replace(old_text, new_text)
        return result

    # non-string cells (missing values, numbers, ...) pass through unchanged
    return value

In [3]:
def expr(dataset: pd.DataFrame, step: dict):
    """
    Apply a single cleaning step to ``dataset`` and return it.

    Parameters
    ----------
    dataset : pd.DataFrame
        Data set the step is applied to. ``text-transform`` and ``fill-na``
        assign columns on this very object; the other operations hand it
        over to the corresponding cleaning helper.
    step : dict
        Step description with the keys:

        * ``operation`` (required): one of ``text-transform``,
          ``categorize``, ``fill-na``, ``drop-na`` or ``drop-unique``;
        * ``expression`` (required): its meaning depends on the operation
          (see the inline comments below);
        * ``column`` (optional): name of the column the step works on;
        * ``new-column`` (optional): name of the column that receives the
          result (used by ``text-transform`` and ``categorize``); defaults
          to ``column``.

    Returns
    -------
    pd.DataFrame
        The same ``dataset`` object that was passed in.

    Raises
    ------
    KeyError
        If ``operation`` or ``expression`` is missing from ``step``.
    ValueError
        If ``operation`` is not one of the supported operations.
    """
    # aliases
    op = step['operation']
    k = step.get('column')
    k_new = step.get('new-column', k)
    c_expr = step['expression']

    # SECURITY NOTE: several branches below run ``eval`` on the step
    # expression, which executes arbitrary Python code. Steps must come
    # from a trusted source only.

    if op == 'text-transform':
        # expression: Python expression computed for each cell, where the
        # cell content is available as ``value``
        f_expr = eval('lambda value: %s' % c_expr)
        dataset[k_new] = dataset[k].apply(f_expr)

    elif op == 'categorize':
        # expression: Python literal handed to ``categorize`` as
        # ``categories``
        params = dict(data=dataset, col_name=k, categories=eval(c_expr))
        if 'new-column' in step:
            params['new_col_name'] = k_new
        categorize(**params)

    elif op == 'fill-na':
        # expression: the fill value itself, or the name of an aggregation
        # computed on the column
        fill = c_expr
        if c_expr in ('mean', 'max', 'min', 'median'):
            # attribute access (instead of ``DataFrame.eval``) also works
            # for column names that are not valid Python identifiers
            fill = getattr(dataset[k], c_expr)()
        # assign the result back instead of ``fillna(inplace=True)`` on the
        # selected column: the chained in-place form is deprecated and does
        # not update ``dataset`` when pandas Copy-on-Write is enabled
        dataset[k] = dataset[k].fillna(fill)

    elif op == 'drop-na':
        # expression: dict of keyword arguments for ``dropna``
        params = eval(c_expr)
        dropna(dataset, **params)

    elif op == 'drop-unique':
        # expression: dict of keyword arguments for
        # ``drop_columns_with_unique_values``
        params = eval(c_expr)
        drop_columns_with_unique_values(dataset, **params)

    else:
        # a misspelled operation used to be skipped silently
        raise ValueError('Unknown operation: %r' % (op,))

    return dataset

In [4]:
# Example list of steps, written as JSON. Every step targets the data set
# named "data"; the list is parsed with ``json.loads`` and executed by
# ``compute`` further down. The "expression" values are Python source text
# that ``expr`` evaluates (or, for "fill-na", an aggregation name).
steps_json = '''[{
    "data-set": "data",
    "operation": "text-transform",
    "column": "Sex",
    "expression": "value.title()"
}, {
    "data-set": "data",
    "operation": "categorize",
    "column": "Pclass",
    "new-column": "Pclass-name",
    "expression": "{1: 'High Class', 2: 'Middle Class', 3: 'Low Class'}"
}, {
    "data-set": "data",
    "operation": "text-transform",
    "column": "Embarked",
    "expression": "replace(value, {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'})"
}, {
    "data-set": "data",
    "operation": "fill-na",
    "column": "Age",
    "expression": "mean"
}, {
    "data-set": "data",
    "operation": "drop-na",
    "expression": "{'axis': 1, 'max_na_values': 0.1}"
}, {
    "data-set": "data",
    "operation": "drop-na",
    "expression": "{'axis': 0}"
}, {
    "data-set": "data",
    "operation": "drop-unique",
    "expression": "{'max_unique_values': 0.25}"
}]
'''

In [5]:
def compute(
    steps, start: int=None, end: int=None, steps_id: list=None,
    namespace: dict=None
):
    """
    Run a selection of steps against the data sets they refer to.

    Each step names its data set through the ``data-set`` key. The data set
    is looked up in ``namespace``, transformed by ``expr`` and the result is
    stored back under the same name.

    Parameters
    ----------
    steps : list
        List of step dictionaries (see ``expr``).
    start : int, optional
        Index of the first step to run; used only when ``steps_id`` is not
        given.
    end : int, optional
        Index after the last step to run; used only when ``steps_id`` is
        not given.
    steps_id : list, optional
        Positions of the steps to run. The steps are always executed in the
        order they appear in ``steps``, whatever the order of ``steps_id``.
        When given, ``start`` and ``end`` are ignored.
    namespace : dict, optional
        Mapping ``{data set name: data set}`` used to resolve and store the
        data sets. Defaults to the global namespace of the module where this
        function is defined (the notebook namespace when it is defined in a
        notebook cell).

    Raises
    ------
    KeyError
        If a step refers to a data set that is not present in the namespace.
    """
    registry = globals() if namespace is None else namespace

    if steps_id is not None:
        # set membership: one pass over the steps, whatever len(steps_id)
        wanted = set(steps_id)
        selected = [s for i, s in enumerate(steps) if i in wanted]
    else:
        selected = steps[start:end]

    for step in selected:
        name = step['data-set']
        if name not in registry:
            raise KeyError('Data set %r was not found.' % (name,))
        registry[name] = expr(registry[name], step)

In [6]:
# Load the raw data set. The variable must be called ``data``: the steps
# refer to it by that name and ``compute`` resolves it through globals().
data = pd.read_csv('../data/train.csv')
steps = json.loads(steps_json)

# summary of the data set before the steps are applied
display(summary(data))

# run every step; ``compute`` rebinds the global ``data`` to the result
compute(steps)
    
# first rows and summary of the data set after the steps were applied
display(data.head())
display(summary(data))


Types Set Values Count Set # Observations # NaN
PassengerId int64 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... 891 891 0
Survived int64 [0, 1] 2 891 0
Pclass int64 [1, 2, 3] 3 891 0
Name object ['Abbing, Mr. Anthony', 'Abbott, Mr. Rossmore ... 891 891 0
Sex object ['female', 'male'] 2 891 0
Age float64 [0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ... 88 714 177
SibSp int64 [0, 1, 2, 3, 4, 5, 8] 7 891 0
Parch int64 [0, 1, 2, 3, 4, 5, 6] 7 891 0
Ticket object ['110152', '110413', '110465', '110564', '1108... 681 891 0
Fare float64 [0.0, 4.0125, 5.0, 6.2375, 6.4375, 6.45, 6.495... 248 891 0
Cabin object ['A10', 'A14', 'A16', 'A19', 'A20', 'A23', 'A2... 147 204 687
Embarked object ['C', 'Q', 'S'] 3 889 2
Survived Pclass Sex Age SibSp Parch Embarked Pclass-name
0 0 3 Male 22.0 1 0 Southampton Low Class
1 1 1 Female 38.0 1 0 Cherbourg High Class
2 1 3 Female 26.0 0 0 Southampton Low Class
3 1 1 Female 35.0 1 0 Southampton High Class
4 0 3 Male 35.0 0 0 Southampton Low Class
Types Set Values Count Set # Observations # NaN
Survived int64 [0, 1] 2 889 0
Pclass int64 [1, 2, 3] 3 889 0
Sex object ['Female', 'Male'] 2 889 0
Age float64 [0.42, 0.67, 0.75, 0.83, 0.92, 1.0, 2.0, 3.0, ... 89 889 0
SibSp int64 [0, 1, 2, 3, 4, 5, 8] 7 889 0
Parch int64 [0, 1, 2, 3, 4, 5, 6] 7 889 0
Embarked object ['Cherbourg', 'Queenstown', 'Southampton'] 3 889 0
Pclass-name category ['High Class', 'Low Class', 'Middle Class'] 3 889 0