The OpenRefine JSON steps file below was used as a reference:
[
{
"op": "core/text-transform",
"description": "Text transform on cells in column Sex using expression grel:if(value == 'male', 1, 0)",
"engineConfig": {
"mode": "row-based",
"facets": []
},
"columnName": "Sex",
"expression": "grel:if(value == 'male', 1, 0)",
"onError": "keep-original",
"repeat": false,
"repeatCount": 10
}
]
Proposed SkData steps format:
[
{
'data-set': 'data_set_name', // required
'operation': 'categorize|text-transform|fill-na|drop-na|drop-unique', // required
'column': 'column_name', // optional
'new-column': 'new_column_name', // optional
'expression': 'dict|inline-if' // required
}
]
In [1]:
from IPython.display import display
from functools import reduce
# local
from skdata.cleaning import *
from skdata.data import summary
import json
import pandas as pd
In [2]:
def replace(value: str, replace_dict: dict):
    """
    Replace every occurrence of each key of ``replace_dict`` found in
    ``value`` with the corresponding dictionary value.

    All substitutions happen in a single pass over the original text, so
    the text inserted by one replacement is never rewritten by another key
    and the result does not depend on the dictionary order. When several
    keys match at the same position, the longest key wins.

    Parameters
    ----------
    value : str
        Text to transform. Any non-string input (for example a missing
        value such as NaN or None) is returned unchanged.
    replace_dict : dict
        Mapping of ``substring -> replacement``. Empty-string keys are
        ignored because they cannot match any text.

    Returns
    -------
    str
        The transformed text, or ``value`` itself when it is not a string
        or there is nothing to replace.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import cell.
    import re

    if not isinstance(value, str):
        return value

    # Longest keys first so that an overlapping shorter key cannot shadow a
    # longer one in the alternation (e.g. 'a' vs 'ab').
    keys = sorted((key for key in replace_dict if key), key=len, reverse=True)
    if not keys:
        return value

    pattern = re.compile('|'.join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: replace_dict[match.group(0)], value)
In [3]:
def expr(dataset: pd.DataFrame, step: dict):
    """
    Apply a single cleaning step to ``dataset`` and return the data set.

    Parameters
    ----------
    dataset : pd.DataFrame
        Data set the step is applied to.
    step : dict
        Step description with the following keys:

        * ``'operation'`` (required): one of ``'text-transform'``,
          ``'categorize'``, ``'fill-na'``, ``'drop-na'`` or
          ``'drop-unique'``.
        * ``'column'`` (optional): name of the column to operate on.
        * ``'new-column'`` (optional): name of the column receiving the
          result; defaults to ``'column'``.
        * ``'expression'`` (required): meaning depends on the operation
          (see the comments on each branch below).

    Returns
    -------
    pd.DataFrame
        The same ``dataset`` object that was passed in.

    Raises
    ------
    ValueError
        If ``step['operation']`` is not one of the supported operations.
    """
    # aliases
    op = step['operation']
    k = step.get('column')
    k_new = step.get('new-column', k)
    c_expr = step['expression']

    if op == 'text-transform':
        # The expression is the body of ``lambda value: ...`` applied to
        # every cell of the column.
        # SECURITY: eval() executes arbitrary Python; only run steps that
        # come from a trusted source.
        f_expr = eval('lambda value: %s' % c_expr)
        dataset[k_new] = dataset[k].apply(f_expr)
    elif op == 'categorize':
        # The expression evaluates to the ``categories`` argument.
        # SECURITY: eval() on the expression, same caveat as above.
        # NOTE(review): categorize is presumably provided by the
        # ``skdata.cleaning`` star import and its return value is
        # discarded, so it is assumed to work in place -- confirm.
        params = dict(data=dataset, col_name=k, categories=eval(c_expr))
        if 'new-column' in step:
            params['new_col_name'] = k_new
        categorize(**params)
    elif op == 'fill-na':
        # The expression is either the name of an aggregate computed on the
        # column, or the literal fill value itself (kept as a string).
        fill = c_expr
        if c_expr in ('mean', 'max', 'min', 'median'):
            # Call the Series method directly: building a DataFrame.eval
            # string fails for column names that are not valid identifiers
            # (e.g. names containing '-').
            fill = getattr(dataset[k], c_expr)()
        # Assign the result back instead of a chained ``inplace=True``
        # call, which does not update the frame under pandas Copy-on-Write.
        dataset[k] = dataset[k].fillna(fill)
    elif op == 'drop-na':
        # The expression evaluates to the keyword arguments of dropna.
        # SECURITY: eval() on the expression, same caveat as above.
        # NOTE(review): the return value is discarded, so dropna is assumed
        # to modify ``dataset`` in place -- confirm.
        params = eval(c_expr)
        dropna(dataset, **params)
    elif op == 'drop-unique':
        # The expression evaluates to the keyword arguments of
        # drop_columns_with_unique_values.
        # SECURITY: eval() on the expression, same caveat as above.
        # NOTE(review): assumed to modify ``dataset`` in place -- confirm.
        params = eval(c_expr)
        drop_columns_with_unique_values(dataset, **params)
    else:
        # Fail loudly on a misspelled operation instead of silently
        # returning the data set untouched.
        raise ValueError('Unknown operation: %r' % (op,))
    return dataset
In [4]:
# Example pipeline: a JSON array of step objects. Every step names the
# target "data-set", the "operation" to run and its "expression"; "column"
# and "new-column" appear only on the steps that use them.
steps_json = '''[{
"data-set": "data",
"operation": "text-transform",
"column": "Sex",
"expression": "value.title()"
}, {
"data-set": "data",
"operation": "categorize",
"column": "Pclass",
"new-column": "Pclass-name",
"expression": "{1: 'High Class', 2: 'Middle Class', 3: 'Low Class'}"
}, {
"data-set": "data",
"operation": "text-transform",
"column": "Embarked",
"expression": "replace(value, {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'})"
}, {
"data-set": "data",
"operation": "fill-na",
"column": "Age",
"expression": "mean"
}, {
"data-set": "data",
"operation": "drop-na",
"expression": "{'axis': 1, 'max_na_values': 0.1}"
}, {
"data-set": "data",
"operation": "drop-na",
"expression": "{'axis': 0}"
}, {
"data-set": "data",
"operation": "drop-unique",
"expression": "{'max_unique_values': 0.25}"
}]
'''
In [5]:
def compute(steps, start: int=None, end: int=None, steps_id: list=None):
    """
    Run a selection of steps against the data sets they name.

    Each step's ``'data-set'`` entry is the name of a module-level variable;
    that variable is looked up in ``globals()``, passed through ``expr``
    together with the step, and rebound to whatever ``expr`` returns.

    Parameters
    ----------
    steps : list
        Sequence of step dictionaries.
    start, end : int, optional
        Slice bounds applied to ``steps``; used only when ``steps_id`` is
        None.
    steps_id : list, optional
        Positions of the steps to run. When given, ``start`` and ``end``
        are ignored and the selected steps run in their ``steps`` order.

    Returns
    -------
    None
    """
    if steps_id is None:
        selected = steps[start:end]
    else:
        selected = [
            step for position, step in enumerate(steps)
            if position in steps_id
        ]

    namespace = globals()
    for step in selected:
        target = step['data-set']
        namespace[target] = expr(namespace[target], step)
In [6]:
# Load the raw data set; the steps refer to it by the global name "data".
data = pd.read_csv('../data/train.csv')
# Parse the JSON pipeline definition into a list of step dictionaries.
steps = json.loads(steps_json)
# Summary of the data before any step is applied.
display(summary(data))
# No start/end/steps_id given, so every step runs; compute rebinds the
# global "data" to the result of each step.
compute(steps)
# Inspect the data after the steps have run.
display(data.head())
display(summary(data))