In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for blocking purposes.
In [2]:
# Get the datasets directory shipped with py_entitymatching.
# os.path.join is used instead of manual os.sep concatenation: it is the
# idiomatic form and also handles a trailing separator in the base path.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Get the paths of the input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
In [3]:
# Read the CSV files and set 'ID' as the key attribute.
# NOTE(review): the key metadata is presumably required by the blockers
# used below — confirm against py_entitymatching docs.
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
First, block using a rule-based blocker.
In [4]:
# First get features that can be used for blocking.
# validate_inferred_attr_types=False presumably skips the interactive
# confirmation of the attribute types inferred from A and B — see the
# py_entitymatching documentation for get_features_for_blocking.
feature_table = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)
In [5]:
# Create rule-based blocker
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.8
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.8'], feature_table)
Out[5]:
In [6]:
# Apply the blocker to A x B; keep only the 'name' attribute from each
# input table in the output candidate set E.
E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])
In [7]:
# Display the candidate set produced by blocking
E
Out[7]:
In [8]:
# Debug the blocker output: report up to 5 tuple pairs from A x B that
# presumably look like matches but are missing from E — confirm the exact
# semantics against the em.debug_blocker documentation.
dbg = em.debug_blocker(E, A, B, output_size=5)
In [9]:
# Display the debug-blocker report
dbg
Out[9]:
In [10]:
# Create rule-based blocker --- NOTE: we are creating a new blocker !!!
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
# 0.4 is a looser threshold than the 0.8 used above, so fewer pairs are blocked.
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)
Out[10]:
In [11]:
# Re-apply blocking with the relaxed (0.4) rule; same output attributes as before.
E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])
In [12]:
# Display the candidate set from the relaxed rule
E
Out[12]:
In [13]:
# Re-run the blocker debugger on the new candidate set (up to 5 pairs)
dbg = em.debug_blocker(E, A, B, output_size=5)
In [14]:
# Display the debug-blocker report for the relaxed rule
dbg
Out[14]:
In [ ]: