In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for blocking purposes.
In [2]:
# Get the datasets directory under the py_entitymatching install path.
# os.path.join is used instead of manual os.sep concatenation so that
# separator handling is left to the standard library.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Get the paths of the input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
In [3]:
# Read the CSV files and set 'ID' as the key attribute.
# The results are used like pandas DataFrames in later cells (.head(), .loc).
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
In [4]:
# Preview the first few rows of table A
A.head()
Out[4]:
In [5]:
# Preview the first few rows of table B
B.head()
Out[5]:
In [6]:
# Generate the features used for blocking from tables A and B.
# NOTE(review): validate_inferred_attr_types=False presumably skips the
# confirmation of the inferred attribute types — confirm against the library docs.
block_f = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)
In [7]:
# Display the features returned by get_features_for_blocking
block_f
Out[7]:
In [8]:
# Inspect the 'corres' entry of em._block_c.
# NOTE(review): _block_c is underscore-prefixed module state; presumably it holds
# the attribute correspondences set by get_features_for_blocking — confirm.
em._block_c['corres']
Out[8]:
In [9]:
# Inspect the em._atypes1 entries for four attributes.
# NOTE(review): presumably the attribute types inferred for table A — confirm.
em._atypes1['birth_year'], em._atypes1['hourly_wage'], em._atypes1['name'], em._atypes1['zipcode']
Out[9]:
In [10]:
# Inspect the em._atypes2 entries for the same four attributes.
# NOTE(review): presumably the attribute types inferred for table B — confirm.
em._atypes2['birth_year'], em._atypes2['hourly_wage'], em._atypes2['name'], em._atypes2['zipcode']
Out[10]:
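There are three different ways to do rule-based blocking:
1. Block two tables to produce a candidate set of tuple pairs.
2. Block a candidate set of tuple pairs to typically produce a reduced candidate set of tuple pairs.
3. Block two tuples to check if a tuple pair would get blocked.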
In [11]:
# Create a rule-based blocker
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)
Out[11]:
In [12]:
# Block tables A and B with the rule held by rb, requesting 'name' and
# 'address' as output attributes from both the left and the right table.
C = rb.block_tables(
    A,
    B,
    l_output_attrs=['name', 'address'],
    r_output_attrs=['name', 'address'],
    show_progress=False,
)
In [13]:
# Preview the first few rows of the candidate set C
C.head()
Out[13]:
In [14]:
# Create a fresh rule-based blocker (rebinds rb)
rb = em.RuleBasedBlocker()
# Add rule : block tuples if birth_year_birth_year_exm(ltuple, rtuple) == 0
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)
Out[14]:
In [15]:
# Apply the birth-year rule held by rb to the candidate set C
D = rb.block_candset(C, show_progress=False)
In [16]:
# Preview the first few rows of the candidate set D
D.head()
Out[16]:
In [18]:
# Show the row of A with index label 0 (list indexer, so it is displayed as a one-row table)
A.loc[[0]]
Out[18]:
In [19]:
# Show the row of B with index label 0, i.e. the row that is passed to
# rb.block_tuples(A.loc[0], B.loc[0]) further down (list indexer, so it is
# displayed as a one-row table).
B.loc[[0]]
Out[19]:
In [20]:
# Create a fresh rule-based blocker (rebinds rb) and register two rules on it
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)
# Add rule : block tuples if birth_year_birth_year_exm(ltuple, rtuple) == 0
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)
Out[20]:
In [22]:
# Apply the blocker's rules to a single tuple pair: row 0 of A and row 0 of B
status = rb.block_tuples(A.loc[0], B.loc[0])
# NOTE(review): presumably True means the pair gets blocked — confirm against the library docs
print(status)
In [ ]: