Introduction

This IPython notebook illustrates how to perform blocking using rule-based blocker.

First, we need to import py_entitymatching package and other libraries as follows:


In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

Then, read the (sample) input tables for blocking purposes.


In [2]:
# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'

# Get the paths of the input tables
path_A = datasets_dir + os.sep + 'person_table_A.csv'
path_B = datasets_dir + os.sep + 'person_table_B.csv'

In [3]:
# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')

In [4]:
A.head()


Out[4]:
ID name birth_year hourly_wage address zipcode
0 a1 Kevin Smith 1989 30.0 607 From St, San Francisco 94107
1 a2 Michael Franklin 1988 27.5 1652 Stockton St, San Francisco 94122
2 a3 William Bridge 1986 32.0 3131 Webster St, San Francisco 94107
3 a4 Binto George 1987 32.5 423 Powell St, San Francisco 94122
4 a5 Alphonse Kemper 1984 35.0 1702 Post Street, San Francisco 94122

In [5]:
B.head()


Out[5]:
ID name birth_year hourly_wage address zipcode
0 b1 Mark Levene 1987 29.5 108 Clement St, San Francisco 94107
1 b2 Bill Bridge 1986 32.0 3131 Webster St, San Francisco 94107
2 b3 Mike Franklin 1988 27.5 1652 Stockton St, San Francisco 94122
3 b4 Joseph Kuan 1982 26.0 108 South Park, San Francisco 94122
4 b5 Alfons Kemper 1984 35.0 170 Post St, Apt 4, San Francisco 94122

Generating Features for Blocking


In [6]:
block_f = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)

In [7]:
block_f


Out[7]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source is_auto_generated
0 ID_ID_lev_dist ID ID None None lev_dist <function ID_ID_lev_dist at 0x1191d6f28> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
1 ID_ID_lev_sim ID ID None None lev_sim <function ID_ID_lev_sim at 0x1192432f0> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
2 ID_ID_jar ID ID None None jaro <function ID_ID_jar at 0x119243400> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
3 ID_ID_jwn ID ID None None jaro_winkler <function ID_ID_jwn at 0x119243378> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
4 ID_ID_exm ID ID None None exact_match <function ID_ID_exm at 0x119243488> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
5 ID_ID_jac_qgm_3_qgm_3 ID ID qgm_3 qgm_3 jaccard <function ID_ID_jac_qgm_3_qgm_3 at 0x119243510> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
6 name_name_jac_qgm_3_qgm_3 name name qgm_3 qgm_3 jaccard <function name_name_jac_qgm_3_qgm_3 at 0x119243598> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
7 name_name_cos_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 cosine <function name_name_cos_dlm_dc0_dlm_dc0 at 0x119243620> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
8 name_name_jac_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 jaccard <function name_name_jac_dlm_dc0_dlm_dc0 at 0x1192436a8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
9 name_name_mel name name None None monge_elkan <function name_name_mel at 0x119243730> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
10 name_name_lev_dist name name None None lev_dist <function name_name_lev_dist at 0x1192437b8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
11 name_name_lev_sim name name None None lev_sim <function name_name_lev_sim at 0x119243840> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
12 name_name_nmw name name None None needleman_wunsch <function name_name_nmw at 0x1192438c8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
13 name_name_sw name name None None smith_waterman <function name_name_sw at 0x119243950> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
14 birth_year_birth_year_exm birth_year birth_year None None exact_match <function birth_year_birth_year_exm at 0x1192439d8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
15 birth_year_birth_year_anm birth_year birth_year None None abs_norm <function birth_year_birth_year_anm at 0x119243a60> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
16 birth_year_birth_year_lev_dist birth_year birth_year None None lev_dist <function birth_year_birth_year_lev_dist at 0x119243ae8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
17 birth_year_birth_year_lev_sim birth_year birth_year None None lev_sim <function birth_year_birth_year_lev_sim at 0x119243b70> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
18 hourly_wage_hourly_wage_exm hourly_wage hourly_wage None None exact_match <function hourly_wage_hourly_wage_exm at 0x119243bf8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
19 hourly_wage_hourly_wage_anm hourly_wage hourly_wage None None abs_norm <function hourly_wage_hourly_wage_anm at 0x119243c80> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
20 hourly_wage_hourly_wage_lev_dist hourly_wage hourly_wage None None lev_dist <function hourly_wage_hourly_wage_lev_dist at 0x119243d08> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
21 hourly_wage_hourly_wage_lev_sim hourly_wage hourly_wage None None lev_sim <function hourly_wage_hourly_wage_lev_sim at 0x119243d90> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
22 zipcode_zipcode_exm zipcode zipcode None None exact_match <function zipcode_zipcode_exm at 0x119243e18> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
23 zipcode_zipcode_anm zipcode zipcode None None abs_norm <function zipcode_zipcode_anm at 0x119243ea0> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
24 zipcode_zipcode_lev_dist zipcode zipcode None None lev_dist <function zipcode_zipcode_lev_dist at 0x119243f28> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
25 zipcode_zipcode_lev_sim zipcode zipcode None None lev_sim <function zipcode_zipcode_lev_sim at 0x119254048> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True

In [8]:
em._block_c['corres']


Out[8]:
[('ID', 'ID'),
 ('name', 'name'),
 ('birth_year', 'birth_year'),
 ('hourly_wage', 'hourly_wage'),
 ('address', 'address'),
 ('zipcode', 'zipcode')]

In [9]:
em._atypes1['birth_year'], em._atypes1['hourly_wage'], em._atypes1['name'], em._atypes1['zipcode']


Out[9]:
('numeric', 'numeric', 'str_bt_1w_5w', 'numeric')

In [10]:
em._atypes2['birth_year'], em._atypes2['hourly_wage'], em._atypes2['name'], em._atypes2['zipcode']


Out[10]:
('numeric', 'numeric', 'str_bt_1w_5w', 'numeric')

Different Ways to Block Using Rule Based Blocker

There are three different ways to do overlap blocking:

  1. Block two tables to produce a candidate set of tuple pairs.
  2. Block a candidate set of tuple pairs to typically produce a reduced candidate set of tuple pairs.
  3. Block two tuples to check if a tuple pair would get blocked.

Block Tables to Produce a Candidate Set of Tuple Pairs


In [11]:
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)


Out[11]:
'_rule_0'

In [12]:
C = rb.block_tables(A, B, l_output_attrs=['name', 'address'], r_output_attrs=['name', 'address'], show_progress=False)

In [13]:
C.head()


Out[13]:
_id ltable_ID rtable_ID ltable_name ltable_address rtable_name rtable_address
0 0 a2 b3 Michael Franklin 1652 Stockton St, San Francisco Mike Franklin 1652 Stockton St, San Francisco
1 1 a2 b6 Michael Franklin 1652 Stockton St, San Francisco Michael Brodie 133 Clement Street, San Francisco
2 2 a3 b2 William Bridge 3131 Webster St, San Francisco Bill Bridge 3131 Webster St, San Francisco
3 3 a3 b6 William Bridge 3131 Webster St, San Francisco Michael Brodie 133 Clement Street, San Francisco
4 4 a4 b2 Binto George 423 Powell St, San Francisco Bill Bridge 3131 Webster St, San Francisco

Block Candidate Set


In [14]:
rb = em.RuleBasedBlocker()
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)


Out[14]:
'_rule_0'

In [15]:
D = rb.block_candset(C, show_progress=False)

In [16]:
D.head()


Out[16]:
_id ltable_ID rtable_ID ltable_name ltable_address rtable_name rtable_address
0 0 a2 b3 Michael Franklin 1652 Stockton St, San Francisco Mike Franklin 1652 Stockton St, San Francisco
2 2 a3 b2 William Bridge 3131 Webster St, San Francisco Bill Bridge 3131 Webster St, San Francisco
5 5 a5 b5 Alphonse Kemper 1702 Post Street, San Francisco Alfons Kemper 170 Post St, Apt 4, San Francisco

Block Two tuples To Check If a Tuple Pair Would Get Blocked


In [18]:
A.loc[[0]]


Out[18]:
ID name birth_year hourly_wage address zipcode
0 a1 Kevin Smith 1989 30.0 607 From St, San Francisco 94107

In [19]:
B.loc[[1]]


Out[19]:
ID name birth_year hourly_wage address zipcode
1 b2 Bill Bridge 1986 32.0 3131 Webster St, San Francisco 94107

In [20]:
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)


Out[20]:
'_rule_1'

In [22]:
status = rb.block_tuples(A.loc[0], B.loc[0])
print(status)


True

In [ ]: