Introduction

This IPython notebook illustrates how to perform blocking using rule-based blocker.

First, we need to import py_entitymatching package and other libraries as follows:



In [1]:

    
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

Then, read the (sample) input tables for blocking purposes.



In [2]:

    
# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'

# Get the paths of the input tables
path_A = datasets_dir + os.sep + 'person_table_A.csv'
path_B = datasets_dir + os.sep + 'person_table_B.csv'



In [3]:

    
# Read the CSV files and set 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')



In [4]:

    
A.head()









    Out[4]:







  
    
      
      ID
      name
      birth_year
      hourly_wage
      address
      zipcode
    
  
  
    
      0
      a1
      Kevin Smith
      1989
      30.0
      607 From St, San Francisco
      94107
    
    
      1
      a2
      Michael Franklin
      1988
      27.5
      1652 Stockton St, San Francisco
      94122
    
    
      2
      a3
      William Bridge
      1986
      32.0
      3131 Webster St, San Francisco
      94107
    
    
      3
      a4
      Binto George
      1987
      32.5
      423 Powell St, San Francisco
      94122
    
    
      4
      a5
      Alphonse Kemper
      1984
      35.0
      1702 Post Street, San Francisco
      94122



In [5]:

    
B.head()









    Out[5]:







  
    
      
      ID
      name
      birth_year
      hourly_wage
      address
      zipcode
    
  
  
    
      0
      b1
      Mark Levene
      1987
      29.5
      108 Clement St, San Francisco
      94107
    
    
      1
      b2
      Bill Bridge
      1986
      32.0
      3131 Webster St, San Francisco
      94107
    
    
      2
      b3
      Mike Franklin
      1988
      27.5
      1652 Stockton St, San Francisco
      94122
    
    
      3
      b4
      Joseph Kuan
      1982
      26.0
      108 South Park, San Francisco
      94122
    
    
      4
      b5
      Alfons Kemper
      1984
      35.0
      170 Post St, Apt 4,  San Francisco
      94122

Generating Features for Blocking



In [6]:

    
block_f = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)



In [7]:

    
block_f









    Out[7]:







  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      0
      ID_ID_lev_dist
      ID
      ID
      None
      None
      lev_dist
      <function ID_ID_lev_dist at 0x1191d6f28>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      1
      ID_ID_lev_sim
      ID
      ID
      None
      None
      lev_sim
      <function ID_ID_lev_sim at 0x1192432f0>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      2
      ID_ID_jar
      ID
      ID
      None
      None
      jaro
      <function ID_ID_jar at 0x119243400>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      3
      ID_ID_jwn
      ID
      ID
      None
      None
      jaro_winkler
      <function ID_ID_jwn at 0x119243378>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      4
      ID_ID_exm
      ID
      ID
      None
      None
      exact_match
      <function ID_ID_exm at 0x119243488>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      5
      ID_ID_jac_qgm_3_qgm_3
      ID
      ID
      qgm_3
      qgm_3
      jaccard
      <function ID_ID_jac_qgm_3_qgm_3 at 0x119243510>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      6
      name_name_jac_qgm_3_qgm_3
      name
      name
      qgm_3
      qgm_3
      jaccard
      <function name_name_jac_qgm_3_qgm_3 at 0x119243598>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      7
      name_name_cos_dlm_dc0_dlm_dc0
      name
      name
      dlm_dc0
      dlm_dc0
      cosine
      <function name_name_cos_dlm_dc0_dlm_dc0 at 0x119243620>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      8
      name_name_jac_dlm_dc0_dlm_dc0
      name
      name
      dlm_dc0
      dlm_dc0
      jaccard
      <function name_name_jac_dlm_dc0_dlm_dc0 at 0x1192436a8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      9
      name_name_mel
      name
      name
      None
      None
      monge_elkan
      <function name_name_mel at 0x119243730>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      10
      name_name_lev_dist
      name
      name
      None
      None
      lev_dist
      <function name_name_lev_dist at 0x1192437b8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      11
      name_name_lev_sim
      name
      name
      None
      None
      lev_sim
      <function name_name_lev_sim at 0x119243840>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      12
      name_name_nmw
      name
      name
      None
      None
      needleman_wunsch
      <function name_name_nmw at 0x1192438c8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      13
      name_name_sw
      name
      name
      None
      None
      smith_waterman
      <function name_name_sw at 0x119243950>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      14
      birth_year_birth_year_exm
      birth_year
      birth_year
      None
      None
      exact_match
      <function birth_year_birth_year_exm at 0x1192439d8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      15
      birth_year_birth_year_anm
      birth_year
      birth_year
      None
      None
      abs_norm
      <function birth_year_birth_year_anm at 0x119243a60>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      16
      birth_year_birth_year_lev_dist
      birth_year
      birth_year
      None
      None
      lev_dist
      <function birth_year_birth_year_lev_dist at 0x119243ae8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      17
      birth_year_birth_year_lev_sim
      birth_year
      birth_year
      None
      None
      lev_sim
      <function birth_year_birth_year_lev_sim at 0x119243b70>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      18
      hourly_wage_hourly_wage_exm
      hourly_wage
      hourly_wage
      None
      None
      exact_match
      <function hourly_wage_hourly_wage_exm at 0x119243bf8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      19
      hourly_wage_hourly_wage_anm
      hourly_wage
      hourly_wage
      None
      None
      abs_norm
      <function hourly_wage_hourly_wage_anm at 0x119243c80>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      20
      hourly_wage_hourly_wage_lev_dist
      hourly_wage
      hourly_wage
      None
      None
      lev_dist
      <function hourly_wage_hourly_wage_lev_dist at 0x119243d08>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      21
      hourly_wage_hourly_wage_lev_sim
      hourly_wage
      hourly_wage
      None
      None
      lev_sim
      <function hourly_wage_hourly_wage_lev_sim at 0x119243d90>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      22
      zipcode_zipcode_exm
      zipcode
      zipcode
      None
      None
      exact_match
      <function zipcode_zipcode_exm at 0x119243e18>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      23
      zipcode_zipcode_anm
      zipcode
      zipcode
      None
      None
      abs_norm
      <function zipcode_zipcode_anm at 0x119243ea0>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      24
      zipcode_zipcode_lev_dist
      zipcode
      zipcode
      None
      None
      lev_dist
      <function zipcode_zipcode_lev_dist at 0x119243f28>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      25
      zipcode_zipcode_lev_sim
      zipcode
      zipcode
      None
      None
      lev_sim
      <function zipcode_zipcode_lev_sim at 0x119254048>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True



In [8]:

    
em._block_c['corres']









    Out[8]:





[('ID', 'ID'),
 ('name', 'name'),
 ('birth_year', 'birth_year'),
 ('hourly_wage', 'hourly_wage'),
 ('address', 'address'),
 ('zipcode', 'zipcode')]



In [9]:

    
em._atypes1['birth_year'], em._atypes1['hourly_wage'], em._atypes1['name'], em._atypes1['zipcode']









    Out[9]:





('numeric', 'numeric', 'str_bt_1w_5w', 'numeric')



In [10]:

    
em._atypes2['birth_year'], em._atypes2['hourly_wage'], em._atypes2['name'], em._atypes2['zipcode']









    Out[10]:





('numeric', 'numeric', 'str_bt_1w_5w', 'numeric')

Different Ways to Block Using Rule Based Blocker

There are three different ways to do overlap blocking:

Block two tables to produce a candidate set of tuple pairs.
Block a candidate set of tuple pairs to typically produce a reduced candidate set of tuple pairs.
Block two tuples to check if a tuple pair would get blocked.

Block Tables to Produce a Candidate Set of Tuple Pairs



In [11]:

    
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)









    Out[11]:





'_rule_0'



In [12]:

    
C = rb.block_tables(A, B, l_output_attrs=['name', 'address'], r_output_attrs=['name', 'address'], show_progress=False)



In [13]:

    
C.head()









    Out[13]:







  
    
      
      _id
      ltable_ID
      rtable_ID
      ltable_name
      ltable_address
      rtable_name
      rtable_address
    
  
  
    
      0
      0
      a2
      b3
      Michael Franklin
      1652 Stockton St, San Francisco
      Mike Franklin
      1652 Stockton St, San Francisco
    
    
      1
      1
      a2
      b6
      Michael Franklin
      1652 Stockton St, San Francisco
      Michael Brodie
      133 Clement Street, San Francisco
    
    
      2
      2
      a3
      b2
      William Bridge
      3131 Webster St, San Francisco
      Bill Bridge
      3131 Webster St, San Francisco
    
    
      3
      3
      a3
      b6
      William Bridge
      3131 Webster St, San Francisco
      Michael Brodie
      133 Clement Street, San Francisco
    
    
      4
      4
      a4
      b2
      Binto George
      423 Powell St, San Francisco
      Bill Bridge
      3131 Webster St, San Francisco

Block Candidate Set



In [14]:

    
rb = em.RuleBasedBlocker()
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)









    Out[14]:





'_rule_0'



In [15]:

    
D = rb.block_candset(C, show_progress=False)



In [16]:

    
D.head()









    Out[16]:







  
    
      
      _id
      ltable_ID
      rtable_ID
      ltable_name
      ltable_address
      rtable_name
      rtable_address
    
  
  
    
      0
      0
      a2
      b3
      Michael Franklin
      1652 Stockton St, San Francisco
      Mike Franklin
      1652 Stockton St, San Francisco
    
    
      2
      2
      a3
      b2
      William Bridge
      3131 Webster St, San Francisco
      Bill Bridge
      3131 Webster St, San Francisco
    
    
      5
      5
      a5
      b5
      Alphonse Kemper
      1702 Post Street, San Francisco
      Alfons Kemper
      170 Post St, Apt 4,  San Francisco

Block Two tuples To Check If a Tuple Pair Would Get Blocked



In [18]:

    
A.loc[[0]]









    Out[18]:







  
    
      
      ID
      name
      birth_year
      hourly_wage
      address
      zipcode
    
  
  
    
      0
      a1
      Kevin Smith
      1989
      30.0
      607 From St, San Francisco
      94107



In [19]:

    
B.loc[[1]]









    Out[19]:







  
    
      
      ID
      name
      birth_year
      hourly_wage
      address
      zipcode
    
  
  
    
      1
      b2
      Bill Bridge
      1986
      32.0
      3131 Webster St, San Francisco
      94107



In [20]:

    
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev(ltuple, rtuple) < 0.4
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], block_f)
rb.add_rule(['birth_year_birth_year_exm(ltuple, rtuple) == 0'], block_f)









    Out[20]:





'_rule_1'



In [22]:

    
status = rb.block_tuples(A.loc[0], B.loc[0])
print(status)









    



True



In [ ]:

	ID	name	birth_year	hourly_wage	address	zipcode
0	a1	Kevin Smith	1989	30.0	607 From St, San Francisco	94107
1	a2	Michael Franklin	1988	27.5	1652 Stockton St, San Francisco	94122
2	a3	William Bridge	1986	32.0	3131 Webster St, San Francisco	94107
3	a4	Binto George	1987	32.5	423 Powell St, San Francisco	94122
4	a5	Alphonse Kemper	1984	35.0	1702 Post Street, San Francisco	94122

	ID	name	birth_year	hourly_wage	address	zipcode
0	b1	Mark Levene	1987	29.5	108 Clement St, San Francisco	94107
1	b2	Bill Bridge	1986	32.0	3131 Webster St, San Francisco	94107
2	b3	Mike Franklin	1988	27.5	1652 Stockton St, San Francisco	94122
3	b4	Joseph Kuan	1982	26.0	108 South Park, San Francisco	94122
4	b5	Alfons Kemper	1984	35.0	170 Post St, Apt 4, San Francisco	94122

	feature_name	left_attribute	right_attribute	left_attr_tokenizer	right_attr_tokenizer	simfunction	function	function_source	is_auto_generated
0	ID_ID_lev_dist	ID	ID	None	None	lev_dist	<function ID_ID_lev_dist at 0x1191d6f28>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
1	ID_ID_lev_sim	ID	ID	None	None	lev_sim	<function ID_ID_lev_sim at 0x1192432f0>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
2	ID_ID_jar	ID	ID	None	None	jaro	<function ID_ID_jar at 0x119243400>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
3	ID_ID_jwn	ID	ID	None	None	jaro_winkler	<function ID_ID_jwn at 0x119243378>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
4	ID_ID_exm	ID	ID	None	None	exact_match	<function ID_ID_exm at 0x119243488>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
5	ID_ID_jac_qgm_3_qgm_3	ID	ID	qgm_3	qgm_3	jaccard	<function ID_ID_jac_qgm_3_qgm_3 at 0x119243510>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
6	name_name_jac_qgm_3_qgm_3	name	name	qgm_3	qgm_3	jaccard	<function name_name_jac_qgm_3_qgm_3 at 0x119243598>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
7	name_name_cos_dlm_dc0_dlm_dc0	name	name	dlm_dc0	dlm_dc0	cosine	<function name_name_cos_dlm_dc0_dlm_dc0 at 0x119243620>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
8	name_name_jac_dlm_dc0_dlm_dc0	name	name	dlm_dc0	dlm_dc0	jaccard	<function name_name_jac_dlm_dc0_dlm_dc0 at 0x1192436a8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
9	name_name_mel	name	name	None	None	monge_elkan	<function name_name_mel at 0x119243730>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
10	name_name_lev_dist	name	name	None	None	lev_dist	<function name_name_lev_dist at 0x1192437b8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
11	name_name_lev_sim	name	name	None	None	lev_sim	<function name_name_lev_sim at 0x119243840>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
12	name_name_nmw	name	name	None	None	needleman_wunsch	<function name_name_nmw at 0x1192438c8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
13	name_name_sw	name	name	None	None	smith_waterman	<function name_name_sw at 0x119243950>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
14	birth_year_birth_year_exm	birth_year	birth_year	None	None	exact_match	<function birth_year_birth_year_exm at 0x1192439d8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
15	birth_year_birth_year_anm	birth_year	birth_year	None	None	abs_norm	<function birth_year_birth_year_anm at 0x119243a60>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
16	birth_year_birth_year_lev_dist	birth_year	birth_year	None	None	lev_dist	<function birth_year_birth_year_lev_dist at 0x119243ae8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
17	birth_year_birth_year_lev_sim	birth_year	birth_year	None	None	lev_sim	<function birth_year_birth_year_lev_sim at 0x119243b70>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
18	hourly_wage_hourly_wage_exm	hourly_wage	hourly_wage	None	None	exact_match	<function hourly_wage_hourly_wage_exm at 0x119243bf8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
19	hourly_wage_hourly_wage_anm	hourly_wage	hourly_wage	None	None	abs_norm	<function hourly_wage_hourly_wage_anm at 0x119243c80>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
20	hourly_wage_hourly_wage_lev_dist	hourly_wage	hourly_wage	None	None	lev_dist	<function hourly_wage_hourly_wage_lev_dist at 0x119243d08>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
21	hourly_wage_hourly_wage_lev_sim	hourly_wage	hourly_wage	None	None	lev_sim	<function hourly_wage_hourly_wage_lev_sim at 0x119243d90>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
22	zipcode_zipcode_exm	zipcode	zipcode	None	None	exact_match	<function zipcode_zipcode_exm at 0x119243e18>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
23	zipcode_zipcode_anm	zipcode	zipcode	None	None	abs_norm	<function zipcode_zipcode_anm at 0x119243ea0>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
24	zipcode_zipcode_lev_dist	zipcode	zipcode	None	None	lev_dist	<function zipcode_zipcode_lev_dist at 0x119243f28>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
25	zipcode_zipcode_lev_sim	zipcode	zipcode	None	None	lev_sim	<function zipcode_zipcode_lev_sim at 0x119254048>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True