In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for blocking purposes.
In [5]:
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join inserts the platform separator and avoids doubled separators.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the paths of the two sample input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
In [6]:
# Read each CSV file into a table (A and B) and register 'ID' as the key
# attribute of both tables
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')
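There are three different ways to do black box blocking:
1. Block two tables to produce a candidate set of tuple pairs.
2. Block a candidate set of tuple pairs to typically produce a reduced candidate set of tuple pairs.
3. Block two tuples to check whether the tuple pair would get blocked.

First, define a black box function.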
In [18]:
def address_address_function(x, y):
    """Black box blocking rule: block a tuple pair whose cities differ.

    The city is taken to be the text after the last comma of the
    'address' attribute, ignoring surrounding whitespace.

    Args:
        x: First tuple (e.g. a pandas Series) with a string 'address' entry.
        y: Second tuple, with the same layout as ``x``.

    Returns:
        bool: True if the two cities differ (the pair should be blocked),
        False if they are the same.
    """
    # Get the address attribute of each tuple
    x_address = x['address']
    y_address = y['address']
    # The city is the last comma-separated field; strip whitespace so that
    # 'Main St, Madison' and 'Main St,Madison' name the same city
    x_city = x_address.split(',')[-1].strip()
    y_city = y_address.split(',')[-1].strip()
    # Block the pair (return True) only when the cities do not match
    return x_city != y_city
In [22]:
# Instantiate a black box blocker
bb = em.BlackBoxBlocker()
# Register address_address_function as the function the blocker applies
# to each tuple pair
bb.set_black_box_function(address_address_function)
In [23]:
# Block tables A and B with the black box blocker, carrying the 'name' and
# 'address' attributes of both input tables into the candidate set
C = bb.block_tables(
    A, B,
    l_output_attrs=['name', 'address'],
    r_output_attrs=['name', 'address']
)
In [24]:
# Display the candidate set produced by blocking tables A and B
C
Out[24]:
In [25]:
def name_name_function(x, y):
    """Black box blocking rule: block a tuple pair whose last names differ.

    The last name is the final whitespace-separated token of the 'name'
    attribute, so names with a middle name ('William Henry Gates') and
    single-token names are handled as well as 'First Last' names.

    Args:
        x: First tuple (e.g. a pandas Series) with a string 'name' entry.
        y: Second tuple, with the same layout as ``x``.

    Returns:
        bool: True if the last names differ (the pair should be blocked),
        False if they are the same.
    """
    # Split each name on whitespace; split() with no argument also
    # collapses repeated spaces
    x_tokens = x['name'].split()
    y_tokens = y['name'].split()
    # Take the final token as the last name; an empty name has no tokens,
    # so fall back to '' instead of raising IndexError
    x_last = x_tokens[-1] if x_tokens else ''
    y_last = y_tokens[-1] if y_tokens else ''
    # Block the pair (return True) only when the last names do not match
    return x_last != y_last
In [29]:
# Instantiate a new black box blocker
bb = em.BlackBoxBlocker()
# Register name_name_function as the function the blocker applies to each
# tuple pair
bb.set_black_box_function(name_name_function)
In [30]:
# Apply the black box blocker to the existing candidate set C to obtain
# candidate set D
D = bb.block_candset(C)
In [31]:
# Display the candidate set obtained by blocking candidate set C
D
Out[31]:
First, define the black box function.
In [33]:
def address_address_function(x, y):
    """Black box blocking rule: block a tuple pair whose cities differ.

    The city is taken to be the text after the last comma of the
    'address' attribute, ignoring surrounding whitespace.

    Args:
        x: First tuple (e.g. a pandas Series) with a string 'address' entry.
        y: Second tuple, with the same layout as ``x``.

    Returns:
        bool: True if the two cities differ (the pair should be blocked),
        False if they are the same.
    """
    # Get the address attribute of each tuple
    x_address = x['address']
    y_address = y['address']
    # The city is the last comma-separated field; strip whitespace so that
    # 'Main St, Madison' and 'Main St,Madison' name the same city
    x_city = x_address.split(',')[-1].strip()
    y_city = y_address.split(',')[-1].strip()
    # Block the pair (return True) only when the cities do not match
    return x_city != y_city
In [34]:
# Instantiate a black box blocker
bb = em.BlackBoxBlocker()
# Register address_address_function as the function the blocker applies
# to each tuple pair
bb.set_black_box_function(address_address_function)
In [35]:
# Show the tuple labelled 0 of table A as a one-row DataFrame.
# .loc is the label-based replacement for .ix, which was removed in pandas 1.0.
A.loc[[0]]
Out[35]:
In [36]:
# Show the tuple labelled 0 of table B as a one-row DataFrame.
# .loc is the label-based replacement for .ix, which was removed in pandas 1.0.
B.loc[[0]]
Out[36]:
In [38]:
# Apply the black box blocker to a single tuple pair: the tuple labelled 0
# of A and the tuple labelled 0 of B. The printed status is the blocker's
# verdict for that pair.
# .loc replaces .ix, which was removed in pandas 1.0.
status = bb.block_tuples(A.loc[0], B.loc[0])
print(status)