In [22]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
Next, read the (sample) input tables that will be used for blocking.
In [23]:
# Get the datasets directory bundled with the py_entitymatching install.
# Use os.path.join rather than manual `+ os.sep +` concatenation: it is the
# idiomatic, cross-platform way to build paths.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the full paths of the two input CSV tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')
In [24]:
# Load the two person tables, designating the 'ID' column as the key
# attribute of each (required metadata for py_entitymatching).
A, B = (em.read_csv_metadata(p, key='ID') for p in (path_A, path_B))
In [25]:
# Infer the attribute types of each table's columns
atypes1, atypes2 = em.get_attr_types(A), em.get_attr_types(B)
In [26]:
atypes1.keys()
Out[26]:
In [27]:
atypes1['birth_year'], atypes1['hourly_wage'], atypes1['address'], atypes1['name'], atypes1['zipcode']
Out[27]:
In [28]:
atypes2['birth_year'], atypes2['hourly_wage'], atypes2['address'], atypes2['name'], atypes2['zipcode']
Out[28]:
In [29]:
block_c = em.get_attr_corres(A, B)
In [30]:
block_c.keys()
Out[30]:
In [31]:
id(A), id(block_c['ltable']), id(B), id(block_c['rtable'])
Out[31]:
In [32]:
block_c['corres']
Out[32]:
In [33]:
# Get the tokenizers tuned for the blocking stage
tok = em.get_tokenizers_for_blocking()
# For the matching stage, use this instead:
# tok = em.get_tokenizers_for_matching()
In [34]:
tok
Out[34]:
In [35]:
# Get the similarity functions tuned for the blocking stage
sim = em.get_sim_funs_for_blocking()
# For the matching stage, use this instead:
# sim = em.get_sim_funs_for_matching()
In [36]:
sim
Out[36]:
In [38]:
feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)
In [41]:
feature_table.head()
Out[41]:
In [40]:
type(feature_table)
Out[40]: