In [2]:
# Import the py_entitymatching package and supporting libraries
import py_entitymatching as em
import os
import pandas as pd
Then, read the (sample) input tables for matching purposes.
In [3]:
# Get the datasets directory that ships with py_entitymatching
datasets_dir = em.get_install_path() + os.sep + 'datasets'
# Paths to the two input tables (DBLP and ACM samples) and the pre-labeled pairs
path_A = datasets_dir + os.sep + 'dblp_demo.csv'
path_B = datasets_dir + os.sep + 'acm_demo.csv'
path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'
In [5]:
# Read the two input tables; 'id' is the key attribute of each table
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')
# Load the pre-labeled data; each row pairs a tuple from A (ltable) with a
# tuple from B (rtable), linked via the foreign-key columns
S = em.read_csv_metadata(path_labeled_data,
key='_id',
ltable=A, rtable=B,
fk_ltable='ltable_id', fk_rtable='rtable_id')
S.head()
Out[5]:
In [6]:
# Split S into a development set I and an evaluation set J
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']
In [7]:
# Create a Decision Tree Matcher (random_state fixed for reproducibility)
dt = em.DTMatcher(name='DecisionTree', random_state=0)
In [8]:
# Automatically generate a set of features for matching tuples from A and B
feature_table = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
feature_table
Out[8]:
In [9]:
# We will remove many of the features here to purposely create a poor model.
# This will make it easier to demonstrate triggers later.
F = feature_table.drop([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
F
Out[9]:
In [10]:
# Convert I into a set of feature vectors using the reduced feature set F;
# keep the 'label' column after the feature columns
H = em.extract_feature_vecs(I,
feature_table=F,
attrs_after='label',
show_progress=False)
H.head()
Out[10]:
In [11]:
# Impute missing feature values with the mean of each column
# (key/foreign-key/label columns are excluded from imputation)
H = em.impute_table(H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
strategy='mean')
In [12]:
# Fit the decision tree to the feature vectors, predicting the 'label' column
dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], target_attr='label')
In [13]:
# Use the decision tree matcher to predict whether tuple pairs match;
# append the predictions ('predicted_labels') and probabilities ('proba') to H in place
dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], target_attr='predicted_labels',
return_probs=True, probs_attr='proba', append=True, inplace=True)
H.head()
Out[13]:
In [14]:
# Split H into P and Q for debugging the matcher
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
In [15]:
# Debug the decision tree matcher using the GUI
em.vis_debug_dt(dt, P, Q,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
target_attr='label')
# We see with the debugger that the false negatives have completely different values in the Title attribute.
# This is most likely because we removed all of the features that compare the Title attribute from each table earlier.
In [16]:
# We can see which tuples are not predicted correctly
H[H['label'] != H['predicted_labels']]
Out[16]:
In [17]:
# Use the constructor to create a match trigger
mt = em.MatchTrigger()
Before we can use the match trigger, we need to create rules to evaluate tuple pairs. Each rule is a list of strings, where each string specifies a conjunction of predicates. Each predicate has three parts: (1) an expression, (2) a comparison operator, and (3) a value. The expression is evaluated over a tuple pair, producing a numeric value.
In [18]:
# Add two rules to the match trigger.
# Since we removed all of the features comparing Title earlier, we want to now add a rule that compares Titles
mt.add_cond_rule(['title_title_lev_sim(ltuple, rtuple) > 0.7'], feature_table)
# This rule has two predicates, one comparing the titles and the other looking for an exact match of the years
mt.add_cond_rule(['title_title_lev_sim(ltuple, rtuple) > 0.4', 'year_year_exm(ltuple, rtuple) == 1'], feature_table)
mt.get_rule_names()
Out[18]:
In [19]:
# Rules can also be deleted from the match trigger
mt.delete_rule('_rule_1')
Out[19]:
In [20]:
# Since we are using the trigger to fix a problem related to false negatives, we want the condition to be
# True and the action to be 1. This way, the trigger will set a prediction to 1 when the rule returns True.
mt.add_cond_status(True)
mt.add_action(1)
Out[20]:
In [21]:
# Apply the trigger to the predicted labels, producing a corrected copy of H
preds = mt.execute(input_table=H, label_column='predicted_labels', inplace=False)
preds.head()
Out[21]:
In [22]:
# We were able to significantly reduce the number of incorrectly labeled tuple pairs
preds[preds['label'] != preds['predicted_labels']]
Out[22]:
In [23]:
# We can see that the two tuples that are still labeled incorrectly are due to the title and authors being in the
# wrong column for one of the tuples.
pd.concat([S[S['_id'] == 11], S[S['_id'] == 267]])
Out[23]:
In [ ]: