In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd
# Set the seed value
seed = 0
In [2]:
# Get the datasets directory
datasets_dir = em.get_install_path() + os.sep + 'datasets'
# List the files in the datasets directory
!ls $datasets_dir
In [3]:
# Set the paths to the input tables and the labeled data
path_A = datasets_dir + os.sep + 'dblp_demo.csv'
path_B = datasets_dir + os.sep + 'acm_demo.csv'
path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'
In [5]:
# Load the input tables A and B, setting 'id' as the key attribute
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')
# Load the pre-labeled data
S = em.read_csv_metadata(path_labeled_data,
key='_id',
ltable=A, rtable=B,
fk_ltable='ltable_id', fk_rtable='rtable_id')
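read_csv_metadata records the key, foreign keys, and the ltable/rtable links in Magellan's catalog. To verify that they were set, the stored metadata can be inspected; a small check using em.show_properties, which prints the catalog entries for a table:
In [ ]:
# Display the metadata stored for S in the catalog
em.show_properties(S)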
Then, split the labeled data into a development set (I) and an evaluation set (J). Use the development set to select the best learning-based matcher.
In [6]:
# Split S into I and J
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']
This typically involves the following steps:
First, we need to create a set of learning-based matchers. The following matchers are supported in Magellan: (1) decision tree, (2) random forest, (3) Naive Bayes, (4) SVM, (5) logistic regression, and (6) linear regression. The cell below instantiates five of them; a sketch for the Naive Bayes matcher follows it.
In [7]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
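The list above also mentions Naive Bayes, which the cell does not instantiate. A minimal sketch, assuming em.NBMatcher accepts a name argument like the other matcher constructors:
In [ ]:
# Create a Naive Bayes matcher (sketch; no random_state here, since
# fitting Gaussian Naive Bayes is deterministic)
nb = em.NBMatcher(name='NaiveBayes')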
In [8]:
# Generate a set of features
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
We observe that 20 features were generated. Let's take a look at their names.
In [9]:
# List the names of the generated features
F.feature_name
Out[9]:
In [10]:
# Convert I into a set of feature vectors using F
H = em.extract_feature_vecs(I,
feature_table=F,
attrs_after='label',
show_progress=False)
In [11]:
# Display first few rows
H.head()
Out[11]:
In [12]:
# Check if the feature vectors contain missing values
# A return value of True means that there are missing values
pd.isnull(H).values.any()
Out[12]:
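To see which columns contribute the missing values, plain pandas (not a py_entitymatching API) gives a per-column count:
In [ ]:
# Count the missing values in each column of H (standard pandas)
H.isnull().sum()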
We observe that the extracted feature vectors contain missing values. We have to impute the missing values so that the learning-based matchers can fit their models correctly. For the purposes of this guide, we impute the missing values in a column with the mean of the values in that column.
In [13]:
# Impute feature vectors with the mean of the column values.
H = em.impute_table(H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
strategy='mean')
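For intuition, mean imputation simply replaces each missing entry with the mean of its column. A minimal plain-pandas sketch of the same idea, assuming all feature columns are numeric (em.impute_table additionally carries over the table's catalog metadata, which this sketch does not):
In [ ]:
# Plain-pandas equivalent of mean imputation (sketch only)
feat_cols = H.columns.difference(['_id', 'ltable_id', 'rtable_id', 'label'])
H[feat_cols] = H[feat_cols].fillna(H[feat_cols].mean())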
In [14]:
# Select the best ML matcher using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
k=5,
target_attr='label', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']
Out[14]:
In [15]:
result['drill_down_cv_stats']['precision']
Out[15]:
In [16]:
result['drill_down_cv_stats']['recall']
Out[16]:
In [17]:
result['drill_down_cv_stats']['f1']
Out[17]:
In [18]:
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
In [19]:
# Debug RF matcher using GUI
em.vis_debug_rf(rf, P, Q,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
target_attr='label')
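vis_debug_rf opens a GUI and therefore needs a desktop environment. As a non-GUI sanity check, one can instead fit the matcher on P, predict on Q, and evaluate the predictions; a sketch using the fit/predict/eval_matches API:
In [ ]:
# Fit the RF matcher on P (non-GUI alternative sketch)
rf.fit(table=P,
       exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
       target_attr='label')
# Predict on Q, appending a 'predicted' column to the output
pred = rf.predict(table=Q,
                  exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                  target_attr='predicted',
                  append=True,
                  inplace=False)
# Compare the predictions against the gold labels
eval_result = em.eval_matches(pred, 'label', 'predicted')
em.print_eval_summary(eval_result)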
In [20]:
# Create a feature declaratively and add it to F:
# Jaccard on the whitespace-tokenized, lowercased concatenation of title and authors
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['title'] + ' ' + ltuple['authors']).lower()),
wspace((rtuple['title'] + ' ' + rtuple['authors']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)
# Add feature to F
em.add_feature(F, 'jac_ws_title_authors', feature)
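As a quick sanity check, the new feature can be applied to a single tuple pair. This assumes the dict returned by get_feature_fn exposes the compiled function under the 'function' key, which is an implementation detail of py_entitymatching; verify against your installed version:
In [ ]:
# Apply the feature to the first tuple pair (assumes a 'function' key)
feature['function'](A.iloc[0], B.iloc[0])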
Out[20]:
In [21]:
# Convert I into feature vectors using the updated F
H = em.extract_feature_vecs(I,
feature_table=F,
attrs_after='label',
show_progress=False)
In [22]:
# Check whether the updated F improves the Random Forest matcher
result = em.select_matcher([rf], table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
k=5,
target_attr='label', metric_to_select_matcher='f1', random_state=0)
result['drill_down_cv_stats']['f1']
Out[22]:
In [23]:
# Select the best matcher again using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
k=5,
target_attr='label', metric_to_select_matcher='f1', random_state=0)
result['cv_stats']
Out[23]:
In [24]:
result['drill_down_cv_stats']['f1']
Out[24]: