In [1]:
import sys
In [2]:
# py_entitymatching relies on the following packages from the Python ecosystem
S.No | Package | Comments
---|---|---
1 | pandas | provides data structures (such as DataFrame) to store and manage relational data. Specifically, DataFrame is used to represent input tables.
2 | scikit-learn | provides implementations for common machine learning algorithms. Specifically, this is used in ML-based matchers.
3 | joblib | provides multiprocessing capabilities. Specifically, this is used to parallelize blockers across multiple processors.
4 | PyQt4 | provides tools to build GUIs. Specifically, this is used to build the GUIs for labeling data and debugging matchers.
5 | py_stringsimjoin | provides scalable implementations for string similarity joins over two tables. Specifically, this is used to scale blockers.
6 | py_stringmatching | provides a comprehensive set of tokenizers and string similarity functions. Specifically, this is used to create features for blocking and matching.
7 | cloudpickle | provides functions to serialize Python constructs. Specifically, this is used to load/save objects from/to disk.
8 | pyprind | library to display progress indicators. Specifically, this is used to display the progress of blocking functions, matching functions, etc.
9 | pyparsing | library to parse strings. Specifically, this is used to parse rules/features that are declaratively written by the user.
10 | six | provides functions to write compatible code across Python 2 and 3.
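A quick way to confirm these dependencies are importable in the current environment (a minimal sketch; the module list mirrors the table above, noting that scikit-learn imports as sklearn):

import importlib
for mod in ['pandas', 'sklearn', 'joblib', 'PyQt4', 'py_stringsimjoin',
            'py_stringmatching', 'cloudpickle', 'pyprind', 'pyparsing', 'six']:
    try:
        importlib.import_module(mod)
        print(mod, ': OK')
    except ImportError:
        print(mod, ': MISSING')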
In [3]:
import py_entitymatching as em
import profiler
import pandas as pd
In [4]:
## Read input tables
A = em.read_csv_metadata('dblp_demo.csv', key='id')
B = em.read_csv_metadata('acm_demo.csv', key='id')
In [5]:
len(A), len(B), len(A) * len(B)
Out[5]:
In [6]:
A.head(2)
Out[6]:
In [7]:
B.head(2)
Out[7]:
In [8]:
# If the tables are large, we can down-sample them like this
A1, B1 = em.down_sample(A, B, 500, 1, show_progress=False)
len(A1), len(B1)
# But for this demo, we will use the entire tables A and B
Out[8]:
In [9]:
profiler.profile_table(A, 'paper year')
In [10]:
profiler.profile_table(B, 'paper year')
From the plot, we can see that 20003 is clearly an error; it should be 2003. We will replace 20003 with 2003.
In [11]:
B.replace({'paper year': {20003: 2003}}, inplace=True)
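As a quick sanity check (plain pandas, not part of the original flow), we can confirm the stray value is gone:

# 20003 should no longer appear among the year values
assert 20003 not in B['paper year'].values
B['paper year'].value_counts().sort_index().tail()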
In [12]:
### Blocking plan
### A, B -- AttrEquivalence blocker [year] --------------------------| Candidate set
In [12]:
# Create attribute equivalence blocker
ab = em.AttrEquivalenceBlocker()
# Block tables on the 'paper year' attribute: only pairs that agree on the year are included in the candidate set
C1 = ab.block_tables(A, B, 'paper year', 'paper year',
l_output_attrs=['title', 'authors', 'paper year'],
r_output_attrs=['title', 'authors', 'paper year']
)
len(C1)
Out[12]:
In [13]:
C1.head(2)
Out[13]:
In [14]:
# Check whether the current blocking method has dropped a lot of potential matches
dbg = em.debug_blocker(C1, A, B)
In [15]:
dbg.head()
Out[15]:
In [16]:
# em.view_table(dbg)
In [17]:
# Revised blocking plan
# A, B -- AttrEquivalence blocker [year] -------------------|
#                                                           |---> candidate set
# A, B -- Overlap blocker [title] --------------------------|
In [18]:
profiler.profile_table(A, 'title', plot=False)
Out[18]:
In [23]:
profiler.profile_table(B, 'title', plot=False)
Out[23]:
In [24]:
# Initialize overlap blocker
ob = em.OverlapBlocker()
# Block on the 'title' attribute: a pair survives if the titles share at least one word (overlap_size=1)
C2 = ob.block_tables(A, B, 'title', 'title', show_progress=False, overlap_size=1)
len(C2)
Out[24]:
In [25]:
# Combine the outputs from attr. equivalence blocker and overlap blocker
C = em.combine_blocker_outputs_via_union([C1, C2])
len(C)
Out[25]:
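py_entitymatching stores metadata (the key, the foreign keys, and pointers to the base tables) in a catalog alongside each candidate set; assuming the standard catalog helper em.show_properties, we can inspect what the combined table C carries:

# Inspect the catalog metadata attached to the combined candidate set
em.show_properties(C)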
In [26]:
# Check again to see if we are dropping any potential matches
dbg = em.debug_blocker(C, A, B)
In [27]:
dbg.head()
Out[27]:
In [28]:
# Sample 450 pairs from the candidate set for labeling
S = em.sample_table(C, 450)
In [29]:
# Label S
S = em.label_table(S, 'label')
In [30]:
# Load the pre-labeled data
S = em.read_csv_metadata('labeled_data_demo.csv',
key='_id',
ltable=A, rtable=B,
fk_ltable='ltable_id', fk_rtable='rtable_id')
len(S)
Out[30]:
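If you label S yourself (as in the previous cell), you can persist it together with its metadata so it can be reloaded exactly as above; a sketch using em.to_csv_metadata, the write counterpart of read_csv_metadata (the file name below is hypothetical):

# Save the labeled table along with its catalog metadata
em.to_csv_metadata(S, 'my_labeled_data.csv')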
In [31]:
# Split S into I and J
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']
In [32]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
In [33]:
# Generate a set of features
F = em.get_features_for_matching(A, B)
In [34]:
# List the feature names generated
F['feature_name']
Out[34]:
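Each row of F describes one auto-generated feature. Assuming the feature table exposes a 'function_source' column (as in the py_entitymatching documentation), we can inspect the code behind any feature:

# Print the auto-generated source of the first feature
print(F.iloc[0]['function_source'])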
In [35]:
# Convert I into a set of feature vectors using F
H = em.extract_feature_vecs(I,
feature_table=F,
attrs_after='label',
show_progress=False)
In [36]:
# Select the best ML matcher using cross-validation (CV)
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
k=5,
target_attr='label', metric='f1', random_state=0)
result['cv_stats']
Out[36]:
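Besides the per-fold statistics, select_matcher also reports the winner; assuming the result dict exposes it under 'selected_matcher', we can check which matcher scored best:

# The matcher with the best mean F1 across the 5 folds
result['selected_matcher'].name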
In [37]:
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
In [38]:
# Debug RF matcher using GUI
em.vis_debug_rf(rf, P, Q,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
target_attr='label')
In [39]:
# Create a feature declaratively: Jaccard similarity over the
# concatenation of the title and authors attributes; then add it to F
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['title'] + ' ' + ltuple['authors']).lower()),
wspace((rtuple['title'] + ' ' + rtuple['authors']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)
# Add feature to F
em.add_feature(F, 'jac_ws_title_authors', feature)
Out[39]:
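To make the declarative string above concrete, here is the equivalent computation written directly against py_stringmatching, with made-up input strings:

import py_stringmatching as sm

ws = sm.WhitespaceTokenizer(return_set=True)
jac = sm.Jaccard()

s1 = 'scaling entity matching john doe'
s2 = 'scaling entity matching j doe'
# Jaccard over the whitespace tokens of the lowercased title + authors strings
jac.get_raw_score(ws.tokenize(s1.lower()), ws.tokenize(s2.lower()))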
In [40]:
# Print supported sim. functions
pd.DataFrame({'simfunctions':sorted(sim.keys())})
Out[40]:
In [41]:
# Print supported tokenizers
pd.DataFrame({'tokenizers':sorted(tok.keys())})
Out[41]:
In [42]:
F['feature_name']
Out[42]:
In [43]:
# Convert I into feature vectors using updated F
H = em.extract_feature_vecs(I,
feature_table=F,
attrs_after='label',
show_progress=False)
In [44]:
# Check whether the updated F improves the Random Forest matcher
result = em.select_matcher([rf], table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
k=5,
target_attr='label', metric='f1', random_state=0)
result['cv_stats']
Out[44]:
In [45]:
# Select the best matcher again using CV
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
k=5,
target_attr='label', metric='f1', random_state=0)
result['cv_stats']
Out[45]:
In [46]:
# Train using feature vectors from I
dt.fit(table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
target_attr='label')
# Convert J into a set of feature vectors using F
L = em.extract_feature_vecs(J, feature_table=F,
attrs_after='label', show_progress=False)
# Predict on L
predictions = dt.predict(table=L, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
append=True, target_attr='predicted', inplace=False)
In [47]:
# Evaluate the predictions
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)
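As an independent cross-check of em.eval_matches, the same metrics can be computed directly with scikit-learn on the prediction table:

from sklearn.metrics import precision_score, recall_score, f1_score

# 'label' holds the gold labels, 'predicted' the matcher's output
print('precision:', precision_score(predictions['label'], predictions['predicted']))
print('recall:', recall_score(predictions['label'], predictions['predicted']))
print('f1:', f1_score(predictions['label'], predictions['predicted']))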