In [1]:
%matplotlib inline
import sys
# NOTE(review): hardcoded absolute local path — this breaks on any other
# machine. Prefer installing py_entitymatching into the environment, or
# deriving the path from a configurable constant.
sys.path.append('/Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/')
In [2]:
# py_entitymatching relies on the following packages from the Python eco-system
S.No | Package | Comments |
---|---|---|
1 | pandas | provides data structures (such as DataFrame) to store and manage relational data. Specifically, DataFrame is used to represent input tables. |
2 | scikit-learn | provides implementations for common machine learning algorithms. Specifically, this is used in ML-based matchers. |
3 | joblib | provides multiprocessing capabilities. Specifically, this is used to parallelize blockers across multiple processors. |
4 | PyQt4 | provides tools to build GUI. Specifically, this is used to build GUI for labeling data and debugging matchers. |
5 | py_stringsimjoin | provides scalable implementations for string similarity joins over two tables. Specifically, this is used to scale blockers. |
6 | py_stringmatching | provides a comprehensive set of tokenizers and string similarity functions. Specifically, this is used to create features for blocking and matching. |
7 | cloudpickle | provides functions to serialize Python constructs. Specifically, this is used to load/save objects from/to disk. |
8 | pyprind | library to display progress indicators. Specifically, this is used to display progress of blocking functions, matching functions, etc. |
9 | pyparsing | library to parse strings. Specifically, this is used to parse rules/features that are declaratively written by the user. |
10 | six | provides functions to write compatible code across Python 2 and 3. |
In [3]:
import py_entitymatching as em
import profiler
import pandas as pd
In [4]:
## Read input tables
# read_csv_metadata reads a CSV into a pandas DataFrame and registers
# 'id' as the key attribute in py_entitymatching's metadata catalog.
A = em.read_csv_metadata('dblp_demo.csv', key='id')
B = em.read_csv_metadata('acm_demo.csv', key='id')
In [5]:
# Table sizes and the size of the full Cartesian product A x B —
# the number of pairs a naive all-pairs matcher would have to score.
len(A), len(B), len(A) * len(B)
Out[5]:
In [6]:
# Peek at the first two rows of each input table.
A.head(2)
Out[6]:
In [7]:
B.head(2)
Out[7]:
In [8]:
# If the tables are large we can downsample the tables like this.
# Arguments 500 and 1 presumably control the sample size and a sampling
# parameter — TODO confirm their exact meaning against the down_sample docs.
A1, B1 = em.down_sample(A, B, 500, 1, show_progress=False)
len(A1), len(B1)
# But for the demo, we will use the entire table A and B
Out[8]:
In [9]:
# Profile the distribution of 'year' in each table to spot anomalies.
profiler.profile_table(A, 'year')
In [60]:
# NOTE(review): execution count In [60] is out of order — the notebook was
# not run top-to-bottom; re-run with Restart Kernel -> Run All before sharing.
profiler.profile_table(B, 'year')
In [10]:
# Fix the data-entry typo 20003 -> 2003 found by profiling B's 'year' column.
# Mutate B in place via column assignment rather than rebinding the name:
# py_entitymatching's metadata appears to be attached to the DataFrame
# object itself, so B must stay the same object — TODO confirm.
B['year'] = B['year'].replace({20003: 2003})
In [11]:
### Blocking plan
### A, B -- AttrEquivalence blocker [year]--------------------------| Candidate set
In [12]:
# Create attribute equivalence blocker
ab = em.AttrEquivalenceBlocker()
# Block tables on the 'year' attribute: only pairs whose years are equal
# are kept in the candidate set; the listed attributes are carried into
# the output for inspection/debugging.
C1 = ab.block_tables(A, B, 'year', 'year',
l_output_attrs=['title', 'authors', 'year'],
r_output_attrs=['title', 'authors', 'year']
)
len(C1)
Out[12]:
In [13]:
C1.head(2)
Out[13]:
In [14]:
# Check whether the current blocking method has dropped a lot of potential
# matches: debug_blocker surfaces likely matching pairs that are NOT in C1.
dbg = em.debug_blocker(C1, A, B)
In [15]:
dbg.head()
Out[15]:
In [16]:
# Optional GUI inspection of the debug output (disabled for batch runs).
# em.view_table(dbg)
In [17]:
# Revised blocking plan
# A, B -- AttrEquivalence blocker [year] --------------------|
# |---> candidate set
# A, B -- Overlap blocker [title]---------------------------|
In [18]:
# Profile the 'title' attribute (text only, no plot) before overlap blocking.
profiler.profile_table(A, 'title', plot=False)
Out[18]:
In [19]:
profiler.profile_table(B, 'title', plot=False)
Out[19]:
In [20]:
# Initialize overlap blocker
ob = em.OverlapBlocker()
# Block over title attribute.
# overlap_size=1 keeps any pair sharing at least one title token —
# very permissive, so expect a large candidate set C2.
C2 = ob.block_tables(A, B, 'title', 'title', show_progress=False, overlap_size=1)
len(C2)
Out[20]:
In [21]:
# Combine the outputs from attr. equivalence blocker and overlap blocker:
# a pair survives if EITHER blocker kept it (set union of candidate sets).
C = em.combine_blocker_outputs_via_union([C1, C2])
len(C)
Out[21]:
In [22]:
# Check again to see if we are dropping any potential matches
dbg = em.debug_blocker(C, A, B)
In [23]:
dbg.head()
Out[23]:
In [24]:
# Sample 450 pairs from the candidate set for manual labeling.
# NOTE(review): no random seed is passed, so this sample is not
# reproducible — TODO check whether sample_table accepts a seed parameter.
S = em.sample_table(C, 450)
In [25]:
# Label S interactively (opens a GUI; blocks a headless Run-All).
S = em.label_table(S, 'label')
In [75]:
# Load the pre-labeled data.
# NOTE: this overwrites S from the two cells above — for the demo the
# sampling/labeling cells are illustrative only.
S = em.read_csv_metadata('labeled_data_demo.csv', ltable=A, rtable=B)
len(S)
Out[75]:
In [76]:
# Split the labeled set S into a development set I (70%) and an
# evaluation set J (30%); random_state=0 makes the split reproducible.
split = em.split_train_test(S, train_proportion=0.7, random_state=0)
I, J = split['train'], split['test']
In [77]:
# Create a set of ML-matchers to compare via cross-validation.
# random_state=0 fixes each stochastic learner for reproducibility;
# linear regression is presumably deterministic, hence no seed.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
In [78]:
# Automatically generate a table of similarity features from the
# attributes of A and B.
F = em.get_features_for_matching(A, B)
In [79]:
# List the feature names generated
F['feature_name']
Out[79]:
In [80]:
# Convert I into feature vectors using feature table F;
# attrs_after='label' carries the label column through into H.
H = em.extract_feature_vecs(I,
feature_table=F,
attrs_after='label',
show_progress=False)
In [81]:
# Select the best ML matcher using 5-fold CV on F1.
# exclude_attrs keeps the id columns and the label out of the feature set.
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
k=5,
target_attr='label', metric='f1', random_state=0)
result['cv_stats']
Out[81]:
In [82]:
# Split the feature vectors H 50/50 into P (debug-train) and Q (debug-test)
# for matcher debugging; random_state=0 keeps the split reproducible.
halves = em.split_train_test(H, train_proportion=0.5, random_state=0)
P, Q = halves['train'], halves['test']
In [83]:
# Debug the RF matcher using the GUI (opens a window; blocks a
# headless Restart-and-Run-All).
em.vis_debug_rf(rf, P, Q,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
target_attr='label')
In [36]:
# Swap P and Q, and debug again to check both halves of the data.
em.vis_debug_rf(rf, Q, P,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
target_attr='label')
In [84]:
# Add a feature to do Jaccard on title + authors and add it to F.
# The feature is written declaratively: whitespace-tokenize the
# lower-cased concatenation of title and authors on each side, then
# take the Jaccard similarity of the token sets.
sim = em.get_sim_funs_for_matching()
tok = em.get_tokenizers_for_matching()
feature_string = """jaccard(wspace((ltuple['title'] + ' ' + ltuple['authors']).lower()),
wspace((rtuple['title'] + ' ' + rtuple['authors']).lower()))"""
feature = em.get_feature_fn(feature_string, sim, tok)
# Register the compiled feature in the feature table F.
em.add_feature(F, 'jac_ws_title_authors', feature)
Out[84]:
In [38]:
# Display the similarity functions available for declarative features.
pd.DataFrame({'simfunctions':sorted(sim.keys())})
Out[38]:
In [39]:
# Display the tokenizers available for declarative features.
pd.DataFrame({'tokenizers':sorted(tok.keys())})
Out[39]:
In [85]:
# Confirm the new 'jac_ws_title_authors' feature now appears in F.
F['feature_name']
Out[85]:
In [86]:
# Re-extract feature vectors for I with the updated F.
# NOTE: this rebinds H, replacing the earlier feature-vector table.
H = em.extract_feature_vecs(I,
feature_table=F,
attrs_after='label',
show_progress=False)
In [87]:
# Check whether the updated feature table F improves the Random Forest
# matcher, using the same 5-fold CV / F1 setup as before.
result = em.select_matcher([rf], table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
k=5,
target_attr='label', metric='f1', random_state=0)
result['cv_stats']
Out[87]:
In [88]:
# Re-run model selection across all matchers with the updated features.
result = em.select_matcher([dt, rf, svm, ln, lg], table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
k=5,
target_attr='label', metric='f1', random_state=0)
result['cv_stats']
Out[88]:
In [89]:
# Train the decision-tree matcher on the feature vectors from I.
# NOTE(review): dt is trained here regardless of which matcher CV above
# selected — confirm this is the intended final model.
dt.fit(table=H,
exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
target_attr='label')
# Convert the held-out set J into feature vectors using the same F.
L = em.extract_feature_vecs(J, feature_table=F,
attrs_after='label', show_progress=False)
# Predict on L; append=True adds a 'predicted' column to a copy of L
# (inplace=False leaves L itself untouched).
predictions = dt.predict(table=L, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
append=True, target_attr='predicted', inplace=False)
In [90]:
# Evaluate the predictions against the gold labels and print
# precision/recall/F1 plus the false positives/negatives summary.
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)