In [1]:
# NOTE(review): RESOURCES is imported but never used in this notebook —
# candidate for removal.
from ipykernel.kernelspec import RESOURCES
# 'magellan' is the entity-matching toolkit (predecessor of py_entitymatching).
import magellan as mg
In [2]:
# Initialize the JVM used by magellan.
mg.init_jvm()
Out[2]:
In [3]:
# Input tables
# Read the two tables to be matched; 'ID' is the key column of each.
A = mg.read_csv('tableA.csv', key='ID')
B = mg.read_csv('tableB.csv', key='ID')
In [4]:
# Peek at table A.
A.head(2)
Out[4]:
In [5]:
# Peek at table B.
B.head(2)
Out[5]:
In [6]:
# Stage2 Blocking: stored in tableC.csv
# C is the candidate set produced by blocking; ltable/rtable record provenance.
C = mg.read_csv('tableC.csv', ltable=A, rtable=B)
len(C)
Out[6]:
In [7]:
# Plan
# 1. Sample candidate set --> S
# 2. Label S --> G
# 3. Split G into development set I, and evaluation set J
# 4. Select best learning-based matcher Y, using I
# 5. Add triggers to Y ---> Z
# 6. Evaluate Z using J
In [9]:
# Sample the candidate set
# Draw 450 candidate pairs from C for manual labeling.
S = mg.sample_table(C, 450)
In [10]:
# Label S
# Label the sampled pairs; the label attribute is named 'gold'.
G = mg.label_table(S, 'gold')
G.to_csv('label.csv')
Out[10]:
In [11]:
# User labeled table
# NOTE(review): labels were written to 'label.csv' above but read back from
# 'gold_init.csv' here — presumably the file was renamed/curated offline; verify.
G = mg.read_csv('gold_init.csv', ltable=A, rtable=B)
len(G)
Out[11]:
In [12]:
# Split G into development (I) and evaluation (J)
# NOTE(review): no random_state is passed, so this split is not reproducible
# across runs (later train_test_split calls do pass random_state=0).
IJ = mg.train_test_split(G, train_proportion=0.7)
I = IJ['train']
J = IJ['test']
(len(I), len(J))
Out[12]:
In [13]:
# Selecting the best learning-based matcher using I
# Plan
# 1. Create a set of ML-matchers
# 2. Generate features --> feature_table
# 3. Extract feature vectors using I and feature_table
# 4. Select best learning-based matcher using CV
# 5. Debug the selected matcher (and repeat the above steps)
In [14]:
# Create a set of ML-matchers
# random_state=0 fixes the stochastic learners (DT, RF) for reproducibility.
dt = mg.DTMatcher(name='DecisionTree', random_state=0)
svm = mg.SVMMatcher(name='SVM')
rf = mg.RFMatcher(name='RF', random_state=0)
nb = mg.NBMatcher(name='NB')
lg = mg.LogRegMatcher(name='LogReg')
ln = mg.LinRegMatcher(name='LinReg')
In [15]:
# Names of the matchers
(dt.name, svm.name, rf.name, nb.name, lg.name, ln.name)
Out[15]:
In [16]:
# Generate features
# Auto-generate similarity features from the schemas of A and B.
feat_table = mg.get_features_for_matching(A, B)
feat_table
Out[16]:
In [17]:
# Internal magellan state — presumably the similarity functions used for
# feature generation; verify against magellan docs.
mg._match_s
Out[17]:
In [18]:
# Internal magellan state — presumably the tokenizers used for feature
# generation; verify.
mg._match_t
Out[18]:
In [19]:
# Internal magellan state — presumably attribute correspondences inferred
# between A and B; verify.
mg._match_c['corres']
Out[19]:
In [20]:
# Select 'address' related features
# NOTE(review): positional slice [25:33] is brittle — it silently picks the
# wrong rows if feature-generation order changes; consider selecting by name.
feat_subset_iter1 = feat_table[25:33]
feat_subset_iter1
Out[20]:
In [21]:
# Get feature vectors
# Compute feature vectors over development set I, carrying the 'gold' label.
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter1, attrs_after='gold')
In [22]:
# impute K
# Replace missing feature values with 0 before training.
K.fillna(0, inplace=True)
K.head()
Out[22]:
In [28]:
# select the best ML matcher using CV
# Cross-validate all six matchers on K, optimizing precision.
# NOTE(review): tables were read with key='ID' but the excluded fk columns are
# lowercase 'ltable.id'/'rtable.id' — confirm the actual column names in K.
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold', metric='precision', random_state=0)
In [29]:
# The matcher that scored best under CV.
result['selected_matcher']
Out[29]:
In [30]:
# Per-fold CV statistics for all matchers.
result['cv_stats']
Out[30]:
In [32]:
# Split K into train (U) / test (V) halves to compare matchers on held-out data.
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
# For each matcher cell below:
# 1. Train the matcher using U
# 2. Predict V using the same matcher
# 3. Evaluate predictions
# (comments previously said "Train RF ... Predict using DT", which did not
# match the code — each cell trains and predicts with a single matcher.)
dt.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [33]:
# Random forest: same train/predict/evaluate pattern.
rf.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = rf.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [34]:
# SVM: same train/predict/evaluate pattern.
svm.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = svm.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [35]:
# Naive Bayes: same train/predict/evaluate pattern.
nb.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = nb.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [36]:
# Logistic regression: same train/predict/evaluate pattern.
lg.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = lg.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [38]:
# Linear regression: same train/predict/evaluate pattern.
ln.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = ln.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [ ]:
# Debug random forest
# Split feature vectors to train and test
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
In [ ]:
# Launch the visual debugger for RF to inspect misclassified pairs.
mg.vis_debug_rf(rf, U, V,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
# Inspect false positives and take actions:
# removed 6 tuples as they had same address, but different name and phone numbers
In [41]:
#New G
# Reload the gold data after removing the 6 problematic tuples found above.
G = mg.read_csv('gold_final.csv', ltable=A, rtable=B)
# Split G into development (I) and evaluation (J)
# NOTE(review): still no random_state on this split — not reproducible.
IJ = mg.train_test_split(G, train_proportion=0.7)
I = IJ['train']
J = IJ['test']
(len(G),len(I), len(J))
Out[41]:
In [42]:
# Re-extract feature vectors on the cleaned development set and impute.
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter1, attrs_after='gold')
K.fillna(0, inplace=True)
K.head()
Out[42]:
In [43]:
# Re-run matcher selection (CV, precision) on the cleaned data.
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold', metric='precision', random_state=0)
result['selected_matcher']
Out[43]:
In [44]:
# Per-fold CV statistics after cleaning.
result['cv_stats']
Out[44]:
In [45]:
# Fresh U/V split of the cleaned feature vectors.
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
# For each matcher cell below: train on U, predict V, evaluate.
# (comments previously said "Train RF ... Predict using DT"; the code trains
# and predicts with the same matcher in each cell.)
rf.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = rf.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [46]:
# Decision tree: same pattern.
dt.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [47]:
# SVM: same pattern.
svm.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = svm.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [48]:
# Naive Bayes: same pattern.
nb.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = nb.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [49]:
# Logistic regression: same pattern.
lg.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = lg.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [50]:
# Linear regression: same pattern.
ln.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = ln.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [51]:
# Include feature for 'phone'
In [52]:
# Review the full feature table to locate the phone features.
feat_table
Out[52]:
In [53]:
# Select 'phone' + 'address' related features
# NOTE(review): positional slice [17:33] again — brittle if the generated
# feature order changes; consider selecting by feature name.
feat_subset_iter2 = feat_table[17:33]
In [54]:
feat_subset_iter2
Out[54]:
In [55]:
# Get new set of features
# Re-extract feature vectors for I with the enlarged feature subset.
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter2, attrs_after='gold')
# impute K
K.fillna(0, inplace=True)
In [56]:
# Split feature vectors into U and V
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
In [57]:
# Check whether the added features improves the accuracy in the test set.
# Steps
# 1. Train DT using U
# 2. Predict V using DT
# 3. Evaluate predictions
# (comments corrected: they previously mixed up RF and DT, but the code
# below trains and predicts with dt only.)
In [58]:
# Train DT using U
dt.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [59]:
# Predict V using dt
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
In [60]:
# Evaluate the predictions
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [63]:
# Random forest with the phone+address feature set: train/predict/evaluate.
rf.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = rf.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [64]:
# SVM: same pattern.
svm.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = svm.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [65]:
# Naive Bayes: same pattern.
nb.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = nb.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [66]:
# Logistic regression: same pattern.
lg.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = lg.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [67]:
# Linear regression: same pattern.
ln.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = ln.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [68]:
# Apply cross validation to find if there is a better matcher
# NOTE(review): metric switched to 'f1' here (earlier selections used
# 'precision') and no random_state is passed — folds are not reproducible.
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold', metric='f1')
In [69]:
result['cv_stats']
Out[69]:
In [70]:
# Select DT as the best matcher -- Y
# Use phone + address related features
In [71]:
# Add triggers on top of Y
# 1. Split K into U and V
# 2. Use U,V + Y to write triggers (examine fp, fn).
In [72]:
# Split feature vectors to U and V
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
In [73]:
# Invoke debug interface to check FP and FN
mg.vis_debug_dt(dt, U, V,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [74]:
# Review the feature table to pick a rule for the trigger.
feat_table
Out[74]:
In [87]:
# Add trigger: we have 0 FP and 2 FN, so we'll apply name matching rule
# Negative trigger — presumably: when the rule fires (name Levenshtein
# similarity < 0.1) the action sets the prediction to 0 (non-match); verify
# against MatchTrigger docs.
neg_trigger = mg.MatchTrigger()
neg_trigger.add_cond_rule('name_name_lev(ltuple, rtuple) < 0.1', feat_table)
neg_trigger.add_cond_status(True)
neg_trigger.add_action(0)
Out[87]:
In [88]:
# Check whether the added trigger improves the accuracy in the test set.
# Steps
# 1. Train DT using U
# 2. Predict V using DT
# 3. Apply trigger
# 4. Evaluate the result
In [89]:
# Train dt using U
dt.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [90]:
# Predict V using dt
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
In [91]:
# Apply trigger
# Produce Q with the 'predicted' column overridden where the rule fires.
Q = neg_trigger.execute(P, 'predicted', inplace=False)
In [93]:
# Evaluate the result
eval_result = mg.eval_matches(Q, 'predicted', 'gold')
mg.print_eval_summary(eval_result)
In [94]:
# Re-split and re-open the visual debugger to check remaining FPs/FNs.
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
mg.vis_debug_dt(dt, U, V,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [126]:
# Do cross-validation for matcher + trigger using I (K)
result = mg.cv_matcher_and_trigger(dt, neg_trigger, table = K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [127]:
result['cv_stats']
Out[127]:
In [128]:
# Recall the cv for just the matcher (without trigger) was
# (empty trigger list = plain DT baseline for comparison)
result = mg.cv_matcher_and_trigger(dt, [], table = K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [129]:
result['cv_stats']
Out[129]:
In [130]:
# Now Z is DT (features: feat_subset_iter2) + neg_trigger
# Validate Z using J
# Steps
# 1. Extract feature vectors (using feat_subset_iter2) -- > M
# 2. Train DT using K (feature vectors generated using I)
#    (comment fixed: previously said "using H", but no H exists — the code
#    below trains on K)
# 3. Predict M using DT
# 4. Apply negative trigger
# 5. Evaluate the result
In [131]:
# Extract feature vectors
M = mg.extract_feature_vecs(J, feature_table=feat_subset_iter2, attrs_after='gold')
# Impute missing values
M.fillna(0, inplace=True)
In [132]:
# Train using feature vectors from I
dt.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [133]:
# Predict M
N = dt.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
In [134]:
# Apply trigger
T = neg_trigger.execute(N, 'predicted', inplace=False)
In [135]:
# Evaluate the result
eval_result = mg.eval_matches(T, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [136]:
# Sanity comparison: each plain matcher (no trigger), trained on all of K,
# evaluated on the held-out feature vectors M.
dt.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = dt.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [137]:
# Random forest: same pattern.
rf.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = rf.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [138]:
# SVM: same pattern.
svm.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = svm.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [139]:
# Naive Bayes: same pattern.
nb.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = nb.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [140]:
# Logistic regression: same pattern.
lg.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = lg.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [141]:
# Linear regression: same pattern.
ln.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = ln.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [125]:
# NOTE(review): out-of-order cell (In[125] after In[141]) that debugs DT on a
# split of M — i.e. on evaluation data derived from J. Using evaluation data
# for debugging leaks the test set; consider removing this cell.
UV = mg.train_test_split(M, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
mg.vis_debug_dt(dt, U, V,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [142]:
# Persist the final stage-3 matches (DT predictions with the trigger applied).
T.to_csv('stage3_final_matches.csv')
Out[142]: