In [1]:
from ipykernel.kernelspec import RESOURCES
import magellan as mg


DEBUG:Cloud:Log file (/home/ubuntuvm/.picloud/cloud.log) opened

In [2]:
mg.init_jvm()


Out[2]:
True

In [3]:
# Input tables
A = mg.read_csv('tableA.csv', key='ID')
B = mg.read_csv('tableB.csv', key='ID')

In [4]:
A.head(2)


Out[4]:
ID name votes rating phone address zip cuisine reviewcount
0 0 Strings Ramen Shop 15 3.1 (312) 374-3450 2141 S. Archer Avenue, Chicago 60616 Asian,Chinese,Ramen 2
1 1 Francesco's Hole In The Wall 179 4.0 (847) 272-0155 254 Skokie Boulevard 60062 Italian 6

In [5]:
B.head(2)


Out[5]:
ID name votes rating phone address zip cuisine
0 0 Patino’s Grill 35 5 (773) 280-9562 2943 W Irving Park Rd 60618 American (Traditional)
1 1 Grandma J’s Local Kitchen 188 4 (773) 227-3626 1552 N Kedzie Ave 60651 Breakfast & Brunch

In [6]:
# Stage2 Blocking: stored in tableC.csv
C = mg.read_csv('tableC.csv', ltable=A, rtable=B)
len(C)


Out[6]:
10630

In [7]:
# Plan

# 1. Sample candidate set --> S
# 2. Label S --> G
# 3. Split G into development set I, and evaluation set J
# 4. Select best learning-based matcher Y, using I 
# 5. Add triggers to Y ---> Z
# 6. Evaluate Z using J

In [9]:
# Sample the candidate set
S = mg.sample_table(C, 450)

In [10]:
# Label S
G = mg.label_table(S, 'gold')
G.to_csv('label.csv')


Out[10]:
True

In [11]:
# User labeled table
G = mg.read_csv('gold_init.csv', ltable=A, rtable=B)
len(G)


Out[11]:
450

In [12]:
# Split G into development (I) and evaluation (J)
IJ = mg.train_test_split(G, train_proportion=0.7)
I = IJ['train']
J = IJ['test']
(len(I), len(J))


Out[12]:
(315, 135)

In [13]:
# Selecting the best learning-based matcher using I

# Plan 
# 1. Create a set of ML-matchers
# 2. Generate features --> feature_table
# 3. Extract feature vectors using I and feature_table
# 4. Select best learning-based matcher using CV
# 5. Debug the selected matcher (and repeat the above steps)

In [14]:
# Create a set of ML-matchers
dt = mg.DTMatcher(name='DecisionTree', random_state=0)
svm = mg.SVMMatcher(name='SVM')
rf = mg.RFMatcher(name='RF', random_state=0)
nb = mg.NBMatcher(name='NB')
lg = mg.LogRegMatcher(name='LogReg')
ln = mg.LinRegMatcher(name='LinReg')

In [15]:
# Names of the matchers
(dt.name, svm.name, rf.name, nb.name, lg.name, ln.name)


Out[15]:
('DecisionTree', 'SVM', 'RF', 'NB', 'LogReg', 'LinReg')

In [16]:
# Generate features
feat_table = mg.get_features_for_matching(A, B)
feat_table


Out[16]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source
0 ID_ID_exm ID ID None None exact_match <function ID_ID_exm at 0x7feee96ab500> from magellan.feature.simfunctions import *\nf...
1 ID_ID_anm ID ID None None abs_norm <function ID_ID_anm at 0x7feee96ab230> from magellan.feature.simfunctions import *\nf...
2 ID_ID_lev ID ID None None lev <function ID_ID_lev at 0x7feee96ab848> from magellan.feature.simfunctions import *\nf...
3 name_name_jac_qgm_3_qgm_3 name name qgm_3 qgm_3 jaccard <function name_name_jac_qgm_3_qgm_3 at 0x7feee... from magellan.feature.simfunctions import *\nf...
4 name_name_cos_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 cosine <function name_name_cos_dlm_dc0_dlm_dc0 at 0x7... from magellan.feature.simfunctions import *\nf...
5 name_name_jac_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 jaccard <function name_name_jac_dlm_dc0_dlm_dc0 at 0x7... from magellan.feature.simfunctions import *\nf...
6 name_name_mel name name None None monge_elkan <function name_name_mel at 0x7feee96aba28> from magellan.feature.simfunctions import *\nf...
7 name_name_lev name name None None lev <function name_name_lev at 0x7feee96abaa0> from magellan.feature.simfunctions import *\nf...
8 name_name_nmw name name None None needleman_wunsch <function name_name_nmw at 0x7feee96abb18> from magellan.feature.simfunctions import *\nf...
9 name_name_sw name name None None smith_waterman <function name_name_sw at 0x7feee96abb90> from magellan.feature.simfunctions import *\nf...
10 name_name_swg name name None None smith_waterman_gotoh <function name_name_swg at 0x7feee96abc08> from magellan.feature.simfunctions import *\nf...
11 votes_votes_exm votes votes None None exact_match <function votes_votes_exm at 0x7feee96abc80> from magellan.feature.simfunctions import *\nf...
12 votes_votes_anm votes votes None None abs_norm <function votes_votes_anm at 0x7feee96abcf8> from magellan.feature.simfunctions import *\nf...
13 votes_votes_lev votes votes None None lev <function votes_votes_lev at 0x7feee96abd70> from magellan.feature.simfunctions import *\nf...
14 rating_rating_exm rating rating None None exact_match <function rating_rating_exm at 0x7feee96abde8> from magellan.feature.simfunctions import *\nf...
15 rating_rating_anm rating rating None None abs_norm <function rating_rating_anm at 0x7feee96abe60> from magellan.feature.simfunctions import *\nf...
16 rating_rating_lev rating rating None None lev <function rating_rating_lev at 0x7feee96abed8> from magellan.feature.simfunctions import *\nf...
17 phone_phone_jac_qgm_3_qgm_3 phone phone qgm_3 qgm_3 jaccard <function phone_phone_jac_qgm_3_qgm_3 at 0x7fe... from magellan.feature.simfunctions import *\nf...
18 phone_phone_cos_dlm_dc0_dlm_dc0 phone phone dlm_dc0 dlm_dc0 cosine <function phone_phone_cos_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
19 phone_phone_jac_dlm_dc0_dlm_dc0 phone phone dlm_dc0 dlm_dc0 jaccard <function phone_phone_jac_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
20 phone_phone_mel phone phone None None monge_elkan <function phone_phone_mel at 0x7feee913f050> from magellan.feature.simfunctions import *\nf...
21 phone_phone_lev phone phone None None lev <function phone_phone_lev at 0x7feee913f0c8> from magellan.feature.simfunctions import *\nf...
22 phone_phone_nmw phone phone None None needleman_wunsch <function phone_phone_nmw at 0x7feee913f140> from magellan.feature.simfunctions import *\nf...
23 phone_phone_sw phone phone None None smith_waterman <function phone_phone_sw at 0x7feee913f1b8> from magellan.feature.simfunctions import *\nf...
24 phone_phone_swg phone phone None None smith_waterman_gotoh <function phone_phone_swg at 0x7feee913f230> from magellan.feature.simfunctions import *\nf...
25 address_address_jac_qgm_3_qgm_3 address address qgm_3 qgm_3 jaccard <function address_address_jac_qgm_3_qgm_3 at 0... from magellan.feature.simfunctions import *\nf...
26 address_address_cos_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 cosine <function address_address_cos_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
27 address_address_jac_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 jaccard <function address_address_jac_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
28 address_address_mel address address None None monge_elkan <function address_address_mel at 0x7feee913f410> from magellan.feature.simfunctions import *\nf...
29 address_address_lev address address None None lev <function address_address_lev at 0x7feee913f488> from magellan.feature.simfunctions import *\nf...
30 address_address_nmw address address None None needleman_wunsch <function address_address_nmw at 0x7feee913f500> from magellan.feature.simfunctions import *\nf...
31 address_address_sw address address None None smith_waterman <function address_address_sw at 0x7feee913f578> from magellan.feature.simfunctions import *\nf...
32 address_address_swg address address None None smith_waterman_gotoh <function address_address_swg at 0x7feee913f5f0> from magellan.feature.simfunctions import *\nf...
33 zip_zip_exm zip zip None None exact_match <function zip_zip_exm at 0x7feee913f668> from magellan.feature.simfunctions import *\nf...
34 zip_zip_anm zip zip None None abs_norm <function zip_zip_anm at 0x7feee913f6e0> from magellan.feature.simfunctions import *\nf...
35 zip_zip_lev zip zip None None lev <function zip_zip_lev at 0x7feee913f758> from magellan.feature.simfunctions import *\nf...
36 cuisine_cuisine_jac_qgm_3_qgm_3 cuisine cuisine qgm_3 qgm_3 jaccard <function cuisine_cuisine_jac_qgm_3_qgm_3 at 0... from magellan.feature.simfunctions import *\nf...
37 cuisine_cuisine_cos_dlm_dc0_dlm_dc0 cuisine cuisine dlm_dc0 dlm_dc0 cosine <function cuisine_cuisine_cos_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
38 cuisine_cuisine_jac_dlm_dc0_dlm_dc0 cuisine cuisine dlm_dc0 dlm_dc0 jaccard <function cuisine_cuisine_jac_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
39 cuisine_cuisine_mel cuisine cuisine None None monge_elkan <function cuisine_cuisine_mel at 0x7feee913f938> from magellan.feature.simfunctions import *\nf...
40 cuisine_cuisine_lev cuisine cuisine None None lev <function cuisine_cuisine_lev at 0x7feee913f9b0> from magellan.feature.simfunctions import *\nf...
41 cuisine_cuisine_nmw cuisine cuisine None None needleman_wunsch <function cuisine_cuisine_nmw at 0x7feee913fa28> from magellan.feature.simfunctions import *\nf...
42 cuisine_cuisine_sw cuisine cuisine None None smith_waterman <function cuisine_cuisine_sw at 0x7feee913faa0> from magellan.feature.simfunctions import *\nf...
43 cuisine_cuisine_swg cuisine cuisine None None smith_waterman_gotoh <function cuisine_cuisine_swg at 0x7feee913fb18> from magellan.feature.simfunctions import *\nf...

In [17]:
mg._match_s


Out[17]:
{'abs_norm': <function magellan.feature.simfunctions.abs_norm>,
 'cosine': <function magellan.feature.simfunctions.cosine>,
 'exact_match': <function magellan.feature.simfunctions.exact_match>,
 'jaccard': <function magellan.feature.simfunctions.jaccard>,
 'jaro': <function magellan.feature.simfunctions.jaro>,
 'jaro_winkler': <function magellan.feature.simfunctions.jaro_winkler>,
 'lev': <function magellan.feature.simfunctions.lev>,
 'monge_elkan': <function magellan.feature.simfunctions.monge_elkan>,
 'needleman_wunsch': <function magellan.feature.simfunctions.needleman_wunsch>,
 'rel_diff': <function magellan.feature.simfunctions.rel_diff>,
 'smith_waterman': <function magellan.feature.simfunctions.smith_waterman>,
 'smith_waterman_gotoh': <function magellan.feature.simfunctions.smith_waterman_gotoh>,
 'soundex': <function magellan.feature.simfunctions.soundex>}

In [18]:
mg._match_t


Out[18]:
{'dlm_dc0': <function magellan.feature.tokenizers.tok_delim>,
 'qgm_2': <function magellan.feature.tokenizers.tok_qgram>,
 'qgm_3': <function magellan.feature.tokenizers.tok_qgram>}

In [19]:
mg._match_c['corres']


Out[19]:
[('ID', 'ID'),
 ('name', 'name'),
 ('votes', 'votes'),
 ('rating', 'rating'),
 ('phone', 'phone'),
 ('address', 'address'),
 ('zip', 'zip'),
 ('cuisine', 'cuisine')]

In [20]:
# Select 'address' related features
feat_subset_iter1 = feat_table[25:33]
feat_subset_iter1


Out[20]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source
25 address_address_jac_qgm_3_qgm_3 address address qgm_3 qgm_3 jaccard <function address_address_jac_qgm_3_qgm_3 at 0... from magellan.feature.simfunctions import *\nf...
26 address_address_cos_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 cosine <function address_address_cos_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
27 address_address_jac_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 jaccard <function address_address_jac_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
28 address_address_mel address address None None monge_elkan <function address_address_mel at 0x7feee913f410> from magellan.feature.simfunctions import *\nf...
29 address_address_lev address address None None lev <function address_address_lev at 0x7feee913f488> from magellan.feature.simfunctions import *\nf...
30 address_address_nmw address address None None needleman_wunsch <function address_address_nmw at 0x7feee913f500> from magellan.feature.simfunctions import *\nf...
31 address_address_sw address address None None smith_waterman <function address_address_sw at 0x7feee913f578> from magellan.feature.simfunctions import *\nf...
32 address_address_swg address address None None smith_waterman_gotoh <function address_address_swg at 0x7feee913f5f0> from magellan.feature.simfunctions import *\nf...

In [21]:
# Get feature vectors
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter1, attrs_after='gold')

In [22]:
# impute K
K.fillna(0, inplace=True)
K.head()


Out[22]:
_id ltable.id rtable.id address_address_jac_qgm_3_qgm_3 address_address_cos_dlm_dc0_dlm_dc0 address_address_jac_dlm_dc0_dlm_dc0 address_address_mel address_address_lev address_address_nmw address_address_sw address_address_swg gold
0 0 4451 2989 1.000000 1.00000 1.0 1.000000 1.000000 1.000000 1.000000 1.000000 1
1 1 4303 1976 0.560000 0.57735 0.4 0.800000 0.739130 0.847826 0.800000 0.800000 0
2 2 1405 645 0.026316 0.00000 0.0 0.200000 0.214286 0.500000 0.200000 0.200000 0
3 3 3932 1869 0.600000 0.75000 0.6 0.777778 0.777778 0.888889 0.805556 0.777778 0
4 4 3450 1580 0.684211 0.75000 0.6 0.833333 0.833333 0.916667 0.833333 0.833333 0

In [28]:
# select the best ML matcher using CV
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K, 
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold', metric='precision', random_state=0)

In [29]:
result['selected_matcher']


Out[29]:
<magellan.matcher.logregmatcher.LogRegMatcher at 0x7feee95ebb50>

In [30]:
result['cv_stats']


Out[30]:
Name Matcher Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score
0 DecisionTree <magellan.matcher.dtmatcher.DTMatcher object a... 5 0.909091 0.615385 0.600000 0.777778 0.882353 0.756921
1 RF <magellan.matcher.rfmatcher.RFMatcher object a... 5 0.900000 0.800000 0.857143 0.714286 0.937500 0.841786
2 SVM <magellan.matcher.svmmatcher.SVMMatcher object... 5 1.000000 1.000000 1.000000 0.714286 0.000000 0.742857
3 NB <magellan.matcher.nbmatcher.NBMatcher object a... 5 0.777778 0.380952 0.533333 0.312500 0.809524 0.562817
4 LogReg <magellan.matcher.logregmatcher.LogRegMatcher ... 5 1.000000 0.875000 0.800000 0.714286 1.000000 0.877857
5 LinReg <magellan.matcher.linregmatcher.LinRegMatcher ... 5 0.846154 0.666667 0.875000 0.545455 0.944444 0.775544

In [32]:
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
# 1. Train RF using U
# 2. Predict V using DT
# 3. Evaluate predictions
dt.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 82.76% (24/29)
Recall : 75.0% (24/32)
F1 : 78.69%
False positives : 5 (out of 29 positive predictions)
False negatives : 8 (out of 129 negative predictions)

In [33]:
rf.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = rf.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 91.3% (21/23)
Recall : 65.63% (21/32)
F1 : 76.36%
False positives : 2 (out of 23 positive predictions)
False negatives : 11 (out of 135 negative predictions)

In [34]:
svm.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = svm.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 0.0% (0/0)
Recall : 0.0% (0/32)
F1 : 0.0%
False positives : 0 (out of 0 positive predictions)
False negatives : 32 (out of 158 negative predictions)

In [35]:
nb.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = nb.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 54.72% (29/53)
Recall : 90.63% (29/32)
F1 : 68.24%
False positives : 24 (out of 53 positive predictions)
False negatives : 3 (out of 105 negative predictions)

In [36]:
lg.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = lg.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (12/12)
Recall : 37.5% (12/32)
F1 : 54.55%
False positives : 0 (out of 12 positive predictions)
False negatives : 20 (out of 146 negative predictions)

In [38]:
ln.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = ln.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 78.13% (25/32)
Recall : 78.13% (25/32)
F1 : 78.13%
False positives : 7 (out of 32 positive predictions)
False negatives : 7 (out of 126 negative predictions)

In [ ]:
# Debug random forest
# Split feature vectors to train and test
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']

In [ ]:
mg.vis_debug_rf(rf, U, V, 
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold')
# Inspect false positives and take actions:
# removed 6 tuples as they had same address, but different name and phone numbers

In [41]:
#New G
G = mg.read_csv('gold_final.csv', ltable=A, rtable=B)
# Split G into development (I) and evaluation (J)
IJ = mg.train_test_split(G, train_proportion=0.7)
I = IJ['train']
J = IJ['test']
(len(G),len(I), len(J))


Out[41]:
(444, 310, 134)

In [42]:
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter1, attrs_after='gold')
K.fillna(0, inplace=True)
K.head()


Out[42]:
_id ltable.id rtable.id address_address_jac_qgm_3_qgm_3 address_address_cos_dlm_dc0_dlm_dc0 address_address_jac_dlm_dc0_dlm_dc0 address_address_mel address_address_lev address_address_nmw address_address_sw address_address_swg gold
0 0 5208 2507 0.514286 0.755929 0.571429 1.000000 0.540541 0.540541 1.000000 1.000000 1
1 1 5497 2616 0.500000 0.666667 0.500000 0.714286 0.785714 0.892857 0.714286 0.714286 0
2 2 3753 2373 0.666667 0.750000 0.600000 0.836364 0.863636 0.931818 0.840909 0.836364 0
3 3 3951 1632 0.550000 0.333333 0.200000 0.812500 0.684211 0.763158 0.812500 0.812500 0
4 4 4350 3203 0.533333 0.750000 0.600000 0.769231 0.785714 0.892857 0.846154 0.769231 0

In [43]:
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K,
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold', metric='precision', random_state=0)
result['selected_matcher']


Out[43]:
<magellan.matcher.logregmatcher.LogRegMatcher at 0x7feee95ebb50>

In [44]:
result['cv_stats']


Out[44]:
Name Matcher Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score
0 DecisionTree <magellan.matcher.dtmatcher.DTMatcher object a... 5 0.833333 0.909091 1.000000 0.800000 0.894737 0.887432
1 RF <magellan.matcher.rfmatcher.RFMatcher object a... 5 0.909091 0.900000 1.000000 0.800000 1.000000 0.921818
2 SVM <magellan.matcher.svmmatcher.SVMMatcher object... 5 0.916667 0.900000 1.000000 0.888889 0.809524 0.903016
3 NB <magellan.matcher.nbmatcher.NBMatcher object a... 5 0.600000 0.714286 0.736842 0.588235 0.571429 0.642158
4 LogReg <magellan.matcher.logregmatcher.LogRegMatcher ... 5 1.000000 1.000000 1.000000 0.875000 0.888889 0.952778
5 LinReg <magellan.matcher.linregmatcher.LinRegMatcher ... 5 0.800000 0.923077 1.000000 0.833333 0.739130 0.859108

In [45]:
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
# 1. Train RF using U
# 2. Predict V using DT
# 3. Evaluate predictions
rf.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = rf.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (28/28)
Recall : 77.78% (28/36)
F1 : 87.5%
False positives : 0 (out of 28 positive predictions)
False negatives : 8 (out of 127 negative predictions)

In [46]:
dt.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 96.55% (28/29)
Recall : 77.78% (28/36)
F1 : 86.15%
False positives : 1 (out of 29 positive predictions)
False negatives : 8 (out of 126 negative predictions)

In [47]:
svm.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = svm.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (10/10)
Recall : 27.78% (10/36)
F1 : 43.48%
False positives : 0 (out of 10 positive predictions)
False negatives : 26 (out of 145 negative predictions)

In [48]:
nb.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = nb.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 65.22% (30/46)
Recall : 83.33% (30/36)
F1 : 73.17%
False positives : 16 (out of 46 positive predictions)
False negatives : 6 (out of 109 negative predictions)

In [49]:
lg.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = lg.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (21/21)
Recall : 58.33% (21/36)
F1 : 73.68%
False positives : 0 (out of 21 positive predictions)
False negatives : 15 (out of 134 negative predictions)

In [50]:
ln.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = ln.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 88.89% (32/36)
Recall : 88.89% (32/36)
F1 : 88.89%
False positives : 4 (out of 36 positive predictions)
False negatives : 4 (out of 119 negative predictions)

In [51]:
# Include feature for 'phone'

In [52]:
feat_table


Out[52]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source
0 ID_ID_exm ID ID None None exact_match <function ID_ID_exm at 0x7feee96ab500> from magellan.feature.simfunctions import *\nf...
1 ID_ID_anm ID ID None None abs_norm <function ID_ID_anm at 0x7feee96ab230> from magellan.feature.simfunctions import *\nf...
2 ID_ID_lev ID ID None None lev <function ID_ID_lev at 0x7feee96ab848> from magellan.feature.simfunctions import *\nf...
3 name_name_jac_qgm_3_qgm_3 name name qgm_3 qgm_3 jaccard <function name_name_jac_qgm_3_qgm_3 at 0x7feee... from magellan.feature.simfunctions import *\nf...
4 name_name_cos_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 cosine <function name_name_cos_dlm_dc0_dlm_dc0 at 0x7... from magellan.feature.simfunctions import *\nf...
5 name_name_jac_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 jaccard <function name_name_jac_dlm_dc0_dlm_dc0 at 0x7... from magellan.feature.simfunctions import *\nf...
6 name_name_mel name name None None monge_elkan <function name_name_mel at 0x7feee96aba28> from magellan.feature.simfunctions import *\nf...
7 name_name_lev name name None None lev <function name_name_lev at 0x7feee96abaa0> from magellan.feature.simfunctions import *\nf...
8 name_name_nmw name name None None needleman_wunsch <function name_name_nmw at 0x7feee96abb18> from magellan.feature.simfunctions import *\nf...
9 name_name_sw name name None None smith_waterman <function name_name_sw at 0x7feee96abb90> from magellan.feature.simfunctions import *\nf...
10 name_name_swg name name None None smith_waterman_gotoh <function name_name_swg at 0x7feee96abc08> from magellan.feature.simfunctions import *\nf...
11 votes_votes_exm votes votes None None exact_match <function votes_votes_exm at 0x7feee96abc80> from magellan.feature.simfunctions import *\nf...
12 votes_votes_anm votes votes None None abs_norm <function votes_votes_anm at 0x7feee96abcf8> from magellan.feature.simfunctions import *\nf...
13 votes_votes_lev votes votes None None lev <function votes_votes_lev at 0x7feee96abd70> from magellan.feature.simfunctions import *\nf...
14 rating_rating_exm rating rating None None exact_match <function rating_rating_exm at 0x7feee96abde8> from magellan.feature.simfunctions import *\nf...
15 rating_rating_anm rating rating None None abs_norm <function rating_rating_anm at 0x7feee96abe60> from magellan.feature.simfunctions import *\nf...
16 rating_rating_lev rating rating None None lev <function rating_rating_lev at 0x7feee96abed8> from magellan.feature.simfunctions import *\nf...
17 phone_phone_jac_qgm_3_qgm_3 phone phone qgm_3 qgm_3 jaccard <function phone_phone_jac_qgm_3_qgm_3 at 0x7fe... from magellan.feature.simfunctions import *\nf...
18 phone_phone_cos_dlm_dc0_dlm_dc0 phone phone dlm_dc0 dlm_dc0 cosine <function phone_phone_cos_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
19 phone_phone_jac_dlm_dc0_dlm_dc0 phone phone dlm_dc0 dlm_dc0 jaccard <function phone_phone_jac_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
20 phone_phone_mel phone phone None None monge_elkan <function phone_phone_mel at 0x7feee913f050> from magellan.feature.simfunctions import *\nf...
21 phone_phone_lev phone phone None None lev <function phone_phone_lev at 0x7feee913f0c8> from magellan.feature.simfunctions import *\nf...
22 phone_phone_nmw phone phone None None needleman_wunsch <function phone_phone_nmw at 0x7feee913f140> from magellan.feature.simfunctions import *\nf...
23 phone_phone_sw phone phone None None smith_waterman <function phone_phone_sw at 0x7feee913f1b8> from magellan.feature.simfunctions import *\nf...
24 phone_phone_swg phone phone None None smith_waterman_gotoh <function phone_phone_swg at 0x7feee913f230> from magellan.feature.simfunctions import *\nf...
25 address_address_jac_qgm_3_qgm_3 address address qgm_3 qgm_3 jaccard <function address_address_jac_qgm_3_qgm_3 at 0... from magellan.feature.simfunctions import *\nf...
26 address_address_cos_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 cosine <function address_address_cos_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
27 address_address_jac_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 jaccard <function address_address_jac_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
28 address_address_mel address address None None monge_elkan <function address_address_mel at 0x7feee913f410> from magellan.feature.simfunctions import *\nf...
29 address_address_lev address address None None lev <function address_address_lev at 0x7feee913f488> from magellan.feature.simfunctions import *\nf...
30 address_address_nmw address address None None needleman_wunsch <function address_address_nmw at 0x7feee913f500> from magellan.feature.simfunctions import *\nf...
31 address_address_sw address address None None smith_waterman <function address_address_sw at 0x7feee913f578> from magellan.feature.simfunctions import *\nf...
32 address_address_swg address address None None smith_waterman_gotoh <function address_address_swg at 0x7feee913f5f0> from magellan.feature.simfunctions import *\nf...
33 zip_zip_exm zip zip None None exact_match <function zip_zip_exm at 0x7feee913f668> from magellan.feature.simfunctions import *\nf...
34 zip_zip_anm zip zip None None abs_norm <function zip_zip_anm at 0x7feee913f6e0> from magellan.feature.simfunctions import *\nf...
35 zip_zip_lev zip zip None None lev <function zip_zip_lev at 0x7feee913f758> from magellan.feature.simfunctions import *\nf...
36 cuisine_cuisine_jac_qgm_3_qgm_3 cuisine cuisine qgm_3 qgm_3 jaccard <function cuisine_cuisine_jac_qgm_3_qgm_3 at 0... from magellan.feature.simfunctions import *\nf...
37 cuisine_cuisine_cos_dlm_dc0_dlm_dc0 cuisine cuisine dlm_dc0 dlm_dc0 cosine <function cuisine_cuisine_cos_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
38 cuisine_cuisine_jac_dlm_dc0_dlm_dc0 cuisine cuisine dlm_dc0 dlm_dc0 jaccard <function cuisine_cuisine_jac_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
39 cuisine_cuisine_mel cuisine cuisine None None monge_elkan <function cuisine_cuisine_mel at 0x7feee913f938> from magellan.feature.simfunctions import *\nf...
40 cuisine_cuisine_lev cuisine cuisine None None lev <function cuisine_cuisine_lev at 0x7feee913f9b0> from magellan.feature.simfunctions import *\nf...
41 cuisine_cuisine_nmw cuisine cuisine None None needleman_wunsch <function cuisine_cuisine_nmw at 0x7feee913fa28> from magellan.feature.simfunctions import *\nf...
42 cuisine_cuisine_sw cuisine cuisine None None smith_waterman <function cuisine_cuisine_sw at 0x7feee913faa0> from magellan.feature.simfunctions import *\nf...
43 cuisine_cuisine_swg cuisine cuisine None None smith_waterman_gotoh <function cuisine_cuisine_swg at 0x7feee913fb18> from magellan.feature.simfunctions import *\nf...

In [53]:
# Select 'phone'  + 'address' related features
feat_subset_iter2 = feat_table[17:33]

In [54]:
feat_subset_iter2


Out[54]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source
17 phone_phone_jac_qgm_3_qgm_3 phone phone qgm_3 qgm_3 jaccard <function phone_phone_jac_qgm_3_qgm_3 at 0x7fe... from magellan.feature.simfunctions import *\nf...
18 phone_phone_cos_dlm_dc0_dlm_dc0 phone phone dlm_dc0 dlm_dc0 cosine <function phone_phone_cos_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
19 phone_phone_jac_dlm_dc0_dlm_dc0 phone phone dlm_dc0 dlm_dc0 jaccard <function phone_phone_jac_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
20 phone_phone_mel phone phone None None monge_elkan <function phone_phone_mel at 0x7feee913f050> from magellan.feature.simfunctions import *\nf...
21 phone_phone_lev phone phone None None lev <function phone_phone_lev at 0x7feee913f0c8> from magellan.feature.simfunctions import *\nf...
22 phone_phone_nmw phone phone None None needleman_wunsch <function phone_phone_nmw at 0x7feee913f140> from magellan.feature.simfunctions import *\nf...
23 phone_phone_sw phone phone None None smith_waterman <function phone_phone_sw at 0x7feee913f1b8> from magellan.feature.simfunctions import *\nf...
24 phone_phone_swg phone phone None None smith_waterman_gotoh <function phone_phone_swg at 0x7feee913f230> from magellan.feature.simfunctions import *\nf...
25 address_address_jac_qgm_3_qgm_3 address address qgm_3 qgm_3 jaccard <function address_address_jac_qgm_3_qgm_3 at 0... from magellan.feature.simfunctions import *\nf...
26 address_address_cos_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 cosine <function address_address_cos_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
27 address_address_jac_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 jaccard <function address_address_jac_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
28 address_address_mel address address None None monge_elkan <function address_address_mel at 0x7feee913f410> from magellan.feature.simfunctions import *\nf...
29 address_address_lev address address None None lev <function address_address_lev at 0x7feee913f488> from magellan.feature.simfunctions import *\nf...
30 address_address_nmw address address None None needleman_wunsch <function address_address_nmw at 0x7feee913f500> from magellan.feature.simfunctions import *\nf...
31 address_address_sw address address None None smith_waterman <function address_address_sw at 0x7feee913f578> from magellan.feature.simfunctions import *\nf...
32 address_address_swg address address None None smith_waterman_gotoh <function address_address_swg at 0x7feee913f5f0> from magellan.feature.simfunctions import *\nf...

In [55]:
# Get new set of features
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter2, attrs_after='gold')
# impute K
K.fillna(0, inplace=True)

In [56]:
# Split feature vectors into U and V
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']

In [57]:
# Check whether the added features improves the accuracy in the test set.
# Steps
# 1. Train RF using U
# 2. Predict V using DT
# 3. Evaluate predictions

In [58]:
# Train RF using U
dt.fit(table=U, 
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
       target_attr='gold')

In [59]:
# Predict V using rf
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
              append=True, target_attr='predicted', inplace=False)

In [60]:
# Evaluate the predictions
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (35/35)
Recall : 97.22% (35/36)
F1 : 98.59%
False positives : 0 (out of 35 positive predictions)
False negatives : 1 (out of 120 negative predictions)

In [63]:
rf.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = rf.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (35/35)
Recall : 97.22% (35/36)
F1 : 98.59%
False positives : 0 (out of 35 positive predictions)
False negatives : 1 (out of 120 negative predictions)

In [64]:
svm.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = svm.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (33/33)
Recall : 91.67% (33/36)
F1 : 95.65%
False positives : 0 (out of 33 positive predictions)
False negatives : 3 (out of 122 negative predictions)

In [65]:
nb.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = nb.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 94.44% (34/36)
Recall : 94.44% (34/36)
F1 : 94.44%
False positives : 2 (out of 36 positive predictions)
False negatives : 2 (out of 119 negative predictions)

In [66]:
lg.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = lg.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (33/33)
Recall : 91.67% (33/36)
F1 : 95.65%
False positives : 0 (out of 33 positive predictions)
False negatives : 3 (out of 122 negative predictions)

In [67]:
ln.fit(table=U,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
P = ln.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (34/34)
Recall : 94.44% (34/36)
F1 : 97.14%
False positives : 0 (out of 34 positive predictions)
False negatives : 2 (out of 121 negative predictions)

In [68]:
# Apply cross validation to find if there is a better matcher
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K, 
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold', metric='f1')

In [69]:
result['cv_stats']


Out[69]:
Name Matcher Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score
0 DecisionTree <magellan.matcher.dtmatcher.DTMatcher object a... 5 1.000000 0.956522 0.928571 1.000000 0.960000 0.969019
1 RF <magellan.matcher.rfmatcher.RFMatcher object a... 5 0.965517 0.888889 0.960000 0.967742 1.000000 0.956430
2 SVM <magellan.matcher.svmmatcher.SVMMatcher object... 5 0.857143 0.888889 0.969697 1.000000 1.000000 0.943146
3 NB <magellan.matcher.nbmatcher.NBMatcher object a... 5 0.969697 0.952381 0.956522 0.888889 0.960000 0.945498
4 LogReg <magellan.matcher.logregmatcher.LogRegMatcher ... 5 0.967742 0.947368 0.962963 1.000000 0.918919 0.959398
5 LinReg <magellan.matcher.linregmatcher.LinRegMatcher ... 5 0.941176 0.903226 0.965517 1.000000 0.972973 0.956578

In [70]:
# Select DT as the best matcher -- Y
# Use phone + address related features

In [71]:
# Add triggers on top of Y

# 1. Split K into U and V
# 2. Use U,V  + Y to write triggers (examine fp, fn).

In [72]:
# Split feature vectors to U and V
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']

In [73]:
# Invoke debug interface to check FP and FN
mg.vis_debug_dt(dt, U, V, 
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold')

In [74]:
feat_table


Out[74]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source
0 ID_ID_exm ID ID None None exact_match <function ID_ID_exm at 0x7feee96ab500> from magellan.feature.simfunctions import *\nf...
1 ID_ID_anm ID ID None None abs_norm <function ID_ID_anm at 0x7feee96ab230> from magellan.feature.simfunctions import *\nf...
2 ID_ID_lev ID ID None None lev <function ID_ID_lev at 0x7feee96ab848> from magellan.feature.simfunctions import *\nf...
3 name_name_jac_qgm_3_qgm_3 name name qgm_3 qgm_3 jaccard <function name_name_jac_qgm_3_qgm_3 at 0x7feee... from magellan.feature.simfunctions import *\nf...
4 name_name_cos_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 cosine <function name_name_cos_dlm_dc0_dlm_dc0 at 0x7... from magellan.feature.simfunctions import *\nf...
5 name_name_jac_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 jaccard <function name_name_jac_dlm_dc0_dlm_dc0 at 0x7... from magellan.feature.simfunctions import *\nf...
6 name_name_mel name name None None monge_elkan <function name_name_mel at 0x7feee96aba28> from magellan.feature.simfunctions import *\nf...
7 name_name_lev name name None None lev <function name_name_lev at 0x7feee96abaa0> from magellan.feature.simfunctions import *\nf...
8 name_name_nmw name name None None needleman_wunsch <function name_name_nmw at 0x7feee96abb18> from magellan.feature.simfunctions import *\nf...
9 name_name_sw name name None None smith_waterman <function name_name_sw at 0x7feee96abb90> from magellan.feature.simfunctions import *\nf...
10 name_name_swg name name None None smith_waterman_gotoh <function name_name_swg at 0x7feee96abc08> from magellan.feature.simfunctions import *\nf...
11 votes_votes_exm votes votes None None exact_match <function votes_votes_exm at 0x7feee96abc80> from magellan.feature.simfunctions import *\nf...
12 votes_votes_anm votes votes None None abs_norm <function votes_votes_anm at 0x7feee96abcf8> from magellan.feature.simfunctions import *\nf...
13 votes_votes_lev votes votes None None lev <function votes_votes_lev at 0x7feee96abd70> from magellan.feature.simfunctions import *\nf...
14 rating_rating_exm rating rating None None exact_match <function rating_rating_exm at 0x7feee96abde8> from magellan.feature.simfunctions import *\nf...
15 rating_rating_anm rating rating None None abs_norm <function rating_rating_anm at 0x7feee96abe60> from magellan.feature.simfunctions import *\nf...
16 rating_rating_lev rating rating None None lev <function rating_rating_lev at 0x7feee96abed8> from magellan.feature.simfunctions import *\nf...
17 phone_phone_jac_qgm_3_qgm_3 phone phone qgm_3 qgm_3 jaccard <function phone_phone_jac_qgm_3_qgm_3 at 0x7fe... from magellan.feature.simfunctions import *\nf...
18 phone_phone_cos_dlm_dc0_dlm_dc0 phone phone dlm_dc0 dlm_dc0 cosine <function phone_phone_cos_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
19 phone_phone_jac_dlm_dc0_dlm_dc0 phone phone dlm_dc0 dlm_dc0 jaccard <function phone_phone_jac_dlm_dc0_dlm_dc0 at 0... from magellan.feature.simfunctions import *\nf...
20 phone_phone_mel phone phone None None monge_elkan <function phone_phone_mel at 0x7feee913f050> from magellan.feature.simfunctions import *\nf...
21 phone_phone_lev phone phone None None lev <function phone_phone_lev at 0x7feee913f0c8> from magellan.feature.simfunctions import *\nf...
22 phone_phone_nmw phone phone None None needleman_wunsch <function phone_phone_nmw at 0x7feee913f140> from magellan.feature.simfunctions import *\nf...
23 phone_phone_sw phone phone None None smith_waterman <function phone_phone_sw at 0x7feee913f1b8> from magellan.feature.simfunctions import *\nf...
24 phone_phone_swg phone phone None None smith_waterman_gotoh <function phone_phone_swg at 0x7feee913f230> from magellan.feature.simfunctions import *\nf...
25 address_address_jac_qgm_3_qgm_3 address address qgm_3 qgm_3 jaccard <function address_address_jac_qgm_3_qgm_3 at 0... from magellan.feature.simfunctions import *\nf...
26 address_address_cos_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 cosine <function address_address_cos_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
27 address_address_jac_dlm_dc0_dlm_dc0 address address dlm_dc0 dlm_dc0 jaccard <function address_address_jac_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
28 address_address_mel address address None None monge_elkan <function address_address_mel at 0x7feee913f410> from magellan.feature.simfunctions import *\nf...
29 address_address_lev address address None None lev <function address_address_lev at 0x7feee913f488> from magellan.feature.simfunctions import *\nf...
30 address_address_nmw address address None None needleman_wunsch <function address_address_nmw at 0x7feee913f500> from magellan.feature.simfunctions import *\nf...
31 address_address_sw address address None None smith_waterman <function address_address_sw at 0x7feee913f578> from magellan.feature.simfunctions import *\nf...
32 address_address_swg address address None None smith_waterman_gotoh <function address_address_swg at 0x7feee913f5f0> from magellan.feature.simfunctions import *\nf...
33 zip_zip_exm zip zip None None exact_match <function zip_zip_exm at 0x7feee913f668> from magellan.feature.simfunctions import *\nf...
34 zip_zip_anm zip zip None None abs_norm <function zip_zip_anm at 0x7feee913f6e0> from magellan.feature.simfunctions import *\nf...
35 zip_zip_lev zip zip None None lev <function zip_zip_lev at 0x7feee913f758> from magellan.feature.simfunctions import *\nf...
36 cuisine_cuisine_jac_qgm_3_qgm_3 cuisine cuisine qgm_3 qgm_3 jaccard <function cuisine_cuisine_jac_qgm_3_qgm_3 at 0... from magellan.feature.simfunctions import *\nf...
37 cuisine_cuisine_cos_dlm_dc0_dlm_dc0 cuisine cuisine dlm_dc0 dlm_dc0 cosine <function cuisine_cuisine_cos_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
38 cuisine_cuisine_jac_dlm_dc0_dlm_dc0 cuisine cuisine dlm_dc0 dlm_dc0 jaccard <function cuisine_cuisine_jac_dlm_dc0_dlm_dc0 ... from magellan.feature.simfunctions import *\nf...
39 cuisine_cuisine_mel cuisine cuisine None None monge_elkan <function cuisine_cuisine_mel at 0x7feee913f938> from magellan.feature.simfunctions import *\nf...
40 cuisine_cuisine_lev cuisine cuisine None None lev <function cuisine_cuisine_lev at 0x7feee913f9b0> from magellan.feature.simfunctions import *\nf...
41 cuisine_cuisine_nmw cuisine cuisine None None needleman_wunsch <function cuisine_cuisine_nmw at 0x7feee913fa28> from magellan.feature.simfunctions import *\nf...
42 cuisine_cuisine_sw cuisine cuisine None None smith_waterman <function cuisine_cuisine_sw at 0x7feee913faa0> from magellan.feature.simfunctions import *\nf...
43 cuisine_cuisine_swg cuisine cuisine None None smith_waterman_gotoh <function cuisine_cuisine_swg at 0x7feee913fb18> from magellan.feature.simfunctions import *\nf...

In [87]:
# Add trigger: we have 0 FP and 2 FN, so we'll apply name matching rule
neg_trigger = mg.MatchTrigger()
neg_trigger.add_cond_rule('name_name_lev(ltuple, rtuple) < 0.1', feat_table)
neg_trigger.add_cond_status(True)
neg_trigger.add_action(0)


Out[87]:
True

In [88]:
# Check whether the added trigger improves the accuracy in the test set.
# Steps
# 1. Train DT using U
# 2. Predict V using DT
# 3. Apply trigger
# 4. Evaluate the result

In [89]:
# Train dt using U
dt.fit(table=U, 
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
       target_attr='gold')

In [90]:
# Predict V using dt
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
              append=True, target_attr='predicted', inplace=False)

In [91]:
# Apply trigger
Q = neg_trigger.execute(P, 'predicted', inplace=False)

In [93]:
# Evaluate the result
eval_result = mg.eval_matches(Q, 'predicted', 'gold')
mg.print_eval_summary(eval_result)


Precision : 97.22% (35/36)
Recall : 100.0% (35/35)
F1 : 98.59%
False positives : 1 (out of 36 positive predictions)
False negatives : 0 (out of 119 negative predictions)

In [94]:
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
mg.vis_debug_dt(dt, U, V,
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold')

In [126]:
# Do cross-validation for matcher + trigger using I (K)
result = mg.cv_matcher_and_trigger(dt, neg_trigger, table = K, 
                                   exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                                  target_attr='gold')


0%  100%
[#####] | ETA[sec]: 0.000 
Total time elapsed: 1.729 sec

In [127]:
result['cv_stats']


Out[127]:
Metric Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score
0 precision 5 1 1 0.941176 0.933333 1 0.974902
1 recall 5 1 1 0.941176 0.933333 1 0.974902
2 f1 5 1 1 0.941176 0.933333 1 0.974902

In [128]:
# Recall the cv for just the matcher (without trigger) was
result = mg.cv_matcher_and_trigger(dt, [], table = K, 
                                   exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
                                  target_attr='gold')


0%  100%
[#####] | ETA[sec]: 0.000 
Total time elapsed: 0.869 sec

In [129]:
result['cv_stats']


Out[129]:
Metric Num folds Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean score
0 precision 5 0.947368 1.000000 0.941176 1 0.909091 0.959527
1 recall 5 1.000000 0.933333 1.000000 1 0.833333 0.953333
2 f1 5 0.972973 0.965517 0.969697 1 0.869565 0.955550

In [130]:
# Now Z is DT (features: feat_subset_iter2) + neg_trigger
# Validate Z using J
# Steps
# 1. Extract feature vectors (using feat_subset_iter2) -- > M
# 2. Train DT using H (feature vectors generated using I)
# 3. Predict M using DT
# 4. Apply negative trigger
# 5. Evaluate the result

In [131]:
# Extract feature vectors
M = mg.extract_feature_vecs(J, feature_table=feat_subset_iter2, attrs_after='gold')
# Impute missing values
M.fillna(0, inplace=True)

In [132]:
# Train using feature vectors from I 
dt.fit(table=K, 
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
       target_attr='gold')

In [133]:
# Predict M 
N = dt.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'], 
              append=True, target_attr='predicted', inplace=False)

In [134]:
# Apply trigger
T = neg_trigger.execute(N, 'predicted', inplace=False)

In [135]:
# Evaluate the result
eval_result = mg.eval_matches(T, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (16/16)
Recall : 88.89% (16/18)
F1 : 94.12%
False positives : 0 (out of 16 positive predictions)
False negatives : 2 (out of 118 negative predictions)

In [136]:
dt.fit(table=K,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
N = dt.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (16/16)
Recall : 88.89% (16/18)
F1 : 94.12%
False positives : 0 (out of 16 positive predictions)
False negatives : 2 (out of 118 negative predictions)

In [137]:
rf.fit(table=K,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
N = rf.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (16/16)
Recall : 88.89% (16/18)
F1 : 94.12%
False positives : 0 (out of 16 positive predictions)
False negatives : 2 (out of 118 negative predictions)

In [138]:
svm.fit(table=K,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
N = svm.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (16/16)
Recall : 88.89% (16/18)
F1 : 94.12%
False positives : 0 (out of 16 positive predictions)
False negatives : 2 (out of 118 negative predictions)

In [139]:
nb.fit(table=K,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
N = nb.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 94.12% (16/17)
Recall : 88.89% (16/18)
F1 : 91.43%
False positives : 1 (out of 17 positive predictions)
False negatives : 2 (out of 117 negative predictions)

In [140]:
lg.fit(table=K,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
N = lg.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (16/16)
Recall : 88.89% (16/18)
F1 : 94.12%
False positives : 0 (out of 16 positive predictions)
False negatives : 2 (out of 118 negative predictions)

In [141]:
ln.fit(table=K,
       exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
       target_attr='gold')
N = ln.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
              append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)


Precision : 100.0% (16/16)
Recall : 88.89% (16/18)
F1 : 94.12%
False positives : 0 (out of 16 positive predictions)
False negatives : 2 (out of 118 negative predictions)

In [125]:
UV = mg.train_test_split(M, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
mg.vis_debug_dt(dt, U, V,
        exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
        target_attr='gold')

In [142]:
T.to_csv('stage3_final_matches.csv')


Out[142]:
True