In [1]:
# NOTE(review): RESOURCES is imported but never used in this notebook —
# candidate for removal.
from ipykernel.kernelspec import RESOURCES
# 'magellan' is the entity-matching toolkit (predecessor of py_entitymatching).
import magellan as mg
In [2]:
# Initialize the JVM used by magellan.
mg.init_jvm()
Out[2]:
In [3]:
# Input tables
# Read the two tables to be matched; 'ID' is the key column of each.
A = mg.read_csv('tableA.csv', key='ID')
B = mg.read_csv('tableB.csv', key='ID')
In [4]:
# Peek at table A.
A.head(2)
Out[4]:
In [5]:
# Peek at table B.
B.head(2)
Out[5]:
In [6]:
# Stage2 Blocking: stored in tableC.csv
# C is the candidate set produced by blocking; ltable/rtable record provenance.
C = mg.read_csv('tableC.csv', ltable=A, rtable=B)
len(C)
Out[6]:
In [7]:
# Plan
# 1. Sample candidate set --> S
# 2. Label S --> G
# 3. Split G into development set I, and evaluation set J
# 4. Select best learning-based matcher Y, using I
# 5. Add triggers to Y ---> Z
# 6. Evaluate Z using J
In [9]:
# Sample the candidate set
# Draw 450 candidate pairs from C for manual labeling.
S = mg.sample_table(C, 450)
In [10]:
# Label S
# Label the sampled pairs; the label attribute is named 'gold'.
G = mg.label_table(S, 'gold')
G.to_csv('label.csv')
Out[10]:
In [11]:
# User labeled table
# NOTE(review): labels were written to 'label.csv' above but read back from
# 'gold_init.csv' here — presumably the file was renamed/curated offline; verify.
G = mg.read_csv('gold_init.csv', ltable=A, rtable=B)
len(G)
Out[11]:
In [12]:
# Split G into development (I) and evaluation (J)
# NOTE(review): no random_state is passed, so this split is not reproducible
# across runs (later train_test_split calls do pass random_state=0).
IJ = mg.train_test_split(G, train_proportion=0.7)
I = IJ['train']
J = IJ['test']
(len(I), len(J))
Out[12]:
In [13]:
# Selecting the best learning-based matcher using I
# Plan
# 1. Create a set of ML-matchers
# 2. Generate features --> feature_table
# 3. Extract feature vectors using I and feature_table
# 4. Select best learning-based matcher using CV
# 5. Debug the selected matcher (and repeat the above steps)
In [14]:
# Create a set of ML-matchers
# random_state=0 fixes the stochastic learners (DT, RF) for reproducibility.
dt = mg.DTMatcher(name='DecisionTree', random_state=0)
svm = mg.SVMMatcher(name='SVM')
rf = mg.RFMatcher(name='RF', random_state=0)
nb = mg.NBMatcher(name='NB')
lg = mg.LogRegMatcher(name='LogReg')
ln = mg.LinRegMatcher(name='LinReg')
In [15]:
# Names of the matchers
(dt.name, svm.name, rf.name, nb.name, lg.name, ln.name)
Out[15]:
In [16]:
# Generate features
# Auto-generate similarity features from the schemas of A and B.
feat_table = mg.get_features_for_matching(A, B)
feat_table
Out[16]:
In [17]:
# Internal magellan state — presumably the similarity functions used for
# feature generation; verify against magellan docs.
mg._match_s
Out[17]:
In [18]:
# Internal magellan state — presumably the tokenizers used for feature
# generation; verify.
mg._match_t
Out[18]:
In [19]:
# Internal magellan state — presumably attribute correspondences inferred
# between A and B; verify.
mg._match_c['corres']
Out[19]:
In [20]:
# Select 'address' related features
# NOTE(review): positional slice [25:33] is brittle — it silently picks the
# wrong rows if feature-generation order changes; consider selecting by name.
feat_subset_iter1 = feat_table[25:33]
feat_subset_iter1
Out[20]:
In [21]:
# Get feature vectors
# Compute feature vectors over development set I, carrying the 'gold' label.
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter1, attrs_after='gold')
In [22]:
# impute K
# Replace missing feature values with 0 before training.
K.fillna(0, inplace=True)
K.head()
Out[22]:
In [28]:
# select the best ML matcher using CV
# Cross-validate all six matchers on K, optimizing precision.
# NOTE(review): tables were read with key='ID' but the excluded fk columns are
# lowercase 'ltable.id'/'rtable.id' — confirm the actual column names in K.
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold', metric='precision', random_state=0)
In [29]:
# The matcher that scored best under CV.
result['selected_matcher']
Out[29]:
In [30]:
# Per-fold CV statistics for all matchers.
result['cv_stats']
Out[30]:
In [32]:
# Split K into train (U) / test (V) halves to compare matchers on held-out data.
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
# For each matcher cell below:
# 1. Train the matcher using U
# 2. Predict V using the same matcher
# 3. Evaluate predictions
# (comments previously said "Train RF ... Predict using DT", which did not
# match the code — each cell trains and predicts with a single matcher.)
dt.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [33]:
# Random forest: same train/predict/evaluate pattern.
rf.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = rf.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [34]:
# SVM: same train/predict/evaluate pattern.
svm.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = svm.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [35]:
# Naive Bayes: same train/predict/evaluate pattern.
nb.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = nb.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [36]:
# Logistic regression: same train/predict/evaluate pattern.
lg.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = lg.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [38]:
# Linear regression: same train/predict/evaluate pattern.
ln.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = ln.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [ ]:
# Debug random forest
# Split feature vectors to train and test
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
In [ ]:
# Launch the visual debugger for RF to inspect misclassified pairs.
mg.vis_debug_rf(rf, U, V,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
# Inspect false positives and take actions:
# removed 6 tuples as they had same address, but different name and phone numbers
In [41]:
#New G
# Reload the gold data after removing the 6 problematic tuples found above.
G = mg.read_csv('gold_final.csv', ltable=A, rtable=B)
# Split G into development (I) and evaluation (J)
# NOTE(review): still no random_state on this split — not reproducible.
IJ = mg.train_test_split(G, train_proportion=0.7)
I = IJ['train']
J = IJ['test']
(len(G),len(I), len(J))
Out[41]:
In [42]:
# Re-extract feature vectors on the cleaned development set and impute.
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter1, attrs_after='gold')
K.fillna(0, inplace=True)
K.head()
Out[42]:
In [43]:
# Re-run matcher selection (CV, precision) on the cleaned data.
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold', metric='precision', random_state=0)
result['selected_matcher']
Out[43]:
In [44]:
# Per-fold CV statistics after cleaning.
result['cv_stats']
Out[44]:
In [45]:
# Fresh U/V split of the cleaned feature vectors.
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
# For each matcher cell below: train on U, predict V, evaluate.
# (comments previously said "Train RF ... Predict using DT"; the code trains
# and predicts with the same matcher in each cell.)
rf.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = rf.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [46]:
# Decision tree: same pattern.
dt.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [47]:
# SVM: same pattern.
svm.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = svm.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [48]:
# Naive Bayes: same pattern.
nb.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = nb.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [49]:
# Logistic regression: same pattern.
lg.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = lg.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [50]:
# Linear regression: same pattern.
ln.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = ln.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [51]:
# Include feature for 'phone'
In [52]:
# Review the full feature table to locate the phone features.
feat_table
Out[52]:
In [53]:
# Select 'phone' + 'address' related features
# NOTE(review): positional slice [17:33] again — brittle if the generated
# feature order changes; consider selecting by feature name.
feat_subset_iter2 = feat_table[17:33]
In [54]:
feat_subset_iter2
Out[54]:
In [55]:
# Get new set of features
# Re-extract feature vectors for I with the enlarged feature subset.
K = mg.extract_feature_vecs(I, feature_table=feat_subset_iter2, attrs_after='gold')
# impute K
K.fillna(0, inplace=True)
In [56]:
# Split feature vectors into U and V
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
In [57]:
# Check whether the added features improves the accuracy in the test set.
# Steps
# 1. Train DT using U
# 2. Predict V using DT
# 3. Evaluate predictions
# (comments corrected: they previously mixed up RF and DT, but the code
# below trains and predicts with dt only.)
In [58]:
# Train DT using U
dt.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [59]:
# Predict V using dt
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
In [60]:
# Evaluate the predictions
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [63]:
# Random forest with the phone+address feature set: train/predict/evaluate.
rf.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = rf.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [64]:
# SVM: same pattern.
svm.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = svm.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [65]:
# Naive Bayes: same pattern.
nb.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = nb.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [66]:
# Logistic regression: same pattern.
lg.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = lg.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [67]:
# Linear regression: same pattern.
ln.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
P = ln.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(P, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [68]:
# Apply cross validation to find if there is a better matcher
# NOTE(review): metric switched to 'f1' here (earlier selections used
# 'precision') and no random_state is passed — folds are not reproducible.
result = mg.select_matcher([dt, rf, svm, nb, lg, ln], table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold', metric='f1')
In [69]:
result['cv_stats']
Out[69]:
In [70]:
# Select DT as the best matcher -- Y
# Use phone + address related features
In [71]:
# Add triggers on top of Y
# 1. Split K into U and V
# 2. Use U,V + Y to write triggers (examine fp, fn).
In [72]:
# Split feature vectors to U and V
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
In [73]:
# Invoke debug interface to check FP and FN
mg.vis_debug_dt(dt, U, V,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [74]:
# Review the feature table to pick a rule for the trigger.
feat_table
Out[74]:
In [87]:
# Add trigger: we have 0 FP and 2 FN, so we'll apply name matching rule
# Negative trigger — presumably: when the rule fires (name Levenshtein
# similarity < 0.1) the action sets the prediction to 0 (non-match); verify
# against MatchTrigger docs.
neg_trigger = mg.MatchTrigger()
neg_trigger.add_cond_rule('name_name_lev(ltuple, rtuple) < 0.1', feat_table)
neg_trigger.add_cond_status(True)
neg_trigger.add_action(0)
Out[87]:
In [88]:
# Check whether the added trigger improves the accuracy in the test set.
# Steps
# 1. Train DT using U
# 2. Predict V using DT
# 3. Apply trigger
# 4. Evaluate the result
In [89]:
# Train dt using U
dt.fit(table=U,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [90]:
# Predict V using dt
P = dt.predict(table=V, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
In [91]:
# Apply trigger
# Produce Q with the 'predicted' column overridden where the rule fires.
Q = neg_trigger.execute(P, 'predicted', inplace=False)
In [93]:
# Evaluate the result
eval_result = mg.eval_matches(Q, 'predicted', 'gold')
mg.print_eval_summary(eval_result)
In [94]:
# Re-split and re-open the visual debugger to check remaining FPs/FNs.
UV = mg.train_test_split(K, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
mg.vis_debug_dt(dt, U, V,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [126]:
# Do cross-validation for matcher + trigger using I (K)
result = mg.cv_matcher_and_trigger(dt, neg_trigger, table = K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [127]:
result['cv_stats']
Out[127]:
In [128]:
# Recall the cv for just the matcher (without trigger) was
# (empty trigger list = plain DT baseline for comparison)
result = mg.cv_matcher_and_trigger(dt, [], table = K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [129]:
result['cv_stats']
Out[129]:
In [130]:
# Now Z is DT (features: feat_subset_iter2) + neg_trigger
# Validate Z using J
# Steps
# 1. Extract feature vectors (using feat_subset_iter2) -- > M
# 2. Train DT using K (feature vectors generated using I)
#    (comment fixed: previously said "using H", but no H exists — the code
#    below trains on K)
# 3. Predict M using DT
# 4. Apply negative trigger
# 5. Evaluate the result
In [131]:
# Extract feature vectors
M = mg.extract_feature_vecs(J, feature_table=feat_subset_iter2, attrs_after='gold')
# Impute missing values
M.fillna(0, inplace=True)
In [132]:
# Train using feature vectors from I
dt.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [133]:
# Predict M
N = dt.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
In [134]:
# Apply trigger
T = neg_trigger.execute(N, 'predicted', inplace=False)
In [135]:
# Evaluate the result
eval_result = mg.eval_matches(T, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [136]:
# Sanity comparison: each plain matcher (no trigger), trained on all of K,
# evaluated on the held-out feature vectors M.
dt.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = dt.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [137]:
# Random forest: same pattern.
rf.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = rf.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [138]:
# SVM: same pattern.
svm.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = svm.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [139]:
# Naive Bayes: same pattern.
nb.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = nb.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [140]:
# Logistic regression: same pattern.
lg.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = lg.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [141]:
# Linear regression: same pattern.
ln.fit(table=K,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
N = ln.predict(table=M, exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
append=True, target_attr='predicted', inplace=False)
eval_result = mg.eval_matches(N, 'gold', 'predicted')
mg.print_eval_summary(eval_result)
In [125]:
# NOTE(review): out-of-order cell (In[125] after In[141]) that debugs DT on a
# split of M — i.e. on evaluation data derived from J. Using evaluation data
# for debugging leaks the test set; consider removing this cell.
UV = mg.train_test_split(M, train_proportion=0.5, random_state=0)
U = UV['train']
V = UV['test']
mg.vis_debug_dt(dt, U, V,
exclude_attrs=['_id', 'ltable.id', 'rtable.id', 'gold'],
target_attr='gold')
In [142]:
# Persist the final stage-3 matches (DT predictions with the trigger applied).
T.to_csv('stage3_final_matches.csv')
Out[142]: