In [34]:
import py_entitymatching as em
import os

In [35]:
# Base directory that the dataset paths in this notebook are built from.
path = '../Desktop/cs838_stage3/'

movies_csv = path+'datasets/tmp_movies_8.csv'
tracks_csv = path+'datasets/tmp_tracks_8.csv'
labeled_csv = path+'datasets/sampled_8.csv'

# The two input tables are each read with 'id' as their key attribute.
sampled_movies = em.read_csv_metadata(movies_csv, key='id')
sampled_tracks = em.read_csv_metadata(tracks_csv, key='id')

# The labeled pairs are read with the two tables above registered as their
# left and right tables.
tbl_labeled = em.read_csv_metadata(
    labeled_csv,
    ltable=sampled_movies,
    rtable=sampled_tracks)

In [36]:
# Split the labeled pairs 70/30: the 'train' part becomes the development set
# and the 'test' part becomes the evaluation set.
# random_state is pinned so that a fresh Restart & Run All draws the same
# partition; without it every run produces a different split and none of the
# downstream cross-validation / evaluation numbers can be reproduced.
train_test = em.split_train_test(tbl_labeled, train_proportion=0.7,
                                 random_state=0)

dev_set = train_test['train']
eval_set = train_test['test']

# Write both parts out under the same datasets directory as the inputs.
em.to_csv_metadata(dev_set, path+'datasets/dev_set.csv')
em.to_csv_metadata(eval_set, path+'datasets/eval_set.csv')


Out[36]:
True

In [37]:
# Attribute types of each input table and the correspondence between them.
atypes1 = em.get_attr_types(sampled_movies)
atypes2 = em.get_attr_types(sampled_tracks)
match_c = em.get_attr_corres(sampled_movies, sampled_tracks)

# Tokenizers and similarity functions offered by the library for matching.
match_t = em.get_tokenizers_for_matching()
match_s = em.get_sim_funs_for_matching()

# Feature table built from all of the above; later cells pass it to
# em.extract_feature_vecs as `feature_table`.
match_f = em.get_features(
    sampled_movies, sampled_tracks,
    atypes1, atypes2,
    match_c,
    match_t, match_s)

In [38]:
# Feature vectors for the development set; the 'label' column is carried
# along after the feature columns.
H = em.extract_feature_vecs(
    dev_set,
    feature_table=match_f,
    attrs_after='label',
    show_progress=False)

# Missing feature values become 0. Done in place so that `H` stays the same
# object (NOTE(review): presumably needed to keep the library's table
# metadata attached to it -- confirm before switching to a reassignment).
H.fillna(value=0, inplace=True)

In [39]:
# One shared seed for every matcher that accepts a random_state.
matcher_seed = 0

# Candidate matchers that take a seed.
dt = em.DTMatcher(name='DecisionTree', random_state=matcher_seed)
svm = em.SVMMatcher(name='SVM', random_state=matcher_seed)
rf = em.RFMatcher(name='RF', random_state=matcher_seed)
lg = em.LogRegMatcher(name='LogReg', random_state=matcher_seed)

# Candidate matchers constructed without a seed.
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

In [40]:
# Compare all candidate matchers with 5-fold cross validation on the
# development-set feature vectors, once per metric (precision, recall, F1),
# printing the per-fold statistics for each metric in that order.
candidate_matchers = [dt, svm, rf, lg, ln, nb]

# Columns of H that are identifiers or the target, not features.
non_feature_attrs = ['_id', 'ltable_id', 'rtable_id', 'label']

cv_results = {}
for cv_metric in ('precision', 'recall', 'f1'):
    cv_results[cv_metric] = em.select_matcher(
        candidate_matchers, table=H,
        # fresh list per call so no call can see another call's list object
        exclude_attrs=list(non_feature_attrs),
        k=5,
        target_attr='label', metric=cv_metric, random_state=0)
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print(cv_results[cv_metric]['cv_stats'])

# Keep the original per-metric names available for any later cells.
result_p = cv_results['precision']
result_r = cv_results['recall']
result_f1 = cv_results['f1']


           Name  \
0  DecisionTree   
1           SVM   
2            RF   
3        LogReg   
4        LinReg   
5    NaiveBayes   

                                                                            Matcher  \
0          <py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f09c6602e10>   
1        <py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f09c6602910>   
2          <py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f09c6602cd0>   
3  <py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f09c66028d0>   
4  <py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f09c6603b50>   
5          <py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x7f09c66035d0>   

   Num folds    Fold 1    Fold 2  Fold 3  Fold 4    Fold 5  Mean score  
0          5  1.000000  0.571429     1.0     1.0  0.909091    0.896104  
1          5  1.000000  0.833333     1.0     1.0  1.000000    0.966667  
2          5  1.000000  0.888889     1.0     1.0  0.909091    0.959596  
3          5  1.000000  0.888889     1.0     1.0  1.000000    0.977778  
4          5  1.000000  0.888889     1.0     1.0  1.000000    0.977778  
5          5  0.923077  0.888889     1.0     1.0  1.000000    0.962393  
           Name  \
0  DecisionTree   
1           SVM   
2            RF   
3        LogReg   
4        LinReg   
5    NaiveBayes   

                                                                            Matcher  \
0          <py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f09c6602e10>   
1        <py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f09c6602910>   
2          <py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f09c6602cd0>   
3  <py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f09c66028d0>   
4  <py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f09c6603b50>   
5          <py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x7f09c66035d0>   

   Num folds    Fold 1    Fold 2    Fold 3  Fold 4  Fold 5  Mean score  
0          5  1.000000  0.888889  0.785714     0.6     1.0    0.854921  
1          5  0.916667  0.555556  0.857143     0.8     0.9    0.805873  
2          5  1.000000  0.888889  0.857143     0.8     1.0    0.909206  
3          5  1.000000  0.888889  0.928571     0.8     1.0    0.923492  
4          5  1.000000  0.888889  0.928571     0.8     1.0    0.923492  
5          5  1.000000  0.888889  0.928571     0.8     1.0    0.923492  
           Name  \
0  DecisionTree   
1           SVM   
2            RF   
3        LogReg   
4        LinReg   
5    NaiveBayes   

                                                                            Matcher  \
0          <py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x7f09c6602e10>   
1        <py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x7f09c6602910>   
2          <py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x7f09c6602cd0>   
3  <py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x7f09c66028d0>   
4  <py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x7f09c6603b50>   
5          <py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x7f09c66035d0>   

   Num folds    Fold 1    Fold 2    Fold 3    Fold 4    Fold 5  Mean score  
0          5  1.000000  0.695652  0.880000  0.750000  0.952381    0.855607  
1          5  0.956522  0.666667  0.923077  0.888889  0.947368    0.876505  
2          5  1.000000  0.888889  0.923077  0.888889  0.952381    0.930647  
3          5  1.000000  0.888889  0.962963  0.888889  1.000000    0.948148  
4          5  1.000000  0.888889  0.962963  0.888889  1.000000    0.948148  
5          5  0.960000  0.888889  0.962963  0.888889  1.000000    0.940148  

In [41]:
# Feature vectors for the evaluation set, built from the same feature table
# (match_f) as the development-set vectors.
L = em.extract_feature_vecs(
    eval_set,
    feature_table=match_f,
    attrs_after='label',
    show_progress=False)

# Missing feature values become 0, in place.
L.fillna(value=0, inplace=True)

# Fit the random forest matcher on the development-set feature vectors.
# NOTE(review): the recorded cross-validation output in this notebook shows
# LogReg/LinReg with a higher mean F1 than RF (0.948 vs 0.931), so RF is not
# the top scorer there -- confirm that RF is the intended final matcher.
rf.fit(
    table=H,
    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
    target_attr='label')

In [42]:
# Predict on the evaluation-set feature vectors with the fitted RF matcher.
# append=True with inplace=False: the predictions are requested as a
# 'predicted' column, and `L` itself is asked not to be modified.
predictions = rf.predict(
    table=L,
    exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
    append=True,
    target_attr='predicted',
    inplace=False)

# Compare the 'predicted' column against the gold 'label' column and print
# the summary.
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)


Precision : 93.75% (15/16)
Recall : 100.0% (15/15)
F1 : 96.77%
False positives : 1 (out of 16 positive predictions)
False negatives : 0 (out of 104 negative predictions)