Introduction

This IPython notebook illustrates how to remove features from a feature table. First, we need to import the py_entitymatching package and other libraries as follows:



In [1]:

    
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

Then, read the (sample) input tables that will be used for blocking.



In [2]:

    
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join is used instead of manual os.sep concatenation so the separator
# is handled correctly even if the install path already ends with one.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Build the paths of the two sample input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')



In [3]:

    
# Read the two CSV files into tables A and B, setting 'ID' as the key attribute of each
A = em.read_csv_metadata(path_A, key='ID')
B = em.read_csv_metadata(path_B, key='ID')



In [4]:

    
# Get features (for blocking)
# NOTE(review): validate_inferred_attr_types=False presumably skips the interactive
# confirmation of the inferred attribute types -- confirm against the py_entitymatching docs
feature_table = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)
# Alternative: uncomment the line below to get features (for matching) instead
# feature_table = em.get_features_for_matching(A, B)

Removing Features from Feature Table



In [5]:

    
# Inspect the type of the feature table (a pandas DataFrame, per the output below)
type(feature_table)









    Out[5]:





pandas.core.frame.DataFrame



In [6]:

    
# Preview the first few features in the table
feature_table.head()









    Out[6]:






  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      0
      ID_ID_lev_dist
      ID
      ID
      None
      None
      lev_dist
      <function ID_ID_lev_dist at 0x10b5987b8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      1
      ID_ID_lev_sim
      ID
      ID
      None
      None
      lev_sim
      <function ID_ID_lev_sim at 0x10f9b0620>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      2
      ID_ID_jar
      ID
      ID
      None
      None
      jaro
      <function ID_ID_jar at 0x10f9b0950>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      3
      ID_ID_jwn
      ID
      ID
      None
      None
      jaro_winkler
      <function ID_ID_jwn at 0x10f9b09d8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      4
      ID_ID_exm
      ID
      ID
      None
      None
      exact_match
      <function ID_ID_exm at 0x10f9b08c8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True



In [7]:

    
# Drop the feature whose index label is 0 (the first row of the freshly generated table).
# errors='ignore' keeps this cell safe to re-run: once label 0 has been removed,
# a plain drop(0) would raise a KeyError on the second execution.
feature_table = feature_table.drop(0, errors='ignore')



In [8]:

    
# Preview the table again; the row with index label 0 is no longer present
feature_table.head()









    Out[8]:






  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      1
      ID_ID_lev_sim
      ID
      ID
      None
      None
      lev_sim
      <function ID_ID_lev_sim at 0x10f9b0620>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      2
      ID_ID_jar
      ID
      ID
      None
      None
      jaro
      <function ID_ID_jar at 0x10f9b0950>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      3
      ID_ID_jwn
      ID
      ID
      None
      None
      jaro_winkler
      <function ID_ID_jwn at 0x10f9b09d8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      4
      ID_ID_exm
      ID
      ID
      None
      None
      exact_match
      <function ID_ID_exm at 0x10f9b08c8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      5
      ID_ID_jac_qgm_3_qgm_3
      ID
      ID
      qgm_3
      qgm_3
      jaccard
      <function ID_ID_jac_qgm_3_qgm_3 at 0x10f9b0a60>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True



In [9]:

    
# Keep only the features whose left attribute is 'name'; all other features are removed
feature_table = feature_table.loc[feature_table['left_attribute'] == 'name']



In [10]:

    
# Display the features that remain after keeping only those on the 'name' attribute
feature_table









    Out[10]:






  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      6
      name_name_jac_qgm_3_qgm_3
      name
      name
      qgm_3
      qgm_3
      jaccard
      <function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      7
      name_name_cos_dlm_dc0_dlm_dc0
      name
      name
      dlm_dc0
      dlm_dc0
      cosine
      <function name_name_cos_dlm_dc0_dlm_dc0 at 0x10f9b0b70>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      8
      name_name_jac_dlm_dc0_dlm_dc0
      name
      name
      dlm_dc0
      dlm_dc0
      jaccard
      <function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      9
      name_name_mel
      name
      name
      None
      None
      monge_elkan
      <function name_name_mel at 0x10f9b0c80>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      10
      name_name_lev_dist
      name
      name
      None
      None
      lev_dist
      <function name_name_lev_dist at 0x10f9b0d08>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      11
      name_name_lev_sim
      name
      name
      None
      None
      lev_sim
      <function name_name_lev_sim at 0x10f9b0d90>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      12
      name_name_nmw
      name
      name
      None
      None
      needleman_wunsch
      <function name_name_nmw at 0x10f9b0e18>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      13
      name_name_sw
      name
      name
      None
      None
      smith_waterman
      <function name_name_sw at 0x10f9b0ea0>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True



In [11]:

    
# Keep only the features whose similarity function is 'jaccard'; all other features are removed
feature_table = feature_table.loc[feature_table['simfunction'] == 'jaccard']



In [12]:

    
# Display the features that remain after keeping only the jaccard-based ones
feature_table









    Out[12]:






  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      6
      name_name_jac_qgm_3_qgm_3
      name
      name
      qgm_3
      qgm_3
      jaccard
      <function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      8
      name_name_jac_dlm_dc0_dlm_dc0
      name
      name
      dlm_dc0
      dlm_dc0
      jaccard
      <function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True

	feature_name	left_attribute	right_attribute	left_attr_tokenizer	right_attr_tokenizer	simfunction	function	function_source	is_auto_generated
0	ID_ID_lev_dist	ID	ID	None	None	lev_dist	<function ID_ID_lev_dist at 0x10b5987b8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
1	ID_ID_lev_sim	ID	ID	None	None	lev_sim	<function ID_ID_lev_sim at 0x10f9b0620>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
2	ID_ID_jar	ID	ID	None	None	jaro	<function ID_ID_jar at 0x10f9b0950>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
3	ID_ID_jwn	ID	ID	None	None	jaro_winkler	<function ID_ID_jwn at 0x10f9b09d8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
4	ID_ID_exm	ID	ID	None	None	exact_match	<function ID_ID_exm at 0x10f9b08c8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True

	feature_name	left_attribute	right_attribute	left_attr_tokenizer	right_attr_tokenizer	simfunction	function	function_source	is_auto_generated
6	name_name_jac_qgm_3_qgm_3	name	name	qgm_3	qgm_3	jaccard	<function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
7	name_name_cos_dlm_dc0_dlm_dc0	name	name	dlm_dc0	dlm_dc0	cosine	<function name_name_cos_dlm_dc0_dlm_dc0 at 0x10f9b0b70>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
8	name_name_jac_dlm_dc0_dlm_dc0	name	name	dlm_dc0	dlm_dc0	jaccard	<function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
9	name_name_mel	name	name	None	None	monge_elkan	<function name_name_mel at 0x10f9b0c80>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
10	name_name_lev_dist	name	name	None	None	lev_dist	<function name_name_lev_dist at 0x10f9b0d08>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
11	name_name_lev_sim	name	name	None	None	lev_sim	<function name_name_lev_sim at 0x10f9b0d90>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
12	name_name_nmw	name	name	None	None	needleman_wunsch	<function name_name_nmw at 0x10f9b0e18>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
13	name_name_sw	name	name	None	None	smith_waterman	<function name_name_sw at 0x10f9b0ea0>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True