This notebook accompanies an article on Practical Business Python (pbpython.com) about fuzzy matching of data sets.
It relies on two third-party libraries: fuzzymatcher and the Python Record Linkage Toolkit (recordlinkage).
In [1]:
import pandas as pd
from pathlib import Path
import fuzzymatcher
import recordlinkage
In [2]:
# Load the two data sets to be matched, straight from the pbpython GitHub repo:
# hospital account info (left side) and Medicare reimbursement records (right side).
hospital_accounts = pd.read_csv(
'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv'
)
hospital_reimbursement = pd.read_csv(
'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv'
)
In [3]:
# Quick look at the account data (Facility Name, Address, City, State, ...)
hospital_accounts.head()
Out[3]:
In [4]:
# Quick look at the reimbursement data (Provider Name, Provider Street Address, ...)
hospital_reimbursement.head()
Out[4]:
In [5]:
# Match keys drawn from the account table (left side of the join)
left_on = [
    "Facility Name",
    "Address",
    "City",
    "State",
]

# Corresponding match keys from the reimbursement table (right side)
right_on = [
    "Provider Name",
    "Provider Street Address",
    "Provider City",
    "Provider State",
]
In [6]:
# Perform the fuzzy left join: keep every account row and pair it with the
# best-scoring reimbursement row, adding a best_match_score column.
# NOTE: this step can take several minutes on this data set.
matched_results = fuzzymatcher.fuzzy_left_join(hospital_accounts,
hospital_reimbursement,
left_on,
right_on,
left_id_col='Account_Num',
right_id_col='Provider_Num')
In [7]:
# Preview the joined results, including the best_match_score column
matched_results.head()
Out[7]:
In [8]:
# Reorder the columns so each account field sits next to its provider
# counterpart — much easier to eyeball the match quality this way.
cols = [
    "best_match_score",
    "Facility Name",
    "Provider Name",
    "Address",
    "Provider Street Address",
    "Provider City",
    "City",
    "Provider State",
    "State",
]
In [9]:
# Inspect the five strongest matches (highest best_match_score first)
(matched_results[cols]
 .sort_values('best_match_score', ascending=False)
 .head(5))
Out[9]:
In [10]:
# Now the five weakest matches — candidates for manual review
(matched_results[cols]
 .sort_values('best_match_score', ascending=True)
 .head(5))
Out[10]:
In [11]:
# Look at matches scoring at or below 1 — the grey zone between clear hits
# and clear misses
(matched_results.loc[matched_results['best_match_score'] <= 1, cols]
 .sort_values('best_match_score', ascending=False)
 .head(10))
Out[11]:
In [12]:
# Matches scoring at or below .80 — very unlikely to be real matches
(matched_results.loc[matched_results['best_match_score'] <= .80, cols]
 .sort_values('best_match_score', ascending=False)
 .head(5))
Out[12]:
In [13]:
# Re-read the data, this time indexing each frame by its id column so that
# recordlinkage reports candidate pairs in terms of Account_Num/Provider_Num.
hospital_accounts = pd.read_csv(
'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv',
index_col='Account_Num'
)
hospital_reimbursement = pd.read_csv(
'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv',
index_col='Provider_Num'
)
In [14]:
# Confirm the Account_Num index took effect
hospital_accounts.head()
Out[14]:
In [15]:
# Confirm the Provider_Num index took effect
hospital_reimbursement.head()
Out[15]:
In [16]:
# Build the indexer that generates candidate record pairs to compare
indexer = recordlinkage.Index()
# Can use full or block
#indexer.full()
#indexer.block(left_on='State', right_on='Provider State')
# Use sortedneighbor as a good option if data is not clean
# (more forgiving of typos in the key column than an exact block)
indexer.sortedneighbourhood(left_on='State', right_on='Provider State')
Out[16]:
In [17]:
# Generate the candidate record pairs to score
candidates = indexer.index(hospital_accounts, hospital_reimbursement)
In [18]:
# How many candidate pairs will need to be scored?
print(len(candidates))
In [19]:
# Define the comparison rules, then score every candidate pair.
# Rough timings for the compute step, depending on the indexer chosen above:
# Takes 3 minutes using the full index.
# 14s using sorted neighbor
# 7s using blocking
compare = recordlinkage.Compare()
# City must match exactly; the two name/address comparisons are fuzzy string
# matches that count as a hit when the similarity clears the 0.85 threshold.
compare.exact('City', 'Provider City', label='City')
compare.string('Facility Name',
'Provider Name',
threshold=0.85,
label='Hosp_Name')
compare.string('Address',
'Provider Street Address',
method='jarowinkler',
threshold=0.85,
label='Hosp_Address')
features = compare.compute(candidates, hospital_accounts,
hospital_reimbursement)
In [20]:
# One row per candidate pair, one column per comparison defined above
features
Out[20]:
In [21]:
# Distribution of combined scores: how many candidate pairs hit each total
features.sum(axis='columns').value_counts().sort_index(ascending=False)
Out[21]:
In [22]:
# Keep only candidate pairs whose combined score exceeds 1,
# turning the Account_Num/Provider_Num index back into columns
potential_matches = features[features.sum(axis='columns') > 1].reset_index()
In [23]:
# Total the individual comparison scores into a single Score column
# (the same columns the original 'City':'Hosp_Address' slice selected,
# named explicitly here)
potential_matches['Score'] = potential_matches[
    ['City', 'Hosp_Name', 'Hosp_Address']].sum(axis=1)
potential_matches.head()
Out[23]:
In [24]:
# Spot-check one account row from a candidate pair
hospital_accounts.loc[51216,:]
Out[24]:
In [25]:
# Spot-check the reimbursement row that was paired with account 51216
hospital_reimbursement.loc[268781,:]
Out[25]:
In [26]:
# Convenience column: collapse the four account match fields into one
# underscore-joined string for easy visual comparison
hospital_accounts['Acct_Name_Lookup'] = hospital_accounts[
    ['Facility Name', 'Address', 'City', 'State']
].apply('_'.join, axis=1)
In [27]:
# Same convenience column for the reimbursement side
hospital_reimbursement['Reimbursement_Name_Lookup'] = hospital_reimbursement[
    ['Provider Name', 'Provider Street Address', 'Provider City',
     'Provider State']
].apply('_'.join, axis=1)
In [28]:
# Narrow each frame to its id plus the lookup string; reset_index turns the
# id index back into a column so it can be merged on below.
reimbursement_lookup = hospital_reimbursement[['Reimbursement_Name_Lookup']].reset_index()
account_lookup = hospital_accounts[['Acct_Name_Lookup']].reset_index()
In [29]:
# Each row: an Account_Num and its combined lookup string
account_lookup.head()
Out[29]:
In [30]:
# Each row: a Provider_Num and its combined lookup string
reimbursement_lookup.head()
Out[30]:
In [31]:
account_merge = potential_matches.merge(account_lookup, how='left')
In [32]:
# potential_matches now carries the account-side lookup string
account_merge.head()
Out[32]:
In [33]:
# Re-display the reimbursement lookup before the second merge
reimbursement_lookup.head()
Out[33]:
In [34]:
# Build the final comparison frame by attaching the reimbursement-side lookup
# string. The key is made explicit (on='Provider_Num', the only shared column)
# rather than relying on merge's implicit "all common columns" behavior.
final_merge = account_merge.merge(reimbursement_lookup, how='left',
                                  on='Provider_Num')
In [35]:
# Columns for the side-by-side review of each proposed match
cols = [
    'Account_Num',
    'Provider_Num',
    'Score',
    'Acct_Name_Lookup',
    'Reimbursement_Name_Lookup',
]
In [36]:
# Review all candidate pairings, sorted by account and score (both descending)
final_merge[cols].sort_values(by=['Account_Num', 'Score'], ascending=False)
Out[36]:
In [37]:
# If you need to save it to Excel -
#final_merge.sort_values(by=['Account_Num', 'Score'],
# ascending=False).to_excel('merge_list.xlsx',
# index=False)
In [38]:
# All candidate matches proposed for account 11035
final_merge.loc[final_merge['Account_Num'] == 11035, cols]
Out[38]:
In [39]:
# All candidate matches proposed for account 56375
final_merge.loc[final_merge['Account_Num'] == 56375, cols]
Out[39]:
In [40]:
# Load a variant of the account data containing duplicate records,
# used below to demonstrate deduplication with recordlinkage
hospital_dupes = pd.read_csv(
'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_dupes.csv',
index_col='Account_Num')
In [41]:
# Preview the duplicate-laden data set
hospital_dupes.head()
Out[41]:
In [42]:
# Deduping follows the same process; passing a single dataframe makes
# recordlinkage generate candidate pairs within that one frame.
dupe_indexer = recordlinkage.Index()
dupe_indexer.sortedneighbourhood(left_on='State')
dupe_candidate_links = dupe_indexer.index(hospital_dupes)
In [43]:
# Comparison step: four fuzzy string comparisons (note City is fuzzy here,
# unlike the exact City match used in the two-file linkage above)
compare_dupes = recordlinkage.Compare()
compare_dupes.string('City', 'City', threshold=0.85, label='City')
compare_dupes.string('Phone Number',
'Phone Number',
threshold=0.85,
label='Phone_Num')
# Slightly looser threshold for the facility name
compare_dupes.string('Facility Name',
'Facility Name',
threshold=0.80,
label='Hosp_Name')
compare_dupes.string('Address',
'Address',
threshold=0.85,
label='Hosp_Address')
dupe_features = compare_dupes.compute(dupe_candidate_links, hospital_dupes)
In [44]:
# One row per candidate pair with the four comparison scores
dupe_features
Out[44]:
In [45]:
# Distribution of combined scores across the four dedupe comparisons
dupe_features.sum(axis='columns').value_counts().sort_index(ascending=False)
Out[45]:
In [46]:
potential_dupes = dupe_features[dupe_features.sum(axis=1) > 2].reset_index()
potential_dupes['Score'] = potential_dupes.loc[:, 'City':'Hosp_Address'].sum(axis=1)
In [47]:
potential_dupes.sort_values(by=['Score'], ascending=True)
Out[47]:
In [48]:
# Side-by-side look at one suspected duplicate pair
hospital_dupes.loc[hospital_dupes.index.isin([51567, 41166])]
Out[48]: