Python Tools for Record Linking and Fuzzy Matching

This notebook accompanies the article on Practical Business Python

This notebook relies on fuzzymatcher and the Python Record Linkage Toolkit


In [1]:
import pandas as pd
from pathlib import Path
import fuzzymatcher
import recordlinkage

Example using fuzzymatcher


In [2]:
hospital_accounts = pd.read_csv(
    'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv'
)
hospital_reimbursement = pd.read_csv(
    'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv'
)

In [3]:
hospital_accounts.head()


Out[3]:
Account_Num Facility Name Address City State ZIP Code County Name Phone Number Hospital Type Hospital Ownership
0 10605 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 GANADO AZ 86505 APACHE (928) 755-4541 Critical Access Hospitals Voluntary non-profit - Private
1 24250 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET WEST MEMPHIS AR 72301 CRITTENDEN (870) 394-4113 Psychiatric Proprietary
2 10341 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE MIAMI FL 33137 MIAMI-DADE (305) 751-8626 Acute Care Hospitals Voluntary non-profit - Private
3 81101 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W BRADENTON FL 34210 MANATEE (941) 792-2222 Psychiatric Proprietary
4 39835 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET BOISE ID 83704 ADA (208) 373-5000 Acute Care Hospitals Proprietary

In [4]:
hospital_reimbursement.head()


Out[4]:
Provider_Num Provider Name Provider Street Address Provider City Provider State Provider Zip Code Total Discharges Average Covered Charges Average Total Payments Average Medicare Payments
0 839987 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE DOTHAN AL 36301 118 20855.61 5026.19 4115.52
1 519118 MARSHALL MEDICAL CENTER SOUTH 2505 U S HIGHWAY 431 NORTH BOAZ AL 35957 43 13289.09 5413.63 4490.93
2 733073 ELIZA COFFEE MEMORIAL HOSPITAL 205 MARENGO STREET FLORENCE AL 35631 73 22261.60 4922.18 4021.79
3 201752 MIZELL MEMORIAL HOSPITAL 702 N MAIN ST OPP AL 36467 12 10901.33 5343.50 4284.17
4 678488 ST VINCENT'S EAST 50 MEDICAL PARK EAST DRIVE BIRMINGHAM AL 35235 74 28117.95 5947.12 4819.53

In [5]:
# Columns to match on from df_left
left_on = ["Facility Name", "Address", "City", "State"]

# Columns to match on from df_right
right_on = [
    "Provider Name", "Provider Street Address", "Provider City",
    "Provider State"
]

In [6]:
# Now perform the match
# It will take several minutes to run on this data set
matched_results = fuzzymatcher.fuzzy_left_join(hospital_accounts,
                                               hospital_reimbursement,
                                               left_on,
                                               right_on,
                                               left_id_col='Account_Num',
                                               right_id_col='Provider_Num')

In [7]:
matched_results.head()


Out[7]:
best_match_score __id_left __id_right Account_Num Facility Name Address City State ZIP Code County Name ... Provider_Num Provider Name Provider Street Address Provider City Provider State Provider Zip Code Total Discharges Average Covered Charges Average Total Payments Average Medicare Payments
0 -0.746613 10605 643595 10605 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 GANADO AZ 86505 APACHE ... 643595 TYLER MEMORIAL HOSPITAL 5950 STATE ROUTE 6 WEST TUNKHANNOCK PA 18657 18 20482.94 5783.22 4929.22
234 -0.609873 24250 426767 24250 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET WEST MEMPHIS AR 72301 CRITTENDEN ... 426767 CRISP REGIONAL HOSPITAL 902 7TH STREET NORTH CORDELE GA 31015 18 14655.94 5680.28 4899.39
446 -0.404473 10341 730311 10341 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE MIAMI FL 33137 MIAMI-DADE ... 730311 JACKSON MEMORIAL HOSPITAL 1611 NW 12TH AVE MIAMI FL 33136 113 30021.91 14448.75 11043.58
675 -0.333020 81101 302173 81101 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W BRADENTON FL 34210 MANATEE ... 302173 BLAKE MEDICAL CENTER 2020 59TH ST W BRADENTON FL 34209 135 46792.59 5651.22 4534.90
893 -0.629261 39835 416060 39835 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET BOISE ID 83704 ADA ... 416060 ST LUKE'S REGIONAL MEDICAL CENTER 190 EAST BANNOCK STREET BOISE ID 83712 68 26564.97 6514.51 5631.87

5 rows × 23 columns


In [8]:
# Reorder the columns to make viewing easier
cols = [
    "best_match_score", "Facility Name", "Provider Name", "Address", "Provider Street Address",
    "Provider City", "City", "Provider State", "State"
]

In [9]:
# Let's see the best matches
matched_results[cols].sort_values(by=['best_match_score'], ascending=False).head(5)


Out[9]:
best_match_score Facility Name Provider Name Address Provider Street Address Provider City City Provider State State
77846 3.090931 RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION 530 NEW BRUNSWICK AVE 530 NEW BRUNSWICK AVE PERTH AMBOY PERTH AMBOY NJ NJ
532466 2.799072 ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL ONE ROBERT WOOD JOHNSON PLACE ONE ROBERT WOOD JOHNSON PLACE NEW BRUNSWICK NEW BRUNSWICK NJ NJ
78435 2.785132 AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE... AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE... 1325 S CLIFF AVE POST OFFICE BOX 5045 1325 S CLIFF AVE POST OFFICE BOX 5045 SIOUX FALLS SIOUX FALLS SD SD
242697 2.778860 JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE... JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE... 75 NORTH COUNTRY ROAD 75 NORTH COUNTRY ROAD PORT JEFFERSON PORT JEFFERSON NY NY
447347 2.721425 MAYO CLINIC HEALTH SYSTEM - RED WING MAYO CLINIC HEALTH SYSTEM IN RED WING 701 HEWITT BOULEVARD, PO BOX 95 701 HEWITT BOULEVARD, PO BOX 95 RED WING RED WING MN MN

In [10]:
# Now the worst matches
matched_results[cols].sort_values(by=['best_match_score'],
                                  ascending=True).head(5)


Out[10]:
best_match_score Facility Name Provider Name Address Provider Street Address Provider City City Provider State State
426256 -2.268231 CENTRO MEDICO WILMA N VAZQUEZ BAPTIST MEDICAL CENTER EAST CARR. 2 KM 39.5 ROAD NUMBER 2 BO ALGARROBO 400 TAYLOR ROAD MONTGOMERY VEGA BAJA AL PR
83051 -2.124071 DOCTOR CENTER HOSPITAL SAN FERNANDO DE LA CARO... OVERLAKE HOSPITAL MEDICAL CENTER EDIF JESUS T PINEIRO AVE FERNANDEZ JUNCOS BO P... 1035-116TH AVE NE BELLEVUE CAROLINA WA PR
42613 -2.106746 HOSPITAL ONCOLOGICO DR ISAAC GONZALEZ MARTINEZ SCRIPPS MERCY HOSPITAL BO. MONACILLOS CARR 22 CENTRO MEDICO DE PUERTO... 4077 5TH AVE SAN DIEGO SAN JUAN CA PR
450232 -2.050888 CENTRO DE SALUD CONDUCTUAL MENONITA-CIMA MILFORD REGIONAL MEDICAL CENTER CARR ESTATAL 14 INTERIOR SARGENTO GERARDO SANT... 14 PROSPECT STREET MILFORD AIBONITO MA PR
476086 -1.996508 ADMIN DE SERVICIOS MEDICOS PUERTO RIC MAINE MEDICAL CENTER BO MONACILLO CARR NUM 22 22 BRAMHALL ST PORTLAND SAN JUAN ME PR

In [11]:
# Look at the matches around 1
matched_results[cols].query("best_match_score <= 1").sort_values(
    by=['best_match_score'], ascending=False).head(10)


Out[11]:
best_match_score Facility Name Provider Name Address Provider Street Address Provider City City Provider State State
413779 0.999843 PRISMA HEALTH BAPTIST PALMETTO HEALTH BAPTIST 1330 TAYLOR AT MARION ST TAYLOR AT MARION ST COLUMBIA COLUMBIA SC SC
148813 0.999724 CHRIST HOSPITAL CHRIST HOSPITAL 2139 AUBURN AVENUE 2139 AUBURN AVENUE CINCINNATI CINCINNATI OH OH
387796 0.997759 RUSSELL COUNTY HOSPITAL RUSSELL COUNTY MEDICAL CENTER 58 CARROLL STREET 58 CARROLL STREET LEBANON LEBANON VA VA
377611 0.993306 SKY RIDGE MEDICAL CENTER SKY RIDGE MEDICAL CENTER 10101 RIDGEGATE PARKWAY 10101 RIDGE GATE PARKWAY LONE TREE LONE TREE CO CO
303356 0.991168 HIGHLAND HOSPITAL HIGHLAND HOSPITAL 1000 SOUTH AVENUE 1000 SOUTH AVENUE ROCHESTER ROCHESTER NY NY
538105 0.990481 PAOLI HOSPITAL MAIN LINE HOSPITAL PAOLI 255 WEST LANCASTER AVENUE 255 WEST LANCASTER AVENUE PAOLI PAOLI PA PA
251502 0.986695 NYU WINTHROP HOSPITAL WINTHROP-UNIVERSITY HOSPITAL 259 FIRST STREET 259 FIRST STREET MINEOLA MINEOLA NY NY
310674 0.986440 ADVENTHEALTH GORDON GORDON HOSPITAL 1035 RED BUD ROAD 1035 RED BUD ROAD CALHOUN CALHOUN GA GA
203931 0.985763 MEMORIAL MEDICAL CENTER MEMORIAL MEDICAL CENTER 701 N FIRST ST 701 N FIRST ST SPRINGFIELD SPRINGFIELD IL IL
102187 0.984978 HENDRICK MEDICAL CENTER HENDRICK MEDICAL CENTER 1900 PINE 1900 PINE ABILENE ABILENE TX TX

In [12]:
matched_results[cols].query("best_match_score <= .80").sort_values(
    by=['best_match_score'], ascending=False).head(5)


Out[12]:
best_match_score Facility Name Provider Name Address Provider Street Address Provider City City Provider State State
518066 0.792471 METHODIST HOSPITAL SOUTH SOUTH TEXAS REGIONAL MEDICAL CENTER 1905 HWY 97 EAST 1905 HWY 97 EAST JOURDANTON JOURDANTON TX TX
416905 0.791668 ADVENTIST HEALTH UKIAH VALLEY UKIAH VALLEY MEDICAL CENTER 275 HOSPITAL DRIVE 275 HOSPITAL DRIVE UKIAH UKIAH CA CA
303110 0.787163 MADISON HEALTH MADISON COUNTY HOSPITAL INC 210 NORTH MAIN STREET 210 NORTH MAIN STREET LONDON LONDON OH OH
388140 0.776632 PENN HIGHLANDS CLEARFIELD CLEARFIELD HOSPITAL 809 TURNPIKE AVE 809 TURNPIKE AVE CLEARFIELD CLEARFIELD PA PA
492811 0.775573 MEMORIAL HOSPITAL AT GULFPORT MEMORIAL HOSPITAL AT GULFPORT 4500 13TH STREET 4500 13TH ST-P O BOX 1810 GULFPORT GULFPORT MS MS

Example using Python Record Linkage Toolkit


In [13]:
# Re-read in the data using the index_col
hospital_accounts = pd.read_csv(
    'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv',
    index_col='Account_Num'
)
hospital_reimbursement = pd.read_csv(
    'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv',
    index_col='Provider_Num'
)

In [14]:
hospital_accounts.head()


Out[14]:
Facility Name Address City State ZIP Code County Name Phone Number Hospital Type Hospital Ownership
Account_Num
10605 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 GANADO AZ 86505 APACHE (928) 755-4541 Critical Access Hospitals Voluntary non-profit - Private
24250 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET WEST MEMPHIS AR 72301 CRITTENDEN (870) 394-4113 Psychiatric Proprietary
10341 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE MIAMI FL 33137 MIAMI-DADE (305) 751-8626 Acute Care Hospitals Voluntary non-profit - Private
81101 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W BRADENTON FL 34210 MANATEE (941) 792-2222 Psychiatric Proprietary
39835 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET BOISE ID 83704 ADA (208) 373-5000 Acute Care Hospitals Proprietary

In [15]:
hospital_reimbursement.head()


Out[15]:
Provider Name Provider Street Address Provider City Provider State Provider Zip Code Total Discharges Average Covered Charges Average Total Payments Average Medicare Payments
Provider_Num
839987 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE DOTHAN AL 36301 118 20855.61 5026.19 4115.52
519118 MARSHALL MEDICAL CENTER SOUTH 2505 U S HIGHWAY 431 NORTH BOAZ AL 35957 43 13289.09 5413.63 4490.93
733073 ELIZA COFFEE MEMORIAL HOSPITAL 205 MARENGO STREET FLORENCE AL 35631 73 22261.60 4922.18 4021.79
201752 MIZELL MEMORIAL HOSPITAL 702 N MAIN ST OPP AL 36467 12 10901.33 5343.50 4284.17
678488 ST VINCENT'S EAST 50 MEDICAL PARK EAST DRIVE BIRMINGHAM AL 35235 74 28117.95 5947.12 4819.53

In [16]:
# Build the indexer
indexer = recordlinkage.Index()
# Can use full or block
#indexer.full()
#indexer.block(left_on='State', right_on='Provider State')

# Use sortedneighbor as a good option if data is not clean
indexer.sortedneighbourhood(left_on='State', right_on='Provider State')


Out[16]:
<Index>

In [17]:
candidates = indexer.index(hospital_accounts, hospital_reimbursement)

In [18]:
# Let's see how many matches we want to do
print(len(candidates))


998860

In [19]:
# Takes 3 minutes using the full index.
# 14s using sorted neighbor
# 7s using blocking
compare = recordlinkage.Compare()
compare.exact('City', 'Provider City', label='City')
compare.string('Facility Name',
               'Provider Name',
               threshold=0.85,
               label='Hosp_Name')
compare.string('Address',
               'Provider Street Address',
               method='jarowinkler',
               threshold=0.85,
               label='Hosp_Address')
features = compare.compute(candidates, hospital_accounts,
                           hospital_reimbursement)

In [20]:
features


Out[20]:
City Hosp_Name Hosp_Address
Account_Num Provider_Num
10605 537184 0 0.0 0.0
803181 0 0.0 0.0
450616 0 0.0 0.0
854377 0 0.0 0.0
560361 0 0.0 0.0
... ... ... ... ...
70226 815904 0 0.0 0.0
746090 0 0.0 0.0
193062 0 0.0 0.0
834984 0 0.0 0.0
365095 0 0.0 0.0

998860 rows × 3 columns


In [21]:
# What are the score totals?
features.sum(axis=1).value_counts().sort_index(ascending=False)


Out[21]:
3.0      2285
2.0       451
1.0      7937
0.0    988187
dtype: int64

In [22]:
# Get the potential matches
potential_matches = features[features.sum(axis=1) > 1].reset_index()

In [23]:
potential_matches['Score'] = potential_matches.loc[:, 'City':'Hosp_Address'].sum(axis=1)
potential_matches.head()


Out[23]:
Account_Num Provider_Num City Hosp_Name Hosp_Address Score
0 51216 268781 0 1.0 1.0 2.0
1 55272 556917 1 1.0 1.0 3.0
2 87807 854637 1 1.0 1.0 3.0
3 51151 783146 1 0.0 1.0 2.0
4 11740 260374 1 1.0 1.0 3.0

In [24]:
hospital_accounts.loc[51216,:]


Out[24]:
Facility Name             ST FRANCIS MEDICAL CENTER
Address                       2400 ST FRANCIS DRIVE
City                                   BRECKENRIDGE
State                                            MN
ZIP Code                                      56520
County Name                                  WILKIN
Phone Number                         (218) 643-3000
Hospital Type             Critical Access Hospitals
Hospital Ownership    Voluntary non-profit - Church
Name: 51216, dtype: object

In [25]:
hospital_reimbursement.loc[268781,:]


Out[25]:
Provider Name                SAINT FRANCIS MEDICAL CENTER
Provider Street Address                 211 ST FRANCIS DR
Provider City                              CAPE GIRARDEAU
Provider State                                         MO
Provider Zip Code                                   63703
Total Discharges                                      141
Average Covered Charges                           42515.1
Average Total Payments                            5902.24
Average Medicare Payments                         4993.43
Name: 268781, dtype: object

In [26]:
# Add some convenience columns for comparing data
hospital_accounts['Acct_Name_Lookup'] = hospital_accounts[[
    'Facility Name', 'Address', 'City', 'State'
]].apply(lambda x: '_'.join(x), axis=1)

In [27]:
hospital_reimbursement['Reimbursement_Name_Lookup'] = hospital_reimbursement[[
    'Provider Name', 'Provider Street Address', 'Provider City',
    'Provider State'
]].apply(lambda x: '_'.join(x), axis=1)

In [28]:
reimbursement_lookup = hospital_reimbursement[['Reimbursement_Name_Lookup']].reset_index()
account_lookup = hospital_accounts[['Acct_Name_Lookup']].reset_index()

In [29]:
account_lookup.head()


Out[29]:
Account_Num Acct_Name_Lookup
0 10605 SAGE MEMORIAL HOSPITAL_STATE ROUTE 264 SOUTH 1...
1 24250 WOODRIDGE BEHAVIORAL CENTER_600 NORTH 7TH STRE...
2 10341 DOUGLAS GARDENS HOSPITAL_5200 NE 2ND AVE_MIAMI_FL
3 81101 SUNCOAST BEHAVIORAL HEALTH CENTER_4480 51ST ST...
4 39835 TREASURE VALLEY HOSPITAL_8800 WEST EMERALD STR...

In [30]:
reimbursement_lookup.head()


Out[30]:
Provider_Num Reimbursement_Name_Lookup
0 839987 SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...
1 519118 MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...
2 733073 ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...
3 201752 MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL
4 678488 ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B...

In [31]:
account_merge = potential_matches.merge(account_lookup, how='left')

In [32]:
account_merge.head()


Out[32]:
Account_Num Provider_Num City Hosp_Name Hosp_Address Score Acct_Name_Lookup
0 51216 268781 0 1.0 1.0 2.0 ST FRANCIS MEDICAL CENTER_2400 ST FRANCIS DRIV...
1 55272 556917 1 1.0 1.0 3.0 SCOTTSDALE OSBORN MEDICAL CENTER_7400 EAST OSB...
2 87807 854637 1 1.0 1.0 3.0 ORO VALLEY HOSPITAL_1551 EAST TANGERINE ROAD_O...
3 51151 783146 1 0.0 1.0 2.0 ST. LUKE'S BEHAVIORAL HOSPITAL, LP_1800 EAST V...
4 11740 260374 1 1.0 1.0 3.0 SUMMIT HEALTHCARE REGIONAL MEDICAL CENTER_2200...

In [33]:
reimbursement_lookup.head()


Out[33]:
Provider_Num Reimbursement_Name_Lookup
0 839987 SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...
1 519118 MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...
2 733073 ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...
3 201752 MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL
4 678488 ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B...

In [34]:
# Let's build a dataframe to  compare
final_merge = account_merge.merge(reimbursement_lookup, how='left')

In [35]:
cols = [
    'Account_Num', 'Provider_Num', 'Score', 'Acct_Name_Lookup',
    'Reimbursement_Name_Lookup'
]

In [36]:
final_merge[cols].sort_values(by=['Account_Num', 'Score'], ascending=False)


Out[36]:
Account_Num Provider_Num Score Acct_Name_Lookup Reimbursement_Name_Lookup
2660 94995 825914 3.0 CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI... CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI...
1975 94953 819181 3.0 LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B... LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B...
1042 94943 680596 3.0 VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_... VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_...
2305 94923 403151 3.0 UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR... UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR...
2512 94887 752284 2.0 NEW YORK-PRESBYTERIAN BROOKLYN METHODIST HOSPI... NEW YORK METHODIST HOSPITAL_506 SIXTH STREET_B...
... ... ... ... ... ...
2080 10165 188247 3.0 UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT
1825 10090 212069 3.0 CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV... CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV...
2424 10043 140535 3.0 BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ... BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ...
1959 10020 210657 3.0 ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...
1958 10020 121670 2.0 ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... UNIVERSITY HEALTH CONWAY_4864 JACKSON STREET_M...

2736 rows × 5 columns


In [37]:
# If you need to save it to Excel -
#final_merge.sort_values(by=['Account_Num', 'Score'],
#                        ascending=False).to_excel('merge_list.xlsx',
#                                                  index=False)

In [38]:
final_merge[final_merge['Account_Num']==11035][cols]


Out[38]:
Account_Num Provider_Num Score Acct_Name_Lookup Reimbursement_Name_Lookup
2302 11035 550921 2.0 SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO PRESBYTERIAN ST LUKES MEDICAL CENTER_1719 E 19...
2303 11035 706807 3.0 SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO SAINT JOSEPH HOSPITAL_1375 EAST 19TH AVE_DENVE...

In [39]:
final_merge[final_merge['Account_Num']==56375][cols]


Out[39]:
Account_Num Provider_Num Score Acct_Name_Lookup Reimbursement_Name_Lookup
155 56375 390402 2.0 BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... HOLY CROSS HOSPITAL_4725 N FEDERAL HWY_FORT LA...
156 56375 451229 3.0 BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...

Dedupe the data


In [40]:
hospital_dupes = pd.read_csv(
    'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_dupes.csv',
    index_col='Account_Num')

In [41]:
hospital_dupes.head()


Out[41]:
Facility Name Address City State ZIP Code County Name Phone Number Hospital Type Hospital Ownership
Account_Num
71730 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 GANADO AZ 86505 APACHE (928) 755-4541 Critical Access Hospitals Voluntary non-profit - Private
70116 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET WEST MEMPHIS AR 72301 CRITTENDEN (870) 394-4113 Psychiatric Proprietary
87991 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE MIAMI FL 33137 MIAMI-DADE (305) 751-8626 Acute Care Hospitals Voluntary non-profit - Private
22662 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W BRADENTON FL 34210 MANATEE (941) 792-2222 Psychiatric Proprietary
63165 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET BOISE ID 83704 ADA (208) 373-5000 Acute Care Hospitals Proprietary

In [42]:
# Deduping follows the same process, you just use 1 single dataframe
dupe_indexer = recordlinkage.Index()
dupe_indexer.sortedneighbourhood(left_on='State')
dupe_candidate_links = dupe_indexer.index(hospital_dupes)

In [43]:
# Comparison step
compare_dupes = recordlinkage.Compare()
compare_dupes.string('City', 'City', threshold=0.85, label='City')
compare_dupes.string('Phone Number',
                     'Phone Number',
                     threshold=0.85,
                     label='Phone_Num')
compare_dupes.string('Facility Name',
                     'Facility Name',
                     threshold=0.80,
                     label='Hosp_Name')
compare_dupes.string('Address',
                     'Address',
                     threshold=0.85,
                     label='Hosp_Address')
dupe_features = compare_dupes.compute(dupe_candidate_links, hospital_dupes)

In [44]:
dupe_features


Out[44]:
City Phone_Num Hosp_Name Hosp_Address
Account_Num_1 Account_Num_2
26270 28485 0.0 0.0 0.0 0.0
30430 0.0 0.0 0.0 0.0
43602 0.0 0.0 0.0 0.0
59585 28485 0.0 0.0 0.0 0.0
30430 0.0 0.0 0.0 0.0
... ... ... ... ... ...
64029 38600 0.0 0.0 0.0 0.0
35413 0.0 0.0 0.0 0.0
81525 0.0 0.0 0.0 0.0
82916 0.0 0.0 0.0 0.0
18907 0.0 0.0 0.0 0.0

981277 rows × 4 columns


In [45]:
dupe_features.sum(axis=1).value_counts().sort_index(ascending=False)


Out[45]:
3.0         7
2.0       206
1.0      7859
0.0    973205
dtype: int64

In [46]:
potential_dupes = dupe_features[dupe_features.sum(axis=1) > 2].reset_index()
potential_dupes['Score'] = potential_dupes.loc[:, 'City':'Hosp_Address'].sum(axis=1)

In [47]:
potential_dupes.sort_values(by=['Score'], ascending=True)


Out[47]:
Account_Num_1 Account_Num_2 City Phone_Num Hosp_Name Hosp_Address Score
0 28494 37949 1.0 1.0 0.0 1.0 3.0
1 74835 77000 1.0 1.0 0.0 1.0 3.0
2 24549 28485 1.0 1.0 0.0 1.0 3.0
3 70366 52654 1.0 1.0 0.0 1.0 3.0
4 61685 24849 1.0 1.0 0.0 1.0 3.0
5 51567 41166 1.0 1.0 1.0 0.0 3.0
6 26495 41079 1.0 1.0 0.0 1.0 3.0

In [48]:
# Take a look at one of the potential duplicates
hospital_dupes[hospital_dupes.index.isin([51567, 41166])]


Out[48]:
Facility Name Address City State ZIP Code County Name Phone Number Hospital Type Hospital Ownership
Account_Num
41166 ST VINCENT HOSPITAL 835 S VAN BUREN ST GREEN BAY WI 54301 BROWN (920) 433-0111 Acute Care Hospitals Voluntary non-profit - Church
51567 SAINT VINCENT HOSPITAL 835 SOUTH VAN BUREN ST GREEN BAY WI 54301 BROWN (920) 433-0112 Acute Care Hospitals Voluntary non-profit - Church