In [1]:
import codecs, nltk, json, string, cPickle as pickle, random, collections, dedupe, numpy as np, itertools, time, re
import pandas as pd
from string import punctuation
# Raise pandas' 'display.max_rows' option to 500 for frames displayed in this notebook.
pd.set_option('display.max_rows', 500)


/Users/jost/courses/clinicaltrials/env/lib/python2.7/site-packages/dedupe/backport.py:12: UserWarning: NumPy linked against 'Accelerate.framework'. Multiprocessing will be disabled. http://mail.scipy.org/pipermail/numpy-discussion/2012-August/063589.html
  warnings.warn("NumPy linked against 'Accelerate.framework'. "

Get facility data and prepare list to dedupe


In [2]:
# Column names for the pipe-delimited facilities file, in file order.
# They are passed to read_csv explicitly via `names=`.
column_names = ["facility_id",
                "nct_id",
                "status",
                "facility_name",
                "city",
                "state",
                "zipcode",
                "country"]

# quoting=3 is csv.QUOTE_NONE: quote characters in the file are treated as
# ordinary text rather than as field delimiters.
facilities = pd.read_csv('../data/facilities.txt', names=column_names, sep="|", encoding='utf-8', quoting=3)

# Replace missing values with empty unicode strings in every column except
# the two leading ID columns, so later string operations (e.g. .lower())
# never encounter NaN. fillna does this without a per-element Python lambda.
text_columns = column_names[2:]
for c in text_columns:
    facilities[c] = facilities[c].fillna(u'')

In [3]:
# Collect the distinct `state` values that occur on United States rows.
# The keys of the per-state count dict are used, so the ordering is whatever
# the dict yields (not alphabetical).
us_facilities = facilities[facilities.country == 'United States']
state_counts = us_facilities.groupby(facilities.state).count().to_dict()['state']
states = state_counts.keys()

# Show each state next to its position in `states`.
print(list(enumerate(states)))


[(0, u'Mississippi'), (1, u'Oklahoma'), (2, u'Delaware'), (3, u'Minnesota'), (4, u'Illinois'), (5, u'Arkansas'), (6, u'New Mexico'), (7, u'Indiana'), (8, u'Maryland'), (9, u'Louisiana'), (10, u'Idaho'), (11, u'Wyoming'), (12, u'Tennessee'), (13, u'Arizona'), (14, u'Iowa'), (15, u'Michigan'), (16, u'Kansas'), (17, u'Utah'), (18, u'Virginia'), (19, u'Oregon'), (20, u'Connecticut'), (21, u'Montana'), (22, u'California'), (23, u'Massachusetts'), (24, u'West Virginia'), (25, u'South Carolina'), (26, u'New Hampshire'), (27, u'Wisconsin'), (28, u'Vermont'), (29, u'Georgia'), (30, u'North Dakota'), (31, u'Pennsylvania'), (32, u'Florida'), (33, u'Alaska'), (34, u'Kentucky'), (35, u'Hawaii'), (36, u'Nebraska'), (37, u'Missouri'), (38, u'Ohio'), (39, u'Alabama'), (40, u'New York'), (41, u'South Dakota'), (42, u'Colorado'), (43, u'New Jersey'), (44, u'Washington'), (45, u'North Carolina'), (46, u'District of Columbia'), (47, u'Texas'), (48, u'Nevada'), (49, u'Maine'), (50, u'Rhode Island')]

In [4]:
# Regular expressions, matched against the lower-cased facility name, that
# mark a name as unusable for de-duplication (e.g. "research site",
# "site #12", a bare number, or "call for information").
bad_names = [r'investigat[a-z]+ site',
             r'research site',
             r'research facility',
             r'local institution',
             r'study site',
             r'clinical site',
             r'call center',
             r'site ref',
             r'site[ :]+#?[0-9]+',
             r'^#?[0-9\.]+$',
             r'for additional information',
             r'call for information',
             r'the study is '
             ]

# Compiled once up front so the predicate below does not go through
# re.search's pattern lookup for every (name, pattern) combination.
bad_name_patterns = [re.compile(pattern) for pattern in bad_names]


def is_usable_facility_name(name):
    """Return True if ``name`` is non-empty and matches none of ``bad_names``.

    The name is lower-cased once and then searched with each compiled
    pattern; the search stops at the first pattern that matches.
    """
    if name == u'':
        return False
    lowered = name.lower()
    return not any(pattern.search(lowered) for pattern in bad_name_patterns)


# Row masks: United States facilities whose name passes the filter above.
is_us = facilities.country == 'United States'
has_usable_name = facilities.facility_name.apply(is_usable_facility_name)

# Keep only the columns used downstream and drop rows that repeat the same
# (name, city, state, zipcode, country) combination; facility_id is not part
# of the duplicate key, so the first facility_id seen for each combination
# is the one retained.
keep_columns = ['facility_id', 'facility_name', 'city', 'state', 'zipcode', 'country']
duplicate_key = ['facility_name', 'city', 'state', 'zipcode', 'country']
potential = facilities.loc[is_us & has_usable_name, keep_columns].drop_duplicates(duplicate_key)

In [5]:
# Sanity check: (rows, columns) of the filtered, de-duplicated candidate frame.
potential.shape


Out[5]:
(101559, 6)

Train model

(Training is done on data from all states.)


In [6]:
# Sample 1000 random pairs; each pair element is used below as a key into
# temp_d, i.e. as a row position within `potential`.
random_pairs = dedupe.randomPairs(len(potential), 1000)

# Row position -> facility_id.
id_lookup = dict(enumerate(potential.facility_id))
potential_indexed = potential.set_index('facility_id')

# Row position -> immutable record holding the non-id columns of that row.
# Rows are walked once, in order, instead of translating each position to a
# facility_id and looking it up again with .loc: that round trip is slow and
# would return a DataFrame (not a single row) if a facility_id ever repeated.
field_names = list(potential_indexed.columns)
temp_d = {}
for position, values in enumerate(potential_indexed.itertuples(index=False)):
    temp_d[position] = dedupe.frozendict(dict(zip(field_names, values)))


def random_pair_generator():
    """Yield a ``(record, record)`` tuple for every sampled pair of row positions."""
    for k1, k2 in random_pairs:
        yield (temp_d[k1], temp_d[k2])


# Materialise the sampled record pairs; this tuple is what gets handed to dedupe.
todo = tuple(random_pair_generator())

In [7]:
# Field definitions given to dedupe: which record fields to compare, the
# comparison type for each, and which fields may be empty.
variables = [
    {'field': 'facility_name', 'type': 'String'},
    {'field': 'city', 'type': 'ShortString', 'has missing': True},
    {'field': 'state', 'type': 'ShortString'},
    {'field': 'zipcode', 'type': 'ShortString', 'has missing': True},
]

# Build the deduper from the field definitions and the sampled record pairs.
deduper = dedupe.Dedupe(variables, todo)

In [8]:
# Peek at the first five sampled record pairs to check their structure.
todo[:5]


Out[8]:
((<frozendict {'city': u'Nashville', 'facility_name': u'Sarah Cannon Research Institute BKM120D2204/BKM120D2205', 'state': u'Tennessee', 'zipcode': u'37203', 'country': u'United States'}>,
  <frozendict {'city': u'Pasadena', 'facility_name': u'Kyowa PD Site', 'state': u'California', 'zipcode': u'91104', 'country': u'United States'}>),
 (<frozendict {'city': u'New York', 'facility_name': u"Children's Tumor Fundation", 'state': u'New York', 'zipcode': u'10005', 'country': u'United States'}>,
  <frozendict {'city': u'Los Angeles', 'facility_name': u'Kerlan Jobe Orthopaedic Foundation', 'state': u'California', 'zipcode': u'90045', 'country': u'United States'}>),
 (<frozendict {'city': u'Winston Salem', 'facility_name': u'Wake Forest University School of Medicine', 'state': u'North Carolina', 'zipcode': u'27157-1082', 'country': u'United States'}>,
  <frozendict {'city': u'Stanford', 'facility_name': u'Stanford University Medical Center/Palo Alto VA', 'state': u'California', 'zipcode': u'94306', 'country': u'United States'}>),
 (<frozendict {'city': u'Charlotte', 'facility_name': u'Carolinas HealthCare System, Carolinas Med. Ctr.', 'state': u'North Carolina', 'zipcode': u'28203', 'country': u'United States'}>,
  <frozendict {'city': u'Spokane', 'facility_name': u'Spokane Allergy and Asthma Clinical Research', 'state': u'Washington', 'zipcode': u'99204', 'country': u'United States'}>),
 (<frozendict {'city': u'San Antonio', 'facility_name': u'Audie L Murphy VA Hospital - Pathology Laboratory', 'state': u'Texas', 'zipcode': u'78229', 'country': u'United States'}>,
  <frozendict {'city': u'New York', 'facility_name': u'New York Downtown Hospital', 'state': u'New York', 'zipcode': u'10038', 'country': u'United States'}>))

In [8]:
# If labelled training pairs were saved by an earlier session, load them
# into the deduper. The file is opened in a `with` block so the handle is
# closed as soon as reading finishes (or fails).
training_file = '../data/dedupe_training2.json'
with open(training_file, 'r') as training_fh:
    deduper.readTraining(training_fh)


INFO:dedupe.api:reading training from file
INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', -0.3377057909965515)
INFO:dedupe.api:('(facility_name: String)', -0.5926158428192139)
INFO:dedupe.api:('(zipcode: ShortString)', -0.1446063369512558)
INFO:dedupe.api:('(state: ShortString)', -0.15747596323490143)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.014522064477205276)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 0.27762895822525024)
INFO:dedupe.api:('bias', 0.9713605642318726)

In [9]:
# Interactive labelling: dedupe prints a pair of records and waits for
# (y)es / (n)o / (u)nsure / (f)inished on stdin. This cell blocks until
# 'f' is entered, so it cannot be run unattended.
dedupe.convenience.consoleLabel(deduper)


INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', 0.2886696755886078)
INFO:dedupe.api:('(facility_name: String)', 0.2886696755886078)
INFO:dedupe.api:('(zipcode: ShortString)', 0.2886696755886078)
INFO:dedupe.api:('(state: ShortString)', 0.2886696755886078)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.5773393511772156)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 0.5773393511772156)
INFO:dedupe.api:('bias', 0.7933143377304077)
INFO:dedupe.training:1.0
city : Albuquerque
facility_name : Albuquerque Neuroscience, Inc.
zipcode : 87102
state : New Mexico

city : Los Angeles
facility_name : 100 UCLA Medical Plaza, Suites 205-210
zipcode : 90095
state : California

Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', -0.2744453549385071)
INFO:dedupe.api:('(facility_name: String)', -0.19279177486896515)
INFO:dedupe.api:('(zipcode: ShortString)', -0.2744453549385071)
INFO:dedupe.api:('(state: ShortString)', -0.2744453549385071)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.6647651195526123)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 0.6647651195526123)
INFO:dedupe.api:('bias', 1.0299216508865356)
INFO:dedupe.training:1.0
city : New York
facility_name : Columbia University
zipcode : 10022
state : New York

city : New York
facility_name : Columbia University Department of General Medicine
zipcode : 10032
state : New York

Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
y
INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', -0.36342090368270874)
INFO:dedupe.api:('(facility_name: String)', -0.21308083832263947)
INFO:dedupe.api:('(zipcode: ShortString)', -0.07062216103076935)
INFO:dedupe.api:('(state: ShortString)', -0.36342090368270874)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.6956732869148254)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 0.6956732869148254)
INFO:dedupe.api:('bias', 1.2533769607543945)
INFO:dedupe.training:0.5
city : Lynwood
facility_name : Imperial care Dialysis Center
zipcode : 90262
state : California

city : Los Angeles
facility_name : UCLA-Los Angeles/Brazil AIDS Consortium (LABAC) CRS
zipcode : 90090-1752
state : California

Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
b
(y)es / (n)o / (u)nsure / (f)inished
n
INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', -0.6292026042938232)
INFO:dedupe.api:('(facility_name: String)', -0.5050867199897766)
INFO:dedupe.api:('(zipcode: ShortString)', -0.08035874366760254)
INFO:dedupe.api:('(state: ShortString)', 0.025177018716931343)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.6192004680633545)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 0.6192004680633545)
INFO:dedupe.api:('bias', 1.2575386762619019)
INFO:dedupe.training:0.666666666667
city : Columbia
facility_name : Lee Butterfield, MD
zipcode : 29204
state : South Carolina

city : Columbia
facility_name : S. Carolina Clinical Research Center
zipcode : 29201
state : South Carolina

Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', -0.29083937406539917)
INFO:dedupe.api:('(facility_name: String)', -0.9705556035041809)
INFO:dedupe.api:('(zipcode: ShortString)', -0.05567343905568123)
INFO:dedupe.api:('(state: ShortString)', 0.0726349800825119)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.5330178141593933)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 0.5330178141593933)
INFO:dedupe.api:('bias', 1.2072550058364868)
INFO:dedupe.training:0.75
city : Wichita
facility_name : University of Kansas
zipcode : 67211
state : Kansas

city : Prairie Village
facility_name : University of Kansas Medical Center
zipcode : 66208
state : Kansas

Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
y
INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', -0.07608988881111145)
INFO:dedupe.api:('(facility_name: String)', -1.1341122388839722)
INFO:dedupe.api:('(zipcode: ShortString)', 0.18054765462875366)
INFO:dedupe.api:('(state: ShortString)', -0.09868623316287994)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.5012407302856445)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 0.5012407302856445)
INFO:dedupe.api:('bias', 1.284653663635254)
INFO:dedupe.training:0.5
city : Boynton Beach
facility_name : Consultants for Clinical Research of South Florida
zipcode : 33426
state : Florida

city : Linden
facility_name : NJ Heart
zipcode : 07036
state : New Jersey

Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
n
INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', -0.05782230198383331)
INFO:dedupe.api:('(facility_name: String)', -1.0351841449737549)
INFO:dedupe.api:('(zipcode: ShortString)', 0.05359961837530136)
INFO:dedupe.api:('(state: ShortString)', -0.46648526191711426)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.4878253936767578)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 0.4878253936767578)
INFO:dedupe.api:('bias', 1.373679518699646)
INFO:dedupe.training:0.6
city : Los Angeles
facility_name : Kaiser Permanente Los Angeles Medical Center,4867 Sunset Blvd
zipcode : 90027
state : California

city : San Diego
facility_name : Kaiser Permanente
zipcode : 
state : California

Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
f
Finished labeling

In [10]:
# Fit the model on the labelled pairs and select blocking predicates (the
# log output lists the learned weights and the final predicate set).
# NOTE(review): ppc and uncovered_dupes appear to tune predicate selection --
# confirm their exact meaning against the docs of the installed dedupe version.
deduper.train(ppc=0.01, uncovered_dupes=5)


INFO:dedupe.api:20 folds
INFO:dedupe.crossvalidation:using cross validation to find optimum alpha...
INFO:dedupe.crossvalidation:optimum alpha: 0.010000
INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', -0.4464088976383209)
INFO:dedupe.api:('(facility_name: String)', -1.2190216779708862)
INFO:dedupe.api:('(zipcode: ShortString)', -0.49488386511802673)
INFO:dedupe.api:('(state: ShortString)', -0.48534145951271057)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.14800745248794556)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 1.1579594612121582)
INFO:dedupe.api:('bias', 2.345712661743164)
INFO:dedupe.blocking:Tue Nov 25 09:45:13 2014
INFO:dedupe.blocking:Canopy: TfidfPredicate: (0.4, facility_name)
INFO:dedupe.blocking:Canopy: TfidfPredicate: (0.6, facility_name)
INFO:dedupe.blocking:Canopy: TfidfPredicate: (0.8, facility_name)
INFO:dedupe.blocking:Canopy: TfidfPredicate: (0.2, facility_name)
INFO:dedupe.blocking:Tue Nov 25 09:45:41 2014
INFO:dedupe.training:coverage threshold: 322.64
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (nearIntegersPredicate, zipcode), SimplePredicate: (sameSevenCharStartPredicate, facility_name))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (wholeFieldPredicate, zipcode), TfidfPredicate: (0.8, facility_name))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (sameThreeCharStartPredicate, zipcode), SimplePredicate: (sameSevenCharStartPredicate, facility_name))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (wholeFieldPredicate, facility_name), SimplePredicate: (tokenFieldPredicate, city))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (tokenFieldPredicate, zipcode), TfidfPredicate: (0.2, facility_name))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (wholeFieldPredicate, zipcode), TfidfPredicate: (0.6, facility_name))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (sameSevenCharStartPredicate, city), SimplePredicate: (firstTokenPredicate, facility_name))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (sameSevenCharStartPredicate, city), TfidfPredicate: (0.2, facility_name))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (firstIntegerPredicate, zipcode), SimplePredicate: (sameThreeCharStartPredicate, facility_name))
INFO:dedupe.training:CompoundPredicate: (TfidfPredicate: (0.8, facility_name), SimplePredicate: (sameSevenCharStartPredicate, facility_name))
INFO:dedupe.training:CompoundPredicate: (TfidfPredicate: (0.4, facility_name), SimplePredicate: (wholeFieldPredicate, city))
INFO:dedupe.training:CompoundPredicate: (TfidfPredicate: (0.6, facility_name), SimplePredicate: (sameSevenCharStartPredicate, facility_name))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (firstIntegerPredicate, zipcode), SimplePredicate: (commonSixGram, facility_name))
INFO:dedupe.training:CompoundPredicate: (TfidfPredicate: (0.2, facility_name), SimplePredicate: (commonFourGram, zipcode))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (sameSevenCharStartPredicate, city), SimplePredicate: (sameThreeCharStartPredicate, facility_name))
INFO:dedupe.training:CompoundPredicate: (TfidfPredicate: (0.4, facility_name), SimplePredicate: (sameSevenCharStartPredicate, facility_name))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (tokenFieldPredicate, zipcode), SimplePredicate: (sameSevenCharStartPredicate, state))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (tokenFieldPredicate, facility_name), SimplePredicate: (sameSevenCharStartPredicate, city))
INFO:dedupe.training:CompoundPredicate: (SimplePredicate: (tokenFieldPredicate, facility_name), SimplePredicate: (nearIntegersPredicate, zipcode))

In [11]:
# Persist the labelled training pairs and the learned settings so a later
# session can reuse them. Each file is written inside a `with` block so the
# handle is flushed and closed immediately instead of whenever the anonymous
# file object happens to be garbage-collected.
settings_file = '../data/dedupe_settings2'
training_file = '../data/dedupe_training2.json'

with open(training_file, 'w') as training_fh:
    deduper.writeTraining(training_fh)

with open(settings_file, 'w') as settings_fh:
    deduper.writeSettings(settings_fh)

In [ ]: