In [1]:
import codecs, nltk, json, string, cPickle as pickle, random, collections, dedupe, numpy as np, itertools, time, re
import pandas as pd
from string import punctuation
pd.set_option('display.max_rows', 500)

from sqlalchemy import create_engine
from connect import mysqlusername, mysqlpassword, mysqlserver, mysqldbname


/Users/jost/courses/clinicaltrials/env/lib/python2.7/site-packages/dedupe/backport.py:12: UserWarning: NumPy linked against 'Accelerate.framework'. Multiprocessing will be disabled. http://mail.scipy.org/pipermail/numpy-discussion/2012-August/063589.html
  warnings.warn("NumPy linked against 'Accelerate.framework'. "

Get facility data and prepare list to dedupe


In [ ]:
mysqlserver = 'localhost'
mysqldbname = 'clinicaltrials2'
conn = create_engine('mysql://%s:%s@%s/%s' % (mysqlusername, mysqlpassword, mysqlserver, mysqldbname))
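
The engine isn't used by the cells in this section, but the same facility table could be pulled straight from MySQL instead of the flat file (a sketch; the table name 'facilities' is an assumption):

In [ ]:
# facilities = pd.read_sql('SELECT * FROM facilities', conn)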

In [2]:
column_names = ["facility_id",
                "nct_id",
                "status",
                "facility_name",
                "city",
                "state",
                "zipcode",
                "country"]

facilities = pd.read_csv('../data/facilities.txt', names=column_names, sep="|", encoding='utf-8', quoting=3)

for c in column_names[2:]:
    # Replace missing values with empty strings so the string comparisons
    # and regexes below never see NaN.
    facilities[c] = facilities[c].apply(lambda x: x if pd.notnull(x) else u'')

In [3]:
# Distinct state values among U.S. facilities; clustering below runs state by state.
states = facilities[facilities.country == 'United States'].groupby(facilities.state).count().to_dict()['state'].keys()

In [4]:
bad_names = [r'investigat[a-z]+ site',
             r'research site',
             r'research facility',
             r'local institution',
             r'study site',
             r'clinical site',
             r'call center',
             r'site ref',
             r'site[ :]+#?[0-9]+',
             r'^#?[0-9\.]+$',
             r'for additional information',
             r'call for information',
             r'the study is '
             ]

# Keep U.S. facilities whose names match none of the generic "bad name"
# patterns, then drop exact duplicates on the identifying fields.
potential = facilities[(facilities.country == 'United States') &
                       (facilities.facility_name.apply(
                           lambda x: x != u'' and
                           all(not re.search(b, x.lower()) for b in bad_names)))]
potential = potential[['facility_id', 'facility_name', 'city', 'state', 'zipcode', 'country']] \
                .drop_duplicates(['facility_name', 'city', 'state', 'zipcode', 'country'])

potential.shape


Out[4]:
(101559, 6)
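
As a quick sanity check on the name filter, a hypothetical handful of names shows which patterns fire (illustrative only, not part of the pipeline):

In [ ]:
test_names = [u'Investigational Site 001', u'Research Site', u'Site #42',
              u'Mayo Clinic', u'University of Michigan Medical Center']
# Only the last two survive the bad_names patterns.
[n for n in test_names
 if all(not re.search(b, n.lower()) for b in bad_names)]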

Load model information and get blocking data ready


In [5]:
settings_file = '../data/dedupe_settings2'
# The settings file is a binary pickle; open it in 'rb' mode for portability.
deduper = dedupe.StaticDedupe(open(settings_file, 'rb'))


INFO:dedupe.api:Learned Weights
INFO:dedupe.api:('(city: ShortString)', -0.4464088976383209)
INFO:dedupe.api:('(facility_name: String)', -1.2190216779708862)
INFO:dedupe.api:('(zipcode: ShortString)', -0.49488386511802673)
INFO:dedupe.api:('(state: ShortString)', -0.48534145951271057)
INFO:dedupe.api:('((zipcode: ShortString): Not Missing)', 0.14800745248794556)
INFO:dedupe.api:('((city: ShortString): Not Missing)', 1.1579594612121582)
INFO:dedupe.api:('bias', 2.345712661743164)
INFO:dedupe.api:[CompoundPredicate: (SimplePredicate: (nearIntegersPredicate, zipcode), SimplePredicate: (sameSevenCharStartPredicate, facility_name)), CompoundPredicate: (SimplePredicate: (wholeFieldPredicate, zipcode), TfidfPredicate: (0.8, facility_name)), CompoundPredicate: (SimplePredicate: (sameThreeCharStartPredicate, zipcode), SimplePredicate: (sameSevenCharStartPredicate, facility_name)), CompoundPredicate: (SimplePredicate: (wholeFieldPredicate, facility_name), SimplePredicate: (tokenFieldPredicate, city)), CompoundPredicate: (SimplePredicate: (tokenFieldPredicate, zipcode), TfidfPredicate: (0.2, facility_name)), CompoundPredicate: (SimplePredicate: (wholeFieldPredicate, zipcode), TfidfPredicate: (0.6, facility_name)), CompoundPredicate: (SimplePredicate: (sameSevenCharStartPredicate, city), SimplePredicate: (firstTokenPredicate, facility_name)), CompoundPredicate: (SimplePredicate: (sameSevenCharStartPredicate, city), TfidfPredicate: (0.2, facility_name)), CompoundPredicate: (SimplePredicate: (firstIntegerPredicate, zipcode), SimplePredicate: (sameThreeCharStartPredicate, facility_name)), CompoundPredicate: (TfidfPredicate: (0.8, facility_name), SimplePredicate: (sameSevenCharStartPredicate, facility_name)), CompoundPredicate: (TfidfPredicate: (0.4, facility_name), SimplePredicate: (wholeFieldPredicate, city)), CompoundPredicate: (TfidfPredicate: (0.6, facility_name), SimplePredicate: (sameSevenCharStartPredicate, facility_name)), CompoundPredicate: (SimplePredicate: (firstIntegerPredicate, zipcode), SimplePredicate: (commonSixGram, facility_name)), CompoundPredicate: (TfidfPredicate: (0.2, facility_name), SimplePredicate: (commonFourGram, zipcode)), CompoundPredicate: (SimplePredicate: (sameSevenCharStartPredicate, city), SimplePredicate: (sameThreeCharStartPredicate, facility_name)), CompoundPredicate: (TfidfPredicate: (0.4, facility_name), SimplePredicate: (sameSevenCharStartPredicate, facility_name)), CompoundPredicate: (SimplePredicate: (tokenFieldPredicate, zipcode), SimplePredicate: (sameSevenCharStartPredicate, state)), CompoundPredicate: (SimplePredicate: (tokenFieldPredicate, facility_name), SimplePredicate: (sameSevenCharStartPredicate, city)), CompoundPredicate: (SimplePredicate: (tokenFieldPredicate, facility_name), SimplePredicate: (nearIntegersPredicate, zipcode))]
INFO:dedupe.api:{'facility_name': set([u'and', u'Oncology', u'Center', u'Cancer', u'Institute', u'of', u'University', u'Research', u's', u'Associates', u'Health', u'Hospital', u'Clinical', u'Medical'])}
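
The settings file encodes a model trained in a separate session. For context, a training run with this era's dedupe API would look roughly like the sketch below (field definitions inferred from the learned weights above; `data_d` is a hypothetical id-to-record dict, and the cell is illustrative, not run here):

In [ ]:
fields = {'facility_name': {'type': 'String'},
          'city':          {'type': 'ShortString', 'Has Missing': True},
          'state':         {'type': 'ShortString'},
          'zipcode':       {'type': 'ShortString', 'Has Missing': True}}

trainer = dedupe.Dedupe(fields)
trainer.sample(data_d, 150000)   # draw candidate pairs to label
dedupe.consoleLabel(trainer)     # interactive yes/no labeling
trainer.train()
with open(settings_file, 'wb') as sf:
    trainer.writeSettings(sf)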

In [9]:
potential_indexed = potential.set_index('facility_id')

def candidates_gen(result_set):
    """Group rows (sorted by block_key) into blocks of candidate records."""
    block_key = None
    records = []
    for row in result_set:
        if row['block_key'] != block_key:
            # Starting a new block: emit the previous one, if any.
            if records:
                yield tuple(records)

            block_key = row['block_key']
            records = []

        records.append((row['facility_id'], row, set()))

    # Emit the final block, as a tuple to match the yields above.
    if records:
        yield tuple(records)
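
Note that candidates_gen assumes its input is sorted by block_key (the main loop below sorts the blocks before building its input). A toy input illustrates the grouping (hypothetical rows, not run):

In [ ]:
rows = [{'block_key': 'a', 'facility_id': 1},
        {'block_key': 'a', 'facility_id': 2},
        {'block_key': 'b', 'facility_id': 3}]
# Two blocks: one with records 1 and 2, one with record 3 -> [2, 1]
[len(block) for block in candidates_gen(rows)]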

Clustering by state


In [30]:
dupes = {}

# Build the TF-IDF canopies once, over the full national dataset, so that
# blocking keys are comparable across states.
for field in deduper.blocker.tfidf_fields:
    field_data = potential[['facility_id', field]].T.to_dict().values()
    deduper.blocker.tfIdfBlock(field_data, field)

for s in states:
    print s

    # Block only this state's records, then keep just the blocks with more
    # than one record (singletons can't contain duplicates).
    full_data = [(f, potential_indexed.loc[f].to_dict()) for f in potential_indexed[potential_indexed.state == s].index]
    blocks = [(i, j[0], j[1]) for i, j in enumerate(sorted(list(deduper.blocker(full_data))))]

    bkfreq = nltk.FreqDist([bk for i, bk, f in blocks])

    c = [dict([(k, v if pd.notnull(v) else u'')
               for k, v in potential_indexed.loc[f].to_dict().items()] + [('block_key', bk), ('facility_id', f)])
         for i, bk, f in blocks if bkfreq[bk] > 1]

    # 0.5 is the match-score threshold: pairs scoring below it aren't clustered.
    clustered_dupes = deduper.matchBlocks(candidates_gen(c), .5)
    print '%d clusters' % len(clustered_dupes)
    print

    dupes[s] = clustered_dupes


INFO:dedupe.blocking:Tue Nov 25 21:38:42 2014
INFO:dedupe.blocking:Canopy: TfidfPredicate: (0.4, facility_name)
INFO:dedupe.blocking:Canopy: TfidfPredicate: (0.6, facility_name)
INFO:dedupe.blocking:Canopy: TfidfPredicate: (0.8, facility_name)
INFO:dedupe.blocking:Canopy: TfidfPredicate: (0.2, facility_name)
INFO:dedupe.blocking:Tue Nov 25 21:38:42 2014
Mississippi
61 clusters

Oklahoma
161 clusters

Delaware
32 clusters

Minnesota
223 clusters

Illinois
520 clusters

Arkansas
116 clusters

New Mexico
64 clusters

Indiana
305 clusters

Maryland
362 clusters

Louisiana
235 clusters

Idaho
62 clusters

Wyoming
3 clusters

Tennessee
369 clusters

Arizona
324 clusters

Iowa
104 clusters

Michigan
403 clusters

Kansas
140 clusters

Utah
136 clusters

Virginia
359 clusters

Oregon
196 clusters

Connecticut
232 clusters

Montana
44 clusters
INFO:dedupe.blocking:10000, 1.2280222 seconds

California
1834 clusters

Massachusetts
399 clusters

West Virginia
45 clusters

South Carolina
234 clusters

New Hampshire
49 clusters

Wisconsin
232 clusters

Vermont
40 clusters

Georgia
446 clusters

North Dakota
51 clusters

Pennsylvania
665 clusters

Florida
1243 clusters

Alaska
15 clusters

Kentucky
164 clusters

Hawaii
50 clusters

Nebraska
114 clusters

Missouri
329 clusters

Ohio
561 clusters

Alabama
209 clusters

New York
977 clusters

South Dakota
41 clusters

Colorado
335 clusters

New Jersey
386 clusters

Washington
323 clusters

North Carolina
530 clusters

District of Columbia
85 clusters

Texas
1078 clusters

Nevada
104 clusters

Maine
42 clusters

Rhode Island
78 clusters
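
Before persisting, it helps to spot-check a few clusters; 'Rhode Island' is an arbitrary choice here, and in this dedupe version each cluster is a tuple of facility_ids (newer releases also attach confidence scores):

In [ ]:
for cluster in dupes['Rhode Island'][:3]:
    print potential_indexed.loc[list(cluster)][['facility_name', 'city', 'zipcode']]
    print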


In [31]:
pickle.dump(dupes, open('../data/clustered_dupes_state.pkl','wb'))
