In [1]:
import codecs, nltk, json, string, cPickle as pickle, random, collections, dedupe, numpy as np, itertools, time, re
import pandas as pd
from string import punctuation
# Allow up to this many rows when a DataFrame is displayed in the notebook.
MAX_DISPLAY_ROWS = 500
pd.set_option('display.max_rows', MAX_DISPLAY_ROWS)
In [2]:
# Column layout of the pipe-delimited facilities file; names are supplied here
# because read_csv is told not to take them from the file.
column_names = ["facility_id",
                "nct_id",
                "status",
                "facility_name",
                "city",
                "state",
                "zipcode",
                "country"]

# Everything after the two id columns is treated as text.
text_columns = column_names[2:]

facilities = pd.read_csv(
    '../data/facilities.txt',
    names=column_names,
    sep="|",
    encoding='utf-8',
    quoting=3,  # csv.QUOTE_NONE: quote characters in the data are kept literally
    # Force text columns to stay strings. Left to type inference, a column such
    # as zipcode can be parsed as a number and lose its leading zeros.
    dtype={col: object for col in text_columns},
)

# Represent missing text as an empty unicode string so later string
# operations (lower-casing, regex matching) never see NaN.
for col in text_columns:
    facilities[col] = facilities[col].fillna(u'')
In [3]:
# Distinct `state` values recorded for US facilities. Sorting makes the
# printed positions stable from run to run.
us_facilities = facilities[facilities.country == 'United States']
states = sorted(us_facilities.state.unique())

# Show each value next to its position so odd entries are easy to spot.
# The parenthesised form prints the same text on Python 2 and Python 3.
print(list(enumerate(states)))
In [4]:
# Regexes, matched against the lower-cased facility name, that mark a name as
# a placeholder rather than a real facility.
bad_names = [r'investigat[a-z]+ site',
             r'research site',
             r'research facility',
             r'local institution',
             r'study site',
             r'clinical site',
             r'call center',
             r'site ref',
             r'site[ :]+#?[0-9]+',
             r'^#?[0-9\.]+$',
             r'for additional information',
             r'call for information',
             r'the study is '
             ]

# Compile once instead of looking every pattern up again for every row.
bad_name_patterns = [re.compile(pattern) for pattern in bad_names]


def is_usable_name(name):
    """Return True if `name` is non-empty and matches none of `bad_names`.

    The name is lower-cased before matching. `any` stops at the first
    matching pattern instead of evaluating all of them.
    """
    if name == u'':
        return False
    lowered = name.lower()
    return not any(pattern.search(lowered) for pattern in bad_name_patterns)


# Columns that together identify one physical facility.
address_columns = ['facility_name', 'city', 'state', 'zipcode', 'country']

is_us = facilities.country == 'United States'
has_usable_name = facilities.facility_name.apply(is_usable_name)

# US facilities with a usable name, one row per distinct name/address
# (drop_duplicates keeps the first facility_id seen for each combination).
potential = (facilities
             .loc[is_us & has_usable_name, ['facility_id'] + address_columns]
             .drop_duplicates(address_columns))
In [5]:
# Sanity check: (rows, columns) of the filtered, de-duplicated candidate table.
potential.shape
Out[5]:
In [6]:
# Make the sample reproducible. It is not visible here which random number
# generator dedupe.randomPairs draws from, so seed both the stdlib and NumPy ones.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# 1000 random pairs of row positions into `potential`.
random_pairs = dedupe.randomPairs(len(potential), 1000)

# Row position -> facility_id.
id_lookup = {i: k for i, k in enumerate(potential.facility_id)}
potential_indexed = potential.set_index('facility_id')

# Row position -> immutable record dict. Rows are read by position: going
# through id_lookup and .loc gives the same row when ids are unique, but
# returns a DataFrame (and a wrong record dict) if a facility_id repeats.
temp_d = {}
for i in range(len(potential_indexed)):
    temp_d[i] = dedupe.frozendict(potential_indexed.iloc[i].to_dict())


def random_pair_generator():
    """Yield one (record, record) tuple for each sampled pair of positions."""
    for k1, k2 in random_pairs:
        yield (temp_d[k1], temp_d[k2])


# Materialise the sampled record pairs; this is what the deduper is built from.
todo = tuple(random_pair_generator())
In [7]:
# One definition per field the deduper compares. City and zipcode are flagged
# as possibly missing; facility_name and state are not.
name_field = {'field': 'facility_name', 'type': 'String'}
city_field = {'field': 'city', 'type': 'ShortString', 'has missing': True}
state_field = {'field': 'state', 'type': 'ShortString'}
zipcode_field = {'field': 'zipcode', 'type': 'ShortString', 'has missing': True}

variables = [name_field, city_field, state_field, zipcode_field]

# Build the deduper from the field definitions and the sampled record pairs.
deduper = dedupe.Dedupe(variables, todo)
In [8]:
# Peek at the first five sampled record pairs that were handed to the deduper.
todo[:5]
Out[8]:
In [8]:
# Only run this cell if training data has already been saved: it loads the
# saved training file into the deduper and raises if the file is missing.
training_file = '../data/dedupe_training2.json'

# `with` closes the file handle even if readTraining raises.
with open(training_file, 'r') as training_fh:
    deduper.readTraining(training_fh)
In [9]:
# Hand-label candidate pairs for `deduper` from the console.
# NOTE(review): per the dedupe docs this prompts for keyboard input, so the
# notebook presumably cannot run unattended past this cell - confirm.
dedupe.convenience.consoleLabel(deduper)
In [10]:
# Fit the deduper on the training pairs gathered so far.
# NOTE(review): ppc and uncovered_dupes are passed straight through to dedupe.
# In the dedupe docs of this era they appear to tune blocking (ppc: largest
# share of record pairs a blocking rule may cover; uncovered_dupes: labelled
# duplicate pairs the blocking rules may miss) - TODO confirm against the
# installed version.
deduper.train(ppc=0.01, uncovered_dupes=5)
In [11]:
# Persist the labelled pairs and the learned settings for later runs.
settings_file = '../data/dedupe_settings2'
training_file = '../data/dedupe_training2.json'

# `with` guarantees each file is flushed and closed once it has been written.
with open(training_file, 'w') as training_fh:
    deduper.writeTraining(training_fh)

# Binary mode: the dedupe documentation opens the settings file with 'wb'
# (presumably the settings are serialised as binary data, which text mode
# can corrupt on some platforms - confirm).
with open(settings_file, 'wb') as settings_fh:
    deduper.writeSettings(settings_fh)
In [ ]: