In [ ]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process as fuzzy_proc
import numpy as np
In [ ]:
#_______________________
class ComboFuzzer:
    """Combine several fuzzy-string scorers into a single score.

    Each scorer is called as ``scorer(target, candidate)`` and its result is
    divided by 100 before being combined, so the scorers are expected to
    return values on a 0-100 scale. The combined score is the
    root-mean-square of the rescaled scores.
    """

    def __init__(self, fuzzers):
        """Store the scorers and pre-compute the normalisation factor.

        Args:
            fuzzers: Iterable of callables taking ``(target, candidate)``.

        Raises:
            ValueError: If ``fuzzers`` is empty. Without this check the
                normalisation would be ``1/0`` and every score ``nan``.
        """
        # Copy so that later mutation of the caller's list cannot make the
        # cached normalisation inconsistent with the scorers actually used.
        self.fuzzers = list(fuzzers)
        if not self.fuzzers:
            raise ValueError("ComboFuzzer needs at least one scorer")
        # Define the normalisation variable in advance
        # NB: defined as inverse for speed
        self.norm = 1 / np.sqrt(len(self.fuzzers))

    def combo_fuzz(self, target, candidate):
        """Return the root-mean-square of all scorers for one string pair.

        Args:
            target: First argument forwarded to every scorer.
            candidate: Second argument forwarded to every scorer.

        Returns:
            ``sqrt(sum((score / 100) ** 2)) / sqrt(number of scorers)``.
        """
        _score = sum((_fuzz(target, candidate) / 100) ** 2
                     for _fuzz in self.fuzzers)
        return np.sqrt(_score) * self.norm
#_______________________
class LatLonGetter:
    """Look up latitude/longitude for institute names in a GRID table.

    A query name is matched first exactly (against the lower-cased names and
    aliases) and otherwise with a fuzzy scorer. Fuzzy results are cached in
    ``self.fuzzy_matches`` so each distinct query is only scored once.
    """

    def __init__(self, grid_df, scorer):
        """Build the list of candidate strings from ``grid_df``.

        Args:
            grid_df: DataFrame with at least the columns ``Name``, ``alias``,
                ``lat`` and ``lng``.
            scorer: Callable passed as ``scorer`` to
                ``fuzzy_proc.extractOne`` for non-exact matches.
        """
        self.scorer = scorer
        self.df = grid_df
        # Candidate strings are every name plus every non-null alias. Null
        # names are dropped as well, since ``.lower()`` below would fail on
        # them.
        std_names = list(self.df["Name"].dropna().values)
        alias_names = list(self.df["alias"].dropna().values)
        self.all_possible_values = std_names + alias_names
        self.lower_possible_values = [x.lower() for x in
                                      self.all_possible_values]
        # Lower-cased value -> original value, for O(1) exact matching.
        # ``setdefault`` keeps the first occurrence, which is what a
        # ``list.index`` scan over ``lower_possible_values`` would return.
        self._exact_lookup = {}
        for original, lowered in zip(self.all_possible_values,
                                     self.lower_possible_values):
            self._exact_lookup.setdefault(lowered, original)
        # Cache of query -> (match, score) for fuzzy matches already done.
        self.fuzzy_matches = {}
        # Disabled alternatives for pre-seeding the cache, kept for reference:
        # with open("fuzzy_scores.pydict") as f:
        # self.fuzzy_matches = ast.literal_eval(f.read())
        # self.fuzzy_matches = {"ibm" : ("IBM (United States)",1.),
        # "microsoft" : ("Microsoft (United States)",1.),
        # "xerox" : ("Xerox (United States)", 1.),
        # "pricewaterhousecoopers" : ("PricewaterhouseCoopers (United States)",1.),
        # "university of california berkeley": ("University of California, Berkeley",1.),
        # "university of california santa cruz": ("University of California, Santa Cruz",1.),
        # "linkoping university": ("Linköping University",1.),
        # "nec" : ("NEC (United States)",1.),
        # "university of michigan" : ("Michigan State University",1.),
        # "google" : ("Google (United States)",1.),
        # "yahoo" : ("Yahoo (United States)",1.),
        # "at t" : ("AT&T (United States)",1.),
        # "at t labs" : ("AT&T (United States",1.)}

    def get_latlon(self, mak_name):
        """Return ``(lat, lon, score)`` for the best match to ``mak_name``.

        Args:
            mak_name: Non-empty query string. It is compared as given against
                the lower-cased candidates, so it should already be
                lower-case for the exact match to succeed.

        Returns:
            Tuple of the ``lat`` and ``lng`` values of the first matching row
            and the match score (``1.`` for an exact match, otherwise
            whatever ``self.scorer`` produced).

        Raises:
            AssertionError: If ``mak_name`` is the empty string.
            IndexError: If the matched string is in neither the ``Name`` nor
                the ``alias`` column of the table.
        """
        assert mak_name != "", "mak_name must not be an empty string"
        # Super-fast check to see if there is an exact match
        match = self._exact_lookup.get(mak_name)
        if match is not None:
            score = 1.
        # If already done a fuzzy match for this
        elif mak_name in self.fuzzy_matches:
            match, score = self.fuzzy_matches[mak_name]
        # Otherwise, do the fuzzy match
        else:
            match, score = fuzzy_proc.extractOne(
                query=mak_name,
                choices=self.all_possible_values,
                scorer=self.scorer)
            self.fuzzy_matches[mak_name] = (match, score)
        # Check whether the match was a Name or alias. Use the instance's own
        # table rather than a module-level global, so the getter works for
        # whichever DataFrame it was constructed with.
        condition = self.df["Name"] == match
        if not condition.any():
            condition = self.df["alias"] == match
        _df = self.df.loc[condition]
        # Get the lat/lon
        lat = _df["lat"].values[0]
        lon = _df["lng"].values[0]
        return (lat, lon, score)

    def process_latlons(self, mak_institutes):
        """Run :meth:`get_latlon` over a collection of institute names.

        Args:
            mak_institutes: An iterable of names, a single name string, or a
                null value.

        Returns:
            A list of ``(lat, lon, score)`` tuples, one per name. The list is
            empty when the input is null or every entry is null.
        """
        # A bare string is one name; iterating it would look up each
        # character separately.
        if isinstance(mak_institutes, str):
            mak_institutes = [mak_institutes]
        isnull = pd.isnull(mak_institutes)
        # ``pd.isnull`` gives a single boolean for scalar input and an
        # element-wise result for collections.
        if isinstance(isnull, (bool, np.bool_)):
            if isnull:
                return []
        elif all(isnull):
            return []
        return [self.get_latlon(mak_name)
                for mak_name in mak_institutes]
#_______________________
# Load the three GRID tables (main table, addresses, aliases)
grid_full = pd.read_csv(
    "/Users/hep/Downloads/grid20170810/grid.csv",
    low_memory=False,
)
grid_address = pd.read_csv(
    "/Users/hep/Downloads/grid20170810/full_tables/addresses.csv",
    low_memory=False,
)
grid_alias = pd.read_csv(
    "/Users/hep/Downloads/grid20170810/full_tables/aliases.csv",
    low_memory=False,
)
# Join the address and alias tables onto the main table via the GRID ID,
# then keep only the columns needed for matching
grid_df = (
    grid_full
    .join(grid_address.set_index(keys=["grid_id"]), on="ID")
    .join(grid_alias.set_index(keys=["grid_id"]), on="ID")
)
grid_df = grid_df[["Name", "lat", "lng", "ID", "alias"]]
grid_df.head()
# Fuzzy combination of partial ratio and token sort ratio
cf = ComboFuzzer([fuzz.token_sort_ratio, fuzz.partial_ratio])
llg = LatLonGetter(grid_df=grid_df, scorer=cf.combo_fuzz)
In [ ]:
# Read the organisations to geocode (the file has no header row, so the
# column names are supplied here) and match every institute against GRID
mak_df = pd.read_csv(
    "/Users/hep/Downloads/19_10_2017_organisations_to_geocode.csv",
    names=["institutes", "n"],
)
lat_lon_score = llg.process_latlons(mak_institutes=mak_df["institutes"])
In [ ]:
# Each entry of lat_lon_score is a (lat, lon, score) tuple; split the
# components out into their own columns
mak_df["latitude"] = [entry[0] for entry in lat_lon_score]
mak_df["longitude"] = [entry[1] for entry in lat_lon_score]
mak_df["grid_mak_match_score"] = [entry[2] for entry in lat_lon_score]
In [ ]:
# Preview the first rows of the geocoded table
mak_df.head(n=5)
In [ ]:
# Summary statistics of the match scores
mak_df.loc[:, "grid_mak_match_score"].describe()
In [ ]: