In [ ]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process as fuzzy_proc
import numpy as np

In [ ]:
#_______________________
class ComboFuzzer:
    """Combine several fuzzywuzzy scorers into a single score in [0, 1]."""
    def __init__(self, fuzzers):
        self.fuzzers = fuzzers
        # Pre-compute the normalisation constant
        # NB: stored as the inverse so combo_fuzz only has to multiply
        self.norm = 1 / np.sqrt(len(fuzzers))

    def combo_fuzz(self, target, candidate):
        """Return the root-mean-square of the individual fuzzer scores."""
        _score = 0
        for _fuzz in self.fuzzers:
            # fuzzywuzzy scorers return 0-100, so rescale to 0-1
            _raw_score = _fuzz(target, candidate) / 100
            _score += _raw_score ** 2
        return np.sqrt(_score) * self.norm

#_______________________
class LatLonGetter:
    """Match institute names to GRID entries and return their coordinates."""
    def __init__(self, grid_df, scorer):
        self.scorer = scorer
        self.df = grid_df
        # Split out the rows that have a non-null alias
        null_alias = pd.isnull(self.df.alias)
        not_null = self.df.loc[~null_alias]
        # Candidate strings are the standard names plus the non-null aliases
        alias_names = list(not_null.alias.values)
        std_names = list(grid_df.Name.values)
        self.all_possible_values = std_names + alias_names
        # Lower-cased copy, used for the fast exact-match lookup
        self.lower_possible_values = [x.lower() for x in
                                      self.all_possible_values]
        # Cache of fuzzy-match results, so each distinct name is only
        # fuzzy-matched once (could also be pre-seeded from a previous run)
        self.fuzzy_matches = {}
        
    def get_latlon(self, mak_name):
        """Return (lat, lon, match_score) for a single institute name."""
        assert mak_name != ""
        # Super-fast check to see if there is an exact match
        # against the lower-cased candidate strings
        try:
            idx = self.lower_possible_values.index(mak_name)
            match = self.all_possible_values[idx]
            score = 1.
        # Otherwise, fuzzy match
        except ValueError:
            # Reuse a previous fuzzy match for this name, if there is one
            if mak_name in self.fuzzy_matches:
                match, score = self.fuzzy_matches[mak_name]
            # Otherwise, do the fuzzy match and cache the result
            else:
                match, score = fuzzy_proc.extractOne(query=mak_name,
                                                     choices=self.all_possible_values,
                                                     scorer=self.scorer)
                self.fuzzy_matches[mak_name] = (match, score)

        # Check whether the match was a Name or an alias
        condition = self.df.Name == match
        if condition.sum() == 0:
            condition = self.df.alias == match
        _df = self.df.loc[condition]

        # Get the lat/lon of the first matching row
        lat = _df["lat"].values[0]
        lon = _df["lng"].values[0]
        return (lat, lon, score)


    def process_latlons(self, mak_institutes):
        """Return a list of (lat, lon, score) tuples, or [] if no institutes."""
        isnull = pd.isnull(mak_institutes)
        # pd.isnull returns a bool for a scalar, or an array for a collection
        if isinstance(isnull, bool):
            if isnull:
                return []
        elif all(isnull):
            return []
        return [self.get_latlon(mak_name)
                for mak_name in mak_institutes]
    
#_______________________
# Build the matcher: a fuzzy combination of token sort ratio and partial ratio
# over the GRID names and aliases

grid_full = pd.read_csv("/Users/hep/Downloads/grid20170810/grid.csv", low_memory=False)
grid_address = pd.read_csv("/Users/hep/Downloads/grid20170810/full_tables/addresses.csv", low_memory=False)
grid_alias = pd.read_csv("/Users/hep/Downloads/grid20170810/full_tables/aliases.csv", low_memory=False)

# Join addresses and aliases onto the main GRID table
# (institutions with more than one alias appear once per alias)
grid_df = grid_full.join(grid_address.set_index(keys=["grid_id"]), on="ID")
grid_df = grid_df.join(grid_alias.set_index(keys=["grid_id"]), on="ID")
grid_df = grid_df[["Name", "lat", "lng", "ID", "alias"]]
grid_df.head()

cf = ComboFuzzer([fuzz.token_sort_ratio, fuzz.partial_ratio])
llg = LatLonGetter(grid_df=grid_df, scorer=cf.combo_fuzz)
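
In [ ]:
# Quick, illustrative sanity check of the combined scorer (the example strings
# below are made up and not part of the original pipeline). Since combo_fuzz is
# the root-mean-square of the individual fuzzer scores, identical strings should
# score 1.0 and unrelated strings much lower.
print(cf.combo_fuzz("university of oxford", "university of oxford"))
print(cf.combo_fuzz("university of oxford", "acme widgets ltd"))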

In [ ]:
mak_df = pd.read_csv("/Users/hep/Downloads/19_10_2017_organisations_to_geocode.csv",names=["institutes","n"])
lat_lon_score = llg.process_latlons(mak_df["institutes"])
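
In [ ]:
# One (lat, lon, score) tuple is expected per row of mak_df (assuming the
# institutes column is not entirely null), so a quick length check and a peek
# at the first few results:
assert len(lat_lon_score) == len(mak_df)
lat_lon_score[:5]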

In [ ]:
mak_df["latitude"] = [lat for lat,lon,score in lat_lon_score]
mak_df["longitude"] = [lon for lat,lon,score in lat_lon_score]
mak_df["grid_mak_match_score"] = [score for lat,lon,score in lat_lon_score]

In [ ]:
mak_df.head()

In [ ]:
mak_df["grid_mak_match_score"].describe()

In [ ]: