In [239]:
import requests
import json
from alphabet_detector import AlphabetDetector
import pandas as pd
import numpy as np
from fuzzywuzzy import process as fuzzy_proc
from fuzzywuzzy import fuzz
import ast

In [30]:
ad = AlphabetDetector()
def title_processor(title):
    # Keep characters belonging to any alphabet (including non-Latin)
    # and numeric characters; replace everything else with a space
    result = "".join([x if len(ad.detect_alphabet(x)) > 0 or x.isnumeric()
                      else " " for x in title.lower()])
    # Collapse consecutive spaces into one
    while "  " in result:
        result = result.replace("  "," ")
    return result

In [117]:
headers = {
    'Ocp-Apim-Subscription-Key': '<YOUR-SUBSCRIPTION-KEY>',
    'Content-Type': 'application/x-www-form-urlencoded'
}

def process_titles(raw_titles):

    titles = [(pid,title_processor(t)) for pid,t in raw_titles]

    # MAK returns at most 1000 results per query, so batch the titles
    # into chunks of 800 OR'd sub-queries
    title_count = 800
    title_offset = 0
    query_count = 1000

    calls = 0

    data = []
    while title_offset < len(titles):

        # Safety cap on the number of API calls
        calls += 1
        if calls > 10:
            break

        last_title = title_offset+title_count
        if last_title > len(titles):
            last_title = None

        # Concatenate this batch of titles into a single OR(...) expression
        titles_subset = titles[title_offset:last_title]
        expr = ["Ti='"+t+"'" for _,t in titles_subset]
        expr = ','.join(expr)
        expr = "expr=OR("+expr+")"
        title_offset += title_count

        query = expr+"&count="+str(query_count)+"&attributes=Id,Ti,D,AA.AuN,AA.AuId,F.FId,J.JId,AA.AfId,CC,ECC,AA.AfN,J.JN"
        #print(query)

        r = requests.post('https://westus.api.cognitive.microsoft.com/academic/v1.0/evaluate',
                          data=query.encode("utf-8"), headers=headers)
        js = r.json()

        print(len(js["entities"]),len(titles))
        
        # Match each requested title against the returned entities
        for pid,t in titles_subset:
            matched = False
            for row in js["entities"]:
                if t != row["Ti"]:
                    continue
                # Deduplicated list of author affiliations, where available
                insts = list(set(author["AfN"] for author in row["AA"] if "AfN" in author))
                data.append(dict(pid=pid,title=t,institutes=insts,citations=row["CC"],date=row["D"],matched=True))
                matched = True
                break
            if not matched:
                data.append(dict(pid=pid,title=t,matched=False))

    print("Made",calls,"calls")
    return data

In [118]:
#raw_titles = [(1,"Search for invisible decays of a Higgs boson using vector-boson fusion in pp collisions at s√=8 TeV with the ATLAS detector"),
#              (2,"Muon-induced background to proton decay in the p→K+ν decay channel with large underground liquid argon TPC detectors"),
#              (3,"personalizing search via automated analysis of interests and activities")]

df = pd.read_csv("/Users/hep/Downloads/ai_id_title.csv")
raw_titles = df[["id","title"]].values
data = process_titles(raw_titles)


629 7017
758 7017
776 7017
786 7017
796 7017
797 7017
809 7017
790 7017
660 7017
Made 9 calls

In [119]:
ncite = 0
ninst = 0
nmatch = 0 
nboth = 0
for row in data:
    if not row["matched"]:
        continue
    nmatch += 1
    if row["citations"] > 0:
        ncite += 1
    if len(row["institutes"]) > 0:
        ninst += 1
    if row["citations"] > 0 and len(row["institutes"]) > 0:
        nboth += 1
print(len(data),nmatch,ncite,ninst,nboth)


7017 6408 4459 5544 4117

In [120]:
with open('/Users/hep/Downloads/ai_id_title_MAK-matched.json', 'w') as fp:
    json.dump(data, fp)

In [264]:
mak_df = pd.read_json('/Users/hep/Nesta/coll_int_ai_case/notebooks/MAK_disambiguate/modules/CS_STATS_id_title_tag_MAK-matched.json')

In [265]:
mak_df.head()


Out[265]:
citations date institutes matched pid title
0 12.0 2008-11-13 [max planck society, heidelberg institute for ... True http://arxiv.org/abs/0811.2055v2 gpu based interactive visualization of billion...
1 3.0 2007-01-10 [bielefeld university, washington university i... True http://arxiv.org/abs/0707.0808v1 the cyborg astrobiologist porting from a weara...
2 12.0 2008-09-20 [university of california berkeley, university... True http://arxiv.org/abs/0706.4108v1 event weighted tests for detecting periodicity...
3 2.0 2008-11-01 [massachusetts institute of technology] True http://arxiv.org/abs/0706.4048v1 getting more from your multicore exploiting op...
4 0.0 2007-01-06 [harvard university] True http://arxiv.org/abs/cs/0701035v1 finding astronomical communities through co re...

In [266]:
mak_df.loc[~pd.isnull(mak_df["citations"]),"citations"].describe()


Out[266]:
count    122198.000000
mean         10.942757
std          82.726236
min           0.000000
25%           0.000000
50%           1.000000
75%           7.000000
max       15810.000000
Name: citations, dtype: float64

In [267]:
grid_full = pd.read_csv("/Users/hep/Downloads/grid20170810/grid.csv",low_memory=False)
grid_address = pd.read_csv("/Users/hep/Downloads/grid20170810/full_tables/addresses.csv",low_memory=False)
grid_alias = pd.read_csv("/Users/hep/Downloads/grid20170810/full_tables/aliases.csv",low_memory=False)

# Attach addresses and aliases to the core GRID table, keyed on the GRID ID
grid_df = grid_full.join(grid_address.set_index(keys=["grid_id"]),on="ID")
grid_df = grid_df.join(grid_alias.set_index(keys=["grid_id"]),on="ID")
grid_df = grid_df[["Name","lat","lng","ID","alias"]]
grid_df.head()


Out[267]:
Name lat lng ID alias
0 Australian National University -35.277800 149.120500 grid.1001.0 NaN
1 Monash University -37.908300 145.138000 grid.1002.3 NaN
2 University of Queensland -27.495964 153.009627 grid.1003.2 NaN
3 Macquarie University -33.775259 151.112915 grid.1004.5 NaN
4 UNSW Australia -33.917731 151.230964 grid.1005.4 University of New South Wales

In [268]:
#_______________________
class ComboFuzzer:
    def __init__(self,fuzzers):
        self.fuzzers = fuzzers
        # Define the normalisation variable in advance
        # NB: defined as inverse for speed
        self.norm = 1/np.sqrt(len(fuzzers))
    
    def combo_fuzz(self,target,candidate):
        _score = 0
        for _fuzz in self.fuzzers:
            _raw_score = (_fuzz(target,candidate)/100)
            _score += _raw_score**2
        return np.sqrt(_score)*self.norm

In [273]:
#_______________________
class LatLonGetter:
    def __init__(self,grid_df,scorer):
        self.scorer = scorer
        # Find the null aliases
        self.df = grid_df
        null_alias = pd.isnull(self.df.alias)
        not_null = self.df.loc[~null_alias]
        # Now generate the list of names + not null aliases
        alias_names = list(not_null.alias.values)
        std_names = list(grid_df.Name.values)
        self.all_possible_values = std_names + alias_names
        self.lower_possible_values = [x.lower() for x in 
                                      self.all_possible_values]
        with open("fuzzy_scores.pydict") as f:
            self.fuzzy_matches = ast.literal_eval(f.read())
#         self.fuzzy_matches = {"ibm" : ("IBM (United States)",1.),
#                               "microsoft" : ("Microsoft (United States)",1.),
#                               "xerox" : ("Xerox (United States)", 1.),
#                               "pricewaterhousecoopers" : ("PricewaterhouseCoopers (United States)",1.),
#                               "university of california berkeley": ("University of California, Berkeley",1.),
#                               "university of california santa cruz": ("University of California, Santa Cruz",1.),
#                               "linkoping university": ("Linköping University",1.),
#                               "nec" : ("NEC (United States)",1.),
#                               "university of michigan" : ("Michigan State University",1.),
#                               "google" : ("Google (United States)",1.),
#                               "yahoo" : ("Yahoo (United States)",1.),
#                               "at t" : ("AT&T (United States)",1.),
#                               "at t labs" : ("AT&T (United States",1.)}
        
    def get_latlon(self,mak_name):

        # Super-fast check to see if there is an exact match
        try:
            idx = self.lower_possible_values.index(mak_name)
            match = self.all_possible_values[idx]
            score = 1.
        # Otherwise, fuzzy match
        except ValueError:
            # If already done a fuzzy match for this
            if mak_name in self.fuzzy_matches:
                match,score = self.fuzzy_matches[mak_name]
            # Otherwise, do the fuzzy match
            else:
                match,score = fuzzy_proc.extractOne(query=mak_name,
                                                    choices=self.all_possible_values,
                                                    scorer=self.scorer)
                self.fuzzy_matches[mak_name] = (match,score)
        
        # Check whether the match was against a Name or an alias
        condition = self.df.Name == match
        if condition.sum() == 0:
            condition = self.df.alias == match
        _df = self.df.loc[condition]

        # Get the lat/lon (taking the first address if there are several)
        lat = _df["lat"].values[0]
        lon = _df["lng"].values[0]
        return (lat,lon,score)


    def process_latlons(self,mak_institutes):
        # mak_institutes is either NaN (no match) or a list of names,
        # so pd.isnull returns either a scalar bool or an array
        isnull = pd.isnull(mak_institutes)
        if isinstance(isnull, bool):
            if isnull:
                return []
        elif all(isnull):
            return []
        return [self.get_latlon(mak_name)
                for mak_name in mak_institutes]
    
#_______________________
# Fuzzy combination of partial ratio and token sort ratio
cf = ComboFuzzer([fuzz.token_sort_ratio,fuzz.partial_ratio])
llg = LatLonGetter(grid_df=grid_df,scorer=cf.combo_fuzz)

print("Already got",len(llg.fuzzy_matches),"matches")


Already got 41 matches

In [275]:
mak_df["lat_lon_score"] = [llg.process_latlons(insts) for insts in mak_df["institutes"]]
with open("fuzzy_scores.pydict","w") as f:
    print("writing",len(llg.fuzzy_matches))
    f.write(str(llg.fuzzy_matches))


writing 2028

In [277]:
mak_df.to_json('/Users/hep/Downloads/CS_STATS_id_title_tag_MAK-matched_GRID-matched.json')

In [278]:
mak_df.columns


Out[278]:
Index(['citations', 'date', 'institutes', 'matched', 'pid', 'title',
       'lat_lon_score'],
      dtype='object')

In [279]:
pd.read_json('/Users/hep/Nesta/coll_int_ai_case/notebooks/MAK_disambiguate/modules/CS_STATS_id_title_tag_MAK-matched.json').columns


Out[279]:
Index(['citations', 'date', 'institutes', 'matched', 'pid', 'title'], dtype='object')

In [282]:
mak_df.loc[mak_df.matched == True].head()


Out[282]:
citations date institutes matched pid title lat_lon_score
0 12.0 2008-11-13 [max planck society, heidelberg institute for ... True http://arxiv.org/abs/0811.2055v2 gpu based interactive visualization of billion... [(48.141292, 11.581925, 1.0), (49.415617, 8.73...
1 3.0 2007-01-10 [bielefeld university, washington university i... True http://arxiv.org/abs/0707.0808v1 the cyborg astrobiologist porting from a weara... [(52.037778, 8.493056, 1.0), (38.649033, -90.3...
2 12.0 2008-09-20 [university of california berkeley, university... True http://arxiv.org/abs/0706.4108v1 event weighted tests for detecting periodicity... [(37.872162, -122.258572, 1.0), (52.355792, 4....
3 2.0 2008-11-01 [massachusetts institute of technology] True http://arxiv.org/abs/0706.4048v1 getting more from your multicore exploiting op... [(42.35982, -71.09211, 1.0)]
4 0.0 2007-01-06 [harvard university] True http://arxiv.org/abs/cs/0701035v1 finding astronomical communities through co re... [(42.377053, -71.116657, 1.0)]

Output from MAK and GRID matching

Method

Matching to MAK

MAK can be queried by concatenating OR-statements together. A single MAK query can return no more than 1000 results, so we batch the titles into chunks of 800 sub-queries per call (as in process_titles above). We use the paper titles from arXiv for the matching, prepared by the following procedure (a short demonstration follows the list):

  1. Treat characters from any alphabet (including non-Latin characters) as non-symbolic.
  2. Replace all symbolic characters with spaces.
  3. Ensure that no more than one space separates the remaining characters.
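
A quick demonstration of title_processor on hypothetical titles (not drawn from the dataset):

title_processor("Deep Learning for Radio Galaxy Classification: A Survey")
# → 'deep learning for radio galaxy classification a survey'
title_processor("Erdős–Rényi graphs: a survey")
# → 'erdős rényi graphs a survey' (accented characters are kept; symbols become spaces)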

This procedure yields a 90% match rate. The misses are likely to be papers whose published title differs from the one presented on arXiv, or papers that have not been published in a journal at all. It may be possible to recover some of these missing 10% of papers in the future, for example by matching other paper credentials, although this is currently not a limiting factor in our analysis.

Matching to GRID

The GRID dataset contains institute names, aliases (where applicable), and a corresponding geospatial coordinate (latitude and longitude) for each institute. Each institute name from MAK is matched to the comprehensive list from GRID in the following manner:

  1. If there is an exact match amongst the institute names or aliases, then extract the coordinates of this match. Assign a "score" of 1 to this match (see step 3 for the definition of "score").
  2. Otherwise, check whether a fuzzy match has previously been found for this name. If so, extract the coordinates and score of that match.
  3. Otherwise, calculate a matching score for the MAK name by combining the scores of several fuzzy-matching algorithms in quadrature: $$ \frac{1}{\sqrt{N}} \sqrt{ \sum_{n=1}^{N} F_{n}(w_{MAK},W_{GRID})^{2} } $$

where $N$ is the number of fuzzy-matching algorithms used, $F_{n}$ returns a fuzzy-matching score (in the range $0 \rightarrow 1$) from the $n^{\text{th}}$ algorithm, $w_{MAK}$ is the name from MAK to be matched and $W_{GRID}$ is the comprehensive list of institutes in the GRID data.
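
For example, with $N = 2$ and hypothetical per-algorithm scores of $0.9$ and $0.6$, the combined score would be $\frac{1}{\sqrt{2}}\sqrt{0.9^{2} + 0.6^{2}} \approx 0.76$; a perfect match under every algorithm yields exactly 1.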

I currently use the token_sort_ratio and partial_ratio algorithms implemented in the fuzzywuzzy module.
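
As a minimal sketch using the ComboFuzzer defined above (the institute names here are hypothetical examples, not taken from the data):

cf.combo_fuzz("harvard university", "harvard university")
# → 1.0: identical strings score 100 under both algorithms
cf.combo_fuzz("mit", "massachusetts institute of technology")
# → well below 1.0: a poor token-level match under both algorithms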

Fields

| field | source | description |
| --- | --- | --- |
| citations | MAK | number of citations |
| date | MAK | date of publication |
| matched | joel | flag indicating a successful match between arXiv and MAK |
| pid | arXiv | arXiv publication ID, for matching back to arXiv data |
| title | joel | the normalised publication title, used for matching to MAK |
| institutes | MAK | list of institutes from successful matches between arXiv and MAK |
| lat_lon_score | GRID / joel | a list of triplets with a one-to-one correspondence with institutes: the first two fields are, respectively, latitude and longitude; the third field is the best fuzzy-matching score between GRID and MAK institutes |

It is generally recommended to use only institutes with scores of 1, which covers 80% of individual institute-paper matches. Combined with the 90% MAK match rate above, the method therefore yields an approximate overall efficiency of 72%, although there are known issues with the GRID matching procedure which lead to a very small number of false matches.


In [289]:
condition = mak_df.lat_lon_score.apply(lambda x: all(_x[2] == 1.0 for _x in x) and len(x) > 0)

In [290]:
(condition & (mak_df.matched == True)).sum()


Out[290]:
74752

In [291]:
(mak_df.matched == True).sum()


Out[291]:
122198

In [292]:
74752/122198


Out[292]:
0.6117285061948641

In [299]:
sum(1 for lat,lon,score in list(mak_df.lat_lon_score.values) if score == 1. )


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-299-46108cea6e97> in <module>()
----> 1 sum(1 for lat,lon,score in list(mak_df.lat_lon_score.values) if score == 1. )

<ipython-input-299-46108cea6e97> in <genexpr>(.0)
----> 1 sum(1 for lat,lon,score in list(mak_df.lat_lon_score.values) if score == 1. )

ValueError: not enough values to unpack (expected 3, got 2)

In [305]:
n_good = 0
n_total = 0
for row in mak_df.lat_lon_score.values:
    for lat, lon, score in row:
        n_total += 1
        if score == 1.0:
            n_good += 1
print(n_good/n_total)


0.8080228249255462

In [ ]: