In [239]:
import requests
import json
from alphabet_detector import AlphabetDetector
import pandas as pd
import numpy as np
from fuzzywuzzy import process as fuzzy_proc
from fuzzywuzzy import fuzz
import ast

In [30]:
ad = AlphabetDetector()
def title_processor(title):
    # Keep characters belonging to any alphabet (including non-Latin)
    # and numeric characters; replace everything else with a space
    result = "".join([x if len(ad.detect_alphabet(x)) > 0 or x.isnumeric()
                      else " " for x in title.lower()])
    # Collapse consecutive spaces into one
    while "  " in result:
        result = result.replace("  "," ")
    return result

In [117]:
headers = {
    'Ocp-Apim-Subscription-Key': '<YOUR-SUBSCRIPTION-KEY>',
    'Content-Type': 'application/x-www-form-urlencoded'
}

def process_titles(raw_titles):

    titles = [(pid,title_processor(t)) for pid,t in raw_titles]

    # MAK returns at most 1000 results per query, so batch the titles
    # into chunks of 800 OR'd sub-queries
    title_count = 800
    title_offset = 0
    query_count = 1000

    calls = 0

    data = []
    while title_offset < len(titles):

        # Safety cap on the number of API calls
        calls += 1
        if calls > 10:
            break

        last_title = title_offset+title_count
        if last_title > len(titles):
            last_title = None

        # Concatenate this batch of titles into a single OR(...) expression
        titles_subset = titles[title_offset:last_title]
        expr = ["Ti='"+t+"'" for _,t in titles_subset]
        expr = ','.join(expr)
        expr = "expr=OR("+expr+")"
        title_offset += title_count

        query = expr+"&count="+str(query_count)+"&attributes=Id,Ti,D,AA.AuN,AA.AuId,F.FId,J.JId,AA.AfId,CC,ECC,AA.AfN,J.JN"
        #print(query)

        r = requests.post('https://westus.api.cognitive.microsoft.com/academic/v1.0/evaluate',
                          data=query.encode("utf-8"), headers=headers)
        js = r.json()

        print(len(js["entities"]),len(titles))
        
        # Match each requested title against the returned entities
        for pid,t in titles_subset:
            matched = False
            for row in js["entities"]:
                if t != row["Ti"]:
                    continue
                # Deduplicated list of author affiliations, where available
                insts = list(set(author["AfN"] for author in row["AA"] if "AfN" in author))
                data.append(dict(pid=pid,title=t,institutes=insts,citations=row["CC"],date=row["D"],matched=True))
                matched = True
                break
            if not matched:
                data.append(dict(pid=pid,title=t,matched=False))

    print("Made",calls,"calls")
    return data

In [118]:
#raw_titles = [(1,"Search for invisible decays of a Higgs boson using vector-boson fusion in pp collisions at s√=8 TeV with the ATLAS detector"),
#              (2,"Muon-induced background to proton decay in the p→K+ν decay channel with large underground liquid argon TPC detectors"),
#              (3,"personalizing search via automated analysis of interests and activities")]

df = pd.read_csv("/Users/hep/Downloads/ai_id_title.csv")
raw_titles = df[["id","title"]].values
data = process_titles(raw_titles)


629 7017
758 7017
776 7017
786 7017
796 7017
797 7017
809 7017
790 7017
660 7017
Made 9 calls

In [119]:
ncite = 0
ninst = 0
nmatch = 0 
nboth = 0
for row in data:
    if not row["matched"]:
        continue
    nmatch += 1
    if row["citations"] > 0:
        ncite += 1
    if len(row["institutes"]) > 0:
        ninst += 1
    if row["citations"] > 0 and len(row["institutes"]) > 0:
        nboth += 1
print(len(data),nmatch,ncite,ninst,nboth)


7017 6408 4459 5544 4117

In [120]:
with open('/Users/hep/Downloads/ai_id_title_MAK-matched.json', 'w') as fp:
    json.dump(data, fp)

In [264]:
mak_df = pd.read_json('/Users/hep/Nesta/coll_int_ai_case/notebooks/MAK_disambiguate/modules/CS_STATS_id_title_tag_MAK-matched.json')

In [265]:
mak_df.head()


Out[265]:
citations date institutes matched pid title
0 12.0 2008-11-13 [max planck society, heidelberg institute for ... True http://arxiv.org/abs/0811.2055v2 gpu based interactive visualization of billion...
1 3.0 2007-01-10 [bielefeld university, washington university i... True http://arxiv.org/abs/0707.0808v1 the cyborg astrobiologist porting from a weara...
2 12.0 2008-09-20 [university of california berkeley, university... True http://arxiv.org/abs/0706.4108v1 event weighted tests for detecting periodicity...
3 2.0 2008-11-01 [massachusetts institute of technology] True http://arxiv.org/abs/0706.4048v1 getting more from your multicore exploiting op...
4 0.0 2007-01-06 [harvard university] True http://arxiv.org/abs/cs/0701035v1 finding astronomical communities through co re...

In [266]:
mak_df.loc[~pd.isnull(mak_df["citations"]),"citations"].describe()


Out[266]:
count    122198.000000
mean         10.942757
std          82.726236
min           0.000000
25%           0.000000
50%           1.000000
75%           7.000000
max       15810.000000
Name: citations, dtype: float64

In [267]:
grid_full = pd.read_csv("/Users/hep/Downloads/grid20170810/grid.csv",low_memory=False)
grid_address = pd.read_csv("/Users/hep/Downloads/grid20170810/full_tables/addresses.csv",low_memory=False)
grid_alias = pd.read_csv("/Users/hep/Downloads/grid20170810/full_tables/aliases.csv",low_memory=False)

# Attach addresses and aliases to the core GRID table, keyed on the GRID ID
grid_df = grid_full.join(grid_address.set_index(keys=["grid_id"]),on="ID")
grid_df = grid_df.join(grid_alias.set_index(keys=["grid_id"]),on="ID")
grid_df = grid_df[["Name","lat","lng","ID","alias"]]
grid_df.head()


Out[267]:
Name lat lng ID alias
0 Australian National University -35.277800 149.120500 grid.1001.0 NaN
1 Monash University -37.908300 145.138000 grid.1002.3 NaN
2 University of Queensland -27.495964 153.009627 grid.1003.2 NaN
3 Macquarie University -33.775259 151.112915 grid.1004.5 NaN
4 UNSW Australia -33.917731 151.230964 grid.1005.4 University of New South Wales

In [268]:
#_______________________
class ComboFuzzer:
    def __init__(self,fuzzers):
        self.fuzzers = fuzzers
        # Define the normalisation variable in advance
        # NB: defined as inverse for speed
        self.norm = 1/np.sqrt(len(fuzzers))
    
    def combo_fuzz(self,target,candidate):
        _score = 0
        for _fuzz in self.fuzzers:
            _raw_score = (_fuzz(target,candidate)/100)
            _score += _raw_score**2
        return np.sqrt(_score)*self.norm

In [273]:
#_______________________
class LatLonGetter:
    def __init__(self,grid_df,scorer):
        self.scorer = scorer
        # Find the null aliases
        self.df = grid_df
        null_alias = pd.isnull(self.df.alias)
        not_null = self.df.loc[~null_alias]
        # Now generate the list of names + not null aliases
        alias_names = list(not_null.alias.values)
        std_names = list(grid_df.Name.values)
        self.all_possible_values = std_names + alias_names
        self.lower_possible_values = [x.lower() for x in 
                                      self.all_possible_values]
        with open("fuzzy_scores.pydict") as f:
            self.fuzzy_matches = ast.literal_eval(f.read())
#         self.fuzzy_matches = {"ibm" : ("IBM (United States)",1.),
#                               "microsoft" : ("Microsoft (United States)",1.),
#                               "xerox" : ("Xerox (United States)", 1.),
#                               "pricewaterhousecoopers" : ("PricewaterhouseCoopers (United States)",1.),
#                               "university of california berkeley": ("University of California, Berkeley",1.),
#                               "university of california santa cruz": ("University of California, Santa Cruz",1.),
#                               "linkoping university": ("Linköping University",1.),
#                               "nec" : ("NEC (United States)",1.),
#                               "university of michigan" : ("Michigan State University",1.),
#                               "google" : ("Google (United States)",1.),
#                               "yahoo" : ("Yahoo (United States)",1.),
#                               "at t" : ("AT&T (United States)",1.),
#                               "at t labs" : ("AT&T (United States",1.)}
        
    def get_latlon(self,mak_name):

        # Super-fast check to see if there is an exact match
        try:
            idx = self.lower_possible_values.index(mak_name)
            match = self.all_possible_values[idx]
            score = 1.
        # Otherwise, fuzzy match
        except ValueError:
            # If already done a fuzzy match for this
            if mak_name in self.fuzzy_matches:
                match,score = self.fuzzy_matches[mak_name]
            # Otherwise, do the fuzzy match
            else:
                match,score = fuzzy_proc.extractOne(query=mak_name,
                                                    choices=self.all_possible_values,
                                                    scorer=self.scorer)
                self.fuzzy_matches[mak_name] = (match,score)
        
        # Check whether the match was against a Name or an alias
        condition = self.df.Name == match
        if condition.sum() == 0:
            condition = self.df.alias == match
        _df = self.df.loc[condition]

        # Get the lat/lon (taking the first address if there are several)
        lat = _df["lat"].values[0]
        lon = _df["lng"].values[0]
        return (lat,lon,score)


    def process_latlons(self,mak_institutes):
        # mak_institutes is either NaN (no match) or a list of names,
        # so pd.isnull returns either a scalar bool or an array
        isnull = pd.isnull(mak_institutes)
        if isinstance(isnull, bool):
            if isnull:
                return []
        elif all(isnull):
            return []
        return [self.get_latlon(mak_name)
                for mak_name in mak_institutes]
    
#_______________________
# Fuzzy combination of partial ratio and token sort ratio
cf = ComboFuzzer([fuzz.token_sort_ratio,fuzz.partial_ratio])
llg = LatLonGetter(grid_df=grid_df,scorer=cf.combo_fuzz)

print("Already got",len(llg.fuzzy_matches),"matches")


Already got 41 matches

In [275]:
mak_df["lat_lon_score"] = [llg.process_latlons(insts) for insts in mak_df["institutes"]]
with open("fuzzy_scores.pydict","w") as f:
    print("writing",len(llg.fuzzy_matches))
    f.write(str(llg.fuzzy_matches))


writing 2028

In [277]:
mak_df.to_json('/Users/hep/Downloads/CS_STATS_id_title_tag_MAK-matched_GRID-matched.json')

In [278]:
mak_df.columns


Out[278]:
Index(['citations', 'date', 'institutes', 'matched', 'pid', 'title',
       'lat_lon_score'],
      dtype='object')

In [279]:
pd.read_json('/Users/hep/Nesta/coll_int_ai_case/notebooks/MAK_disambiguate/modules/CS_STATS_id_title_tag_MAK-matched.json').columns


Out[279]:
Index(['citations', 'date', 'institutes', 'matched', 'pid', 'title'], dtype='object')

In [282]:
mak_df.loc[mak_df.matched == True].head()


Out[282]:
citations date institutes matched pid title lat_lon_score
0 12.0 2008-11-13 [max planck society, heidelberg institute for ... True http://arxiv.org/abs/0811.2055v2 gpu based interactive visualization of billion... [(48.141292, 11.581925, 1.0), (49.415617, 8.73...
1 3.0 2007-01-10 [bielefeld university, washington university i... True http://arxiv.org/abs/0707.0808v1 the cyborg astrobiologist porting from a weara... [(52.037778, 8.493056, 1.0), (38.649033, -90.3...
2 12.0 2008-09-20 [university of california berkeley, university... True http://arxiv.org/abs/0706.4108v1 event weighted tests for detecting periodicity... [(37.872162, -122.258572, 1.0), (52.355792, 4....
3 2.0 2008-11-01 [massachusetts institute of technology] True http://arxiv.org/abs/0706.4048v1 getting more from your multicore exploiting op... [(42.35982, -71.09211, 1.0)]
4 0.0 2007-01-06 [harvard university] True http://arxiv.org/abs/cs/0701035v1 finding astronomical communities through co re... [(42.377053, -71.116657, 1.0)]

Output from MAK and GRID matching

Method

Matching to MAK

MAK can be queried by concatenating OR-statements together. A single MAK query can return no more than 1000 results, so we batch the titles into chunks of 800 sub-queries per call (as in process_titles above). We use the paper titles from arXiv for the matching, prepared by the following procedure (a short demonstration follows the list):

  1. Treat characters from any alphabet (including non-Latin characters) as non-symbolic.
  2. Replace all symbolic characters with spaces.
  3. Ensure that no more than one space separates the remaining characters.
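
A quick demonstration of title_processor on hypothetical titles (not drawn from the dataset):

title_processor("Deep Learning for Radio Galaxy Classification: A Survey")
# → 'deep learning for radio galaxy classification a survey'
title_processor("Erdős–Rényi graphs: a survey")
# → 'erdős rényi graphs a survey' (accented characters are kept; symbols become spaces)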

This procedure yields a 90% match rate. The misses are likely to be papers whose published title differs from the one presented on arXiv, or papers that have not been published in a journal at all. It may be possible to recover some of these missing 10% of papers in the future, for example by matching other paper credentials, although this is currently not a limiting factor in our analysis.

Matching to GRID

The GRID dataset contains institute names, aliases (where applicable), and a corresponding geospatial coordinate (latitude and longitude) for each institute. Each institute name from MAK is matched to the comprehensive list from GRID in the following manner:

  1. If there is an exact match amongst the institute names or aliases, then extract the coordinates of this match. Assign a "score" of 1 to this match (see step 3 for the definition of "score").
  2. Otherwise, check whether a fuzzy match has previously been found for this name. If so, extract the coordinates and score of that match.
  3. Otherwise, calculate a matching score for the MAK name by combining the scores of several fuzzy-matching algorithms in quadrature: $$ \frac{1}{\sqrt{N}} \sqrt{ \sum_{n=1}^{N} F_{n}(w_{MAK},W_{GRID})^{2} } $$

where $N$ is the number of fuzzy-matching algorithms used, $F_{n}$ returns a fuzzy-matching score (in the range $0 \rightarrow 1$) from the $n^{\text{th}}$ algorithm, $w_{MAK}$ is the name from MAK to be matched and $W_{GRID}$ is the comprehensive list of institutes in the GRID data.
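
For example, with $N = 2$ and hypothetical per-algorithm scores of $0.9$ and $0.6$, the combined score would be $\frac{1}{\sqrt{2}}\sqrt{0.9^{2} + 0.6^{2}} \approx 0.76$; a perfect match under every algorithm yields exactly 1.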

I currently use the token_sort_ratio and partial_ratio algorithms implemented in the fuzzywuzzy module.
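
As a minimal sketch using the ComboFuzzer defined above (the institute names here are hypothetical examples, not taken from the data):

cf.combo_fuzz("harvard university", "harvard university")
# → 1.0: identical strings score 100 under both algorithms
cf.combo_fuzz("mit", "massachusetts institute of technology")
# → well below 1.0: a poor token-level match under both algorithms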

Fields

| field | source | description |
| --- | --- | --- |
| citations | MAK | number of citations |
| date | MAK | date of publication |
| matched | joel | flag indicating a successful match between arXiv and MAK |
| pid | arXiv | arXiv publication ID, for matching back to arXiv data |
| title | joel | the normalised publication title, used for matching to MAK |
| institutes | MAK | list of institutes from successful matches between arXiv and MAK |
| lat_lon_score | GRID / joel | a list of triplets with a one-to-one correspondence with institutes: the first two fields are, respectively, latitude and longitude; the third field is the best fuzzy-matching score between GRID and MAK institutes |

It is generally recommended to use only institutes with scores of 1, which covers 80% of individual institute-paper matches. Combined with the 90% MAK match rate above, the method therefore yields an approximate overall efficiency of 72%, although there are known issues with the GRID matching procedure which lead to a very small number of false matches.


In [289]:
condition = mak_df.lat_lon_score.apply(lambda x: all(_x[2] == 1.0 for _x in x) and len(x) > 0)

In [290]:
(condition & (mak_df.matched == True)).sum()


Out[290]:
74752

In [291]:
(mak_df.matched == True).sum()


Out[291]:
122198

In [292]:
74752/122198


Out[292]:
0.6117285061948641

In [299]:
sum(1 for lat,lon,score in list(mak_df.lat_lon_score.values) if score == 1. )


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-299-46108cea6e97> in <module>()
----> 1 sum(1 for lat,lon,score in list(mak_df.lat_lon_score.values) if score == 1. )

<ipython-input-299-46108cea6e97> in <genexpr>(.0)
----> 1 sum(1 for lat,lon,score in list(mak_df.lat_lon_score.values) if score == 1. )

ValueError: not enough values to unpack (expected 3, got 2)

In [305]:
n_good = 0
n_total = 0
for row in mak_df.lat_lon_score.values:
    for lat, lon, score in row:
        n_total += 1
        if score == 1.0:
            n_good += 1
print(n_good/n_total)


0.8080228249255462

In [ ]: