In [239]:
import requests
import json
from alphabet_detector import AlphabetDetector
import pandas as pd
import numpy as np
from fuzzywuzzy import process as fuzzy_proc
from fuzzywuzzy import fuzz
import ast
In [30]:
ad = AlphabetDetector()
def title_processor(title):
    # Lowercase the title, keep only alphabetic (any alphabet) or numeric
    # characters, and replace everything else with a space
    result = "".join([x if len(ad.detect_alphabet(x)) > 0 or x.isnumeric()
                      else " " for x in title.lower()])
    # Collapse runs of whitespace into single spaces
    while "  " in result:
        result = result.replace("  ", " ")
    return result
In [117]:
headers = {
    'Ocp-Apim-Subscription-Key': 'a9a9efa851b44d5bbd6c841215a99e00',
    'Content-Type': 'application/x-www-form-urlencoded'
}
def process_titles(raw_titles):
    titles = [(pid, title_processor(t)) for pid, t in raw_titles]
    title_count = 800    # Number of title sub-queries per API call
    title_offset = 0
    query_count = 1000   # Maximum number of results per query
    calls = 0
    data = []
    while title_offset < len(titles):
        calls += 1
        # Safety cap on the number of API calls
        if calls > 10:
            break
        last_title = title_offset + title_count
        if last_title > len(titles):
            last_title = None
        titles_subset = titles[title_offset:last_title]
        # Build a single OR-expression from this batch of titles
        expr = ["Ti='"+t+"'" for _, t in titles_subset]
        expr = ','.join(expr)
        expr = "expr=OR("+expr+")"
        title_offset += title_count
        query = expr+"&count="+str(query_count)+"&attributes=Id,Ti,D,AA.AuN,AA.AuId,F.FId,J.JId,AA.AfId,CC,ECC,AA.AfN,J.JN"
        #print(query)
        r = requests.post('https://westus.api.cognitive.microsoft.com/academic/v1.0/evaluate',
                          data=query.encode("utf-8"), headers=headers)
        js = r.json()
        print(len(js["entities"]), len(titles))
        # Match each title in this batch against the returned entities
        for pid, t in titles_subset:
            matched = False
            for row in js["entities"]:
                if t != row["Ti"]:
                    continue
                insts = list(set(author["AfN"] for author in row["AA"] if "AfN" in author))
                data.append(dict(pid=pid, title=t, institutes=insts,
                                 citations=row["CC"], date=row["D"], matched=True))
                matched = True
                break
            if not matched:
                data.append(dict(pid=pid, title=t, matched=False))
    print("Made", calls, "calls")
    return data
In [118]:
#raw_titles = [(1,"Search for invisible decays of a Higgs boson using vector-boson fusion in pp collisions at √s=8 TeV with the ATLAS detector"),
#              (2,"Muon-induced background to proton decay in the p→K+ν decay channel with large underground liquid argon TPC detectors"),
#              (3,"personalizing search via automated analysis of interests and activities")]
df = pd.read_csv("/Users/hep/Downloads/ai_id_title.csv")
raw_titles = df[["id","title"]].values
data = process_titles(raw_titles)
In [119]:
ncite = 0
ninst = 0
nmatch = 0
nboth = 0
for row in data:
    if not row["matched"]:
        continue
    nmatch += 1
    if row["citations"] > 0:
        ncite += 1
    if len(row["institutes"]) > 0:
        ninst += 1
    if row["citations"] > 0 and len(row["institutes"]) > 0:
        nboth += 1
print(len(data), nmatch, ncite, ninst, nboth)
In [120]:
with open('/Users/hep/Downloads/ai_id_title_MAK-matched.json', 'w') as fp:
    json.dump(data, fp)
In [264]:
mak_df = pd.read_json('/Users/hep/Nesta/coll_int_ai_case/notebooks/MAK_disambiguate/modules/CS_STATS_id_title_tag_MAK-matched.json')
In [265]:
mak_df.head()
Out[265]:
In [266]:
mak_df.loc[~pd.isnull(mak_df["citations"]),"citations"].describe()
Out[266]:
In [267]:
grid_full = pd.read_csv("/Users/hep/Downloads/grid20170810/grid.csv",low_memory=False)
grid_address = pd.read_csv("/Users/hep/Downloads/grid20170810/full_tables/addresses.csv",low_memory=False)
grid_alias = pd.read_csv("/Users/hep/Downloads/grid20170810/full_tables/aliases.csv",low_memory=False)
grid_df = grid_full.join(grid_address.set_index(keys=["grid_id"]),on="ID")
grid_df = grid_df.join(grid_alias.set_index(keys=["grid_id"]),on="ID")
grid_df = grid_df[["Name","lat","lng","ID","alias"]]
grid_df.head()
Out[267]:
In [268]:
#_______________________
class ComboFuzzer:
    def __init__(self, fuzzers):
        self.fuzzers = fuzzers
        # Define the normalisation variable in advance
        # NB: defined as inverse for speed
        self.norm = 1/np.sqrt(len(fuzzers))

    def combo_fuzz(self, target, candidate):
        # Combine the fuzzers' scores in quadrature, normalised to [0, 1]
        _score = 0
        for _fuzz in self.fuzzers:
            _raw_score = (_fuzz(target, candidate)/100)
            _score += _raw_score**2
        return np.sqrt(_score)*self.norm
In [273]:
#_______________________
class LatLonGetter:
    def __init__(self, grid_df, scorer):
        self.scorer = scorer
        # Find the null aliases
        self.df = grid_df
        null_alias = pd.isnull(self.df.alias)
        not_null = self.df.loc[~null_alias]
        # Now generate the list of names + not-null aliases
        alias_names = list(not_null.alias.values)
        std_names = list(grid_df.Name.values)
        self.all_possible_values = std_names + alias_names
        self.lower_possible_values = [x.lower() for x in
                                      self.all_possible_values]
        # Load the cache of previous fuzzy matches from disk
        with open("fuzzy_scores.pydict") as f:
            self.fuzzy_matches = ast.literal_eval(f.read())
        # self.fuzzy_matches = {"ibm": ("IBM (United States)", 1.),
        #                       "microsoft": ("Microsoft (United States)", 1.),
        #                       "xerox": ("Xerox (United States)", 1.),
        #                       "pricewaterhousecoopers": ("PricewaterhouseCoopers (United States)", 1.),
        #                       "university of california berkeley": ("University of California, Berkeley", 1.),
        #                       "university of california santa cruz": ("University of California, Santa Cruz", 1.),
        #                       "linkoping university": ("Linköping University", 1.),
        #                       "nec": ("NEC (United States)", 1.),
        #                       "university of michigan": ("Michigan State University", 1.),
        #                       "google": ("Google (United States)", 1.),
        #                       "yahoo": ("Yahoo (United States)", 1.),
        #                       "at t": ("AT&T (United States)", 1.),
        #                       "at t labs": ("AT&T (United States)", 1.)}

    def get_latlon(self, mak_name):
        # Super-fast check to see if there is an exact match
        try:
            idx = self.lower_possible_values.index(mak_name)
            match = self.all_possible_values[idx]
            score = 1.
        # Otherwise, fuzzy match
        except ValueError:
            # If already done a fuzzy match for this
            if mak_name in self.fuzzy_matches:
                match, score = self.fuzzy_matches[mak_name]
            # Otherwise, do the fuzzy match
            else:
                match, score = fuzzy_proc.extractOne(query=mak_name,
                                                     choices=self.all_possible_values,
                                                     scorer=self.scorer)
                self.fuzzy_matches[mak_name] = (match, score)
        # Check whether the match was a Name or alias
        condition = self.df.Name == match
        if condition.sum() == 0:
            condition = self.df.alias == match
        _df = self.df.loc[condition]
        # Get the lat/lon
        lat = _df["lat"].values[0]
        lon = _df["lng"].values[0]
        return (lat, lon, score)

    def process_latlons(self, mak_institutes):
        # Unmatched papers carry a null (or all-null) institutes field
        isnull = pd.isnull(mak_institutes)
        if type(isnull) is bool:
            if isnull:
                return []
        elif all(isnull):
            return []
        return [self.get_latlon(mak_name)
                for mak_name in mak_institutes]

#_______________________
# Fuzzy combination of partial ratio and token sort ratio
cf = ComboFuzzer([fuzz.token_sort_ratio, fuzz.partial_ratio])
llg = LatLonGetter(grid_df=grid_df, scorer=cf.combo_fuzz)
print("Already got", len(llg.fuzzy_matches), "matches")
In [275]:
mak_df["lat_lon_score"] = [llg.process_latlons(insts) for insts in mak_df["institutes"]]
with open("fuzzy_scores.pydict","w") as f:
print("writing",len(llg.fuzzy_matches))
f.write(str(llg.fuzzy_matches))
In [277]:
mak_df.to_json('/Users/hep/Downloads/CS_STATS_id_title_tag_MAK-matched_GRID-matched.json')
In [278]:
mak_df.columns
Out[278]:
In [279]:
pd.read_json('/Users/hep/Nesta/coll_int_ai_case/notebooks/MAK_disambiguate/modules/CS_STATS_id_title_tag_MAK-matched.json').columns
Out[279]:
In [282]:
mak_df.loc[mak_df.matched == True].head()
Out[282]:
MAK can be queried by concatenating OR-statements together. The number of results from a MAK query can be no larger than 1000, so we nominally batch 800 title sub-queries per call (`title_count` in the code above). We use the paper titles from arXiv for the matching, prepared by the following procedure:

1. Lowercase the title.
2. Replace every character that is neither alphabetic (in any alphabet) nor numeric with a space.
3. Collapse consecutive spaces into a single space.
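For illustration, a minimal sketch of the title normalisation and the OR-expression construction, using a title from the commented-out test set above (the printed outputs are indicative, not verified):
In [ ]:
# Hypothetical usage of title_processor and the OR-expression builder
title = ("Muon-induced background to proton decay in the p→K+ν decay "
         "channel with large underground liquid argon TPC detectors")
print(title_processor(title))
# Roughly: 'muon induced background to proton decay in the p k ν decay
#           channel with large underground liquid argon tpc detectors'

# A batch of processed titles is wrapped in a single OR-expression
batch = [title_processor(title)]
expr = "expr=OR(" + ",".join("Ti='" + t + "'" for t in batch) + ")"
print(expr)  # expr=OR(Ti='muon induced background ...')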
This procedure returns a 90% match rate; the missing papers may be those whose title differs from that presented on arXiv, or those that have not been published in a journal. It may be possible to recover some of these missing 10% of papers in the future, for example by matching other paper credentials, although this is currently not a limiting factor in our analysis.
The GRID dataset contains institute names, aliases (where applicable), and a corresponding geospatial coordinate (latitude and longitude). Each institute name from MAK is matched to the comprehensive list from GRID according to the combined fuzzy-matching score

$$\text{score} = \frac{1}{\sqrt{N}}\sqrt{\sum_{n=1}^{N} F_{n}^{2}\left(w_{MAK},\, W_{GRID}\right)}$$

where $N$ is the number of fuzzy-matching algorithms used, $F_{n}()$ returns a fuzzy-matching score (in the range $0 \rightarrow 1$) from the $n^{\text{th}}$ algorithm, $w_{MAK}$ is the name from MAK to be matched, and $W_{GRID}$ is the comprehensive list of institutes in the GRID data.
I currently use the `token_sort_ratio` and `partial_ratio` algorithms implemented in the `fuzzywuzzy` module.
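For intuition, a minimal sketch of the combined score for a single hypothetical pair of names (the exact values returned by fuzzywuzzy may differ):
In [ ]:
from fuzzywuzzy import fuzz
import numpy as np

# Hypothetical MAK/GRID name pair, for illustration only
a = "university of california berkeley"
b = "University of California, Berkeley"
ts = fuzz.token_sort_ratio(a, b) / 100  # fraction in the range 0 -> 1
pr = fuzz.partial_ratio(a, b) / 100     # fraction in the range 0 -> 1
# Combine in quadrature and normalise by sqrt(N), as in ComboFuzzer
score = np.sqrt(ts**2 + pr**2) / np.sqrt(2)
print(score)  # equals 1.0 only if both algorithms match perfectly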
| field | source | description |
|---|---|---|
| citations | MAK | number of citations |
| date | MAK | date of publication |
| matched | joel | flag indicating a successful match between arXiv and MAK |
| pid | arXiv | arXiv publication ID, for matching back to arXiv data |
| title | joel | the normalised publication title, used for matching to MAK |
| institutes | MAK | list of institutes from successful matches between arXiv and MAK |
| lat_lon_score | GRID / joel | a list of triplets with a one-to-one correspondence to institutes: the first two fields are latitude and longitude respectively; the third is the best fuzzy-matching score between GRID and MAK institutes |
It is generally recommended to only use institutes with scores of 1, which is the case for 80% of individual institute-paper matches. The above method therefore yields an approximate overall efficiency of 72% (90% × 80%), although there are known issues with the GRID matching procedure which lead to a very small number of false matches.
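To illustrate the one-to-one correspondence described above, a minimal sketch (not part of the pipeline) pairing each institute with its triplet and applying the recommended score cut:
In [ ]:
# Hypothetical usage: print confidently matched institutes with coordinates
for _, row in mak_df.loc[mak_df.matched == True].head().iterrows():
    for inst, (lat, lon, score) in zip(row["institutes"], row["lat_lon_score"]):
        if score == 1.0:  # keep only exact or perfect fuzzy matches
            print(inst, lat, lon)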
In [289]:
condition = mak_df.lat_lon_score.apply(lambda x: all(_x[2] == 1.0 for _x in x) and len(x) > 0)
In [290]:
(condition & (mak_df.matched == True)).sum()
Out[290]:
In [291]:
(mak_df.matched == True).sum()
Out[291]:
In [292]:
74752/122198
Out[292]:
In [299]:
sum(1 for row in mak_df.lat_lon_score.values for lat, lon, score in row if score == 1.)
In [305]:
n_good = 0
n_total = 0
for row in mak_df.lat_lon_score.values:
    for lat, lon, score in row:
        n_total += 1
        if score == 1.0:
            n_good += 1
print(n_good/n_total)
In [ ]: