In [12]:
%matplotlib inline
import json
import os
from collections import Counter
from datetime import datetime

import numpy as np
import requests
from matplotlib import pyplot as plt
In [61]:
# Download every paper record whose author list contains 'john smith' from the
# Academic "evaluate" endpoint, one page at a time, until an empty page comes back.
headers = {
    # The key is read from the environment so it is never saved with the
    # notebook. Any key that was previously committed in this cell should be
    # treated as leaked and rotated.
    'Ocp-Apim-Subscription-Key': os.environ['ACADEMIC_API_KEY'],
    'Content-Type': 'application/x-www-form-urlencoded'
}
results = []
count = 500   # page size requested from the API
offset = 0    # index of the first record of the next page
while True:
    # A dict body lets requests URL-encode each field (quotes, spaces, '==')
    # instead of relying on a hand-concatenated string.
    query = {
        "expr": "Composite(AA.AuN=='john smith')",
        "count": count,
        "offset": offset,
        "attributes": "D,AA.AuN,AA.AuId,F.FId,J.JId,AA.AfId",
    }
    r = requests.post('https://westus.api.cognitive.microsoft.com/academic/v1.0/evaluate',
                      data=query, headers=headers)
    # Fail loudly on 401/429/5xx rather than on a confusing KeyError below.
    r.raise_for_status()
    js = r.json()
    if "entities" not in js:
        raise RuntimeError("Unexpected response without 'entities': %r" % (js,))
    entities = js["entities"]
    if not entities:
        break
    results += entities
    # Advance by what was actually received so a short page never skips records.
    offset += len(entities)
In [62]:
# Flat list of the author id of every author entry on every downloaded paper
# (an id appears once per paper it is listed on).
ids = [
    _entry["AuId"]
    for _paper in results
    for _entry in _paper["AA"]
]
In [66]:
class Author:
    """Accumulates per-paper metadata for a single author.

    Every call to ``append`` records one paper. The lists ``coauthors``,
    ``fields``, ``journals``, ``affiliations`` and ``dates`` are parallel:
    each gains exactly one entry per call, in call order.
    """

    def __init__(self, author_id):
        """Create an empty record for the author identified by ``author_id``."""
        self.author_id = author_id
        self.coauthors = []      # one list of co-author ids per paper
        self.fields = []         # one ``fields`` value per paper, stored as given
        self.journals = []       # one ``journal`` value per paper, stored as given
        self.affiliations = []   # one ``affiliation`` value per paper, stored as given
        self.dates = []          # one ``date`` value per paper, stored as given

    def append(self, coauthors, fields, journal, affiliation, date):
        """Record one paper for this author.

        Args:
            coauthors: iterable of author ids on the paper. It may include
                this author's own id, which is left out of the stored entry.
                The argument itself is not modified.
            fields: value appended to ``self.fields`` unchanged.
            journal: value appended to ``self.journals`` unchanged.
            affiliation: value appended to ``self.affiliations`` unchanged.
            date: value appended to ``self.dates`` unchanged.
        """
        # Build a fresh list instead of calling coauthors.remove(...): the
        # caller may hand the same list to several authors, and removing in
        # place would strip ids from (and alias) everyone's stored entry.
        others = [_c for _c in coauthors if _c != self.author_id]
        self.coauthors.append(others)
        self.fields.append(fields)
        self.journals.append(journal)
        self.affiliations.append(affiliation)
        self.dates.append(date)
# Group the downloaded papers by author: ``authors`` maps an author id to an
# Author object holding one entry per paper that author appears on.
authors = {}
for _row in results:
    _coauthors = [_a["AuId"] for _a in _row["AA"]]
    # "F", "J" and "D" are optional on a paper record; missing ones become None.
    _fields = [_f["FId"] for _f in _row["F"]] if "F" in _row else None
    _journal = _row["J"]["JId"] if "J" in _row else None
    _date = _row.get("D")
    for _a in _row["AA"]:
        _author_id = _a["AuId"]
        _affiliation = _a.get("AfId")   # optional per author entry
        if _author_id not in authors:
            authors[_author_id] = Author(_author_id)
        # Give each author a private copy of the id list. Passing the same
        # list object to every author would make all their stored entries
        # alias one list, and append may modify the list it receives.
        authors[_author_id].append(list(_coauthors), _fields, _journal,
                                   _affiliation, _date)
In [87]:
# Plot the greatest separation in consecutive publication dates.
# For every author with at least two dated papers, record the largest gap
# between consecutive papers, once in calendar years and once in days.
max_days = []
max_years = []
for _a in authors.values():
    # Papers without a date were stored as None and cannot be parsed; skip them.
    # Assumes dates are 'YYYY-MM-DD' strings; strptime raises ValueError
    # otherwise. NOTE(review): confirm against the API's "D" attribute format.
    _dates = sorted(
        datetime.strptime(_d, "%Y-%m-%d") for _d in _a.dates if _d is not None
    )
    if len(_dates) < 2:
        # A gap needs at least two dated papers.
        continue
    # int(...) keeps plain Python ints in the Counter printed below.
    max_years.append(int(max(np.diff([_d.year for _d in _dates]))))
    # np.diff over datetime objects yields timedelta objects.
    max_days.append(max(np.diff(_dates)).days)

c = Counter(max_years)
fig, ax = plt.subplots(figsize=(12, 4))
# One unit-wide bin per whole-year gap. The "+ 2" gives the largest observed
# gap its own bin; range(0, max) would leave it outside the histogram.
ax.hist(max_years, bins=range(0, max(max_years, default=0) + 2))
ax.set_title("Maximum number of calendar years between consecutive papers")
ax.set_xlabel("Calendar years")
ax.set_ylabel("Number of authors")
print(c)
fig, ax = plt.subplots(figsize=(12, 4))
ax.hist(max_days, bins=200)
ax.set_title("Maximum number of days between consecutive papers")
ax.set_xlabel("Days")
ax.set_ylabel("Number of authors")
Out[87]:
In [ ]: