Note: This work is based on http://kitchingroup.cheme.cmu.edu/blog/2015/04/03/Getting-data-from-the-Scopus-API/, and the credit for the original information goes to its author. I have mainly reorganized it in a way that is more useful to me.
Access to the Scopus API is restricted. First, you need an Elsevier API key. You can obtain one from http://dev.elsevier.com/myapikey.html.
In my case, because I access through a proxy (and a VPN, although that is transparent), I also need a PROXY_URL. Neither piece of information is in the repository because they are personal and private. You should create your own my_scopus.py file to run this code without changes.
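For reference, a minimal my_scopus.py would look like the sketch below; the values shown are placeholders, not real credentials.

# my_scopus.py -- personal settings, kept out of the repository
MY_API_KEY = "0123456789abcdef0123456789abcdef"            # your Elsevier API key
PROXY_URL = "http://user:password@proxy.example.org:8080"  # only needed if you go through a proxy
MY_AUTHOR_ID = "55555555555"                               # your Scopus author id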
In [1]:
import requests
import json
from my_scopus import MY_API_KEY, PROXY_URL, MY_AUTHOR_ID
First, we define a utility function to pretty-print JSON responses, and then the function that actually accesses the information through the API.
In [2]:
def print_json(resp_json):
    print(json.dumps(resp_json,
                     sort_keys=True,
                     indent=4, separators=(',', ': ')))
In [3]:
def scopus_get_info_api(url, proxy=PROXY_URL, *, verbose=False, json=True):
    """
    Return the information obtained from the Elsevier API.
    """
    proxies = {
        "http": proxy
    }
    resp = requests.get("http://api.elsevier.com/content/" + url,
                        headers={'Accept': 'application/json',
                                 'X-ELS-APIKey': MY_API_KEY},
                        proxies=proxies)
    if verbose:
        print_json(resp.json())
    if json:
        return resp.json()
    else:
        return resp.text.encode('utf-8')
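As a quick check (a hypothetical call, not one of the original cells), any endpoint path can be passed directly, and verbose=True dumps the raw JSON for inspection:

# hypothetical example: dump the raw JSON of the author endpoint that is wrapped below
data = scopus_get_info_api("author?author_id={}".format(MY_AUTHOR_ID), verbose=True)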
Next, a function that returns the information of an author.
In [4]:
def scopus_get_author(author_id):
    """Return the metrics information for the given Scopus author id."""
    msg = "author?author_id={}&view=metrics".format(author_id)
    resp = scopus_get_info_api(msg)
    return resp['author-retrieval-response'][0]
For example, to obtain my h-index:
In [5]:
author_info = scopus_get_author(MY_AUTHOR_ID)
print_json(author_info)
h_index = author_info['h-index']
print("My automatic h_index is {}".format(h_index))
Now, we are going to extract the list of published papers.
In [6]:
def scopus_search_list(query, field, max=100, *, debug=False):
    """Return the list of raw entries returned by a Scopus search."""
    msg = "search/scopus?query={}&nofield={}&count={}".format(query, field, max)
    if debug:
        print_json(scopus_get_info_api(msg))
    resp = scopus_get_info_api(msg)['search-results']
    entries = []
    if resp['entry']:
        entries = resp['entry']
    return entries
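For example (a sketch reusing the AU-ID query that scopus_papers_from_author builds below), the raw entries for one author could be fetched like this:

# hypothetical usage: at most 25 raw search entries for one author
entries = scopus_search_list("AU-ID({})".format(MY_AUTHOR_ID), "dc:identifier", max=25)
print(len(entries))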
In [7]:
def extract_info_papers(entries):
    """Extract the relevant fields of each entry returned by a Scopus search."""
    def get_type(code):
        # map Scopus document subtypes to a coarser classification
        if code in ['ar', 're', 'ed', 'ip']:
            return 'article'
        elif code == 'cp':
            return 'congress'
        else:
            return code

    return [{'id': info['dc:identifier'],
             'title': info['dc:title'],
             'url': info['prism:url'],
             'citations': int(info['citedby-count']),
             'type': get_type(info['subtype']),
             'year': info['prism:coverDate'][:4],
             'journal': info['prism:publicationName']} for info in entries]
In [8]:
def scopus_papers_from_author(author_id, *, max=100):
    """
    Return the list of papers from the author.
    """
    query = "AU-ID({})".format(author_id)
    field = "dc:identifier"
    papers = scopus_search_list(query, field, max)
    # print_json(papers)
    return extract_info_papers(papers)
In [9]:
papers = scopus_papers_from_author(MY_AUTHOR_ID)
print('{} papers'.format(len(papers)))
In [10]:
import pandas as pd
In [11]:
df = pd.DataFrame.from_dict(papers)
print(df.head())
In [12]:
papers_journal = df[df['type'] == 'article']
citations = papers_journal.groupby(['year'])['citations'].sum()
In [13]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
In [14]:
ax = citations.plot(kind='bar', legend=None)
ax.set_xlabel('Year')
ax.set_ylabel('Citations')
Out[14]: (bar chart of citations per year for the journal papers)
In [15]:
def get_scopus_info(SCOPUS_ID):
    """Return a formatted citation string for the given Scopus id."""
    url = ("abstract/scopus_id/"
           + SCOPUS_ID
           + "?field=authors,title,publicationName,volume,issueIdentifier,"
           + "prism:pageRange,coverDate,article-number,doi,citedby-count,prism:aggregationType")
    resp = scopus_get_info_api(url, json=True)
    results = resp['abstracts-retrieval-response']
    authors_info = results['authors']
    info = results['coredata']
    fstring = '{authors}, {title}, {journal}, {volume}, {articlenum}, ({date}). {doi} (cited {cites} times).\n'
    return fstring.format(authors=', '.join([au['ce:indexed-name'] for au in authors_info['author']]),
                          title=info['dc:title'],
                          journal=info['prism:publicationName'],
                          volume=info.get('prism:volume') or 1,
                          articlenum=info.get('prism:pageRange') or
                                     info.get('article-number'),
                          date=info['prism:coverDate'],
                          doi='doi:' + (info.get('prism:doi') or 'NA'),
                          cites=int(info['citedby-count']))
df_lasts = df[df['year'] == '2015']
for paper_id in df.sort_values(['citations'], ascending=False)['id']:
    # print("id: '{}'".format(paper_id))
    print(get_scopus_info(paper_id))
In [16]:
def get_author_info(paper_id):
    """Return the list of indexed author names of the given paper."""
    url = ("abstract/scopus_id/"
           + paper_id
           + "?field=authors,title,publicationName,volume,issueIdentifier,"
           + "prism:pageRange,coverDate,article-number,doi,citedby-count,prism:aggregationType")
    resp = scopus_get_info_api(url, json=True)
    results = resp['abstracts-retrieval-response']
    authors_info = results['authors']['author']
    authors_id = [au['ce:indexed-name'] for au in authors_info]
    return authors_id
In [17]:
from collections import defaultdict
def get_authors_list(papers_id):
    """Count, for each author, in how many of the given papers he or she appears."""
    number = defaultdict(int)
    for paper_id in papers_id:
        authors = get_author_info(paper_id)
        for author in authors:
            number[author] += 1
    return number
authors = get_authors_list(df['id'])
print(authors)
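The same counting can be written more compactly with collections.Counter; this is just an alternative sketch, equivalent to get_authors_list above:

from collections import Counter

def get_authors_counter(papers_id):
    # count in how many of the given papers each indexed author name appears
    counter = Counter()
    for paper_id in papers_id:
        counter.update(get_author_info(paper_id))
    return counter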
In [18]:
def show_author_list(authors):
    """Print the authors sorted by decreasing number of papers."""
    names = sorted(authors.keys(), key=lambda k: authors[k], reverse=True)
    for name in names:
        print("{}: {}".format(name, authors[name]))
In [19]:
show_author_list(authors = get_authors_list(df.id))
In [20]:
# revistas: the distinct journals in which the selected articles appeared
revistas = sorted(set(papers_journal['journal']))
for revista in revistas:
    print(revista)
In [21]:
def scopus_search_papers(words, type='ar'):
    """
    Return the list of papers matching the given keywords (published after 2010).
    """
    query = "TITLE-ABS-KEY({}) AND PUBYEAR > 2010 AND DOCTYPE({})".format(words, type)
    field = "dc:identifier"
    papers = scopus_search_list(query, field, 200)
    return extract_info_papers(papers)
In [22]:
results = scopus_search_papers("large scale optimization evolutionary")
In [23]:
papers_lsgo = pd.DataFrame.from_dict(results)
num_total = len(papers_lsgo)
print(sorted(set(papers_lsgo['year'])))
In [24]:
lsgo_journal = papers_lsgo.groupby(['journal'])[['citations']].sum()
lsgo_journal.columns = ['number']
lsgo_journal = lsgo_journal.sort_values('number', ascending=False)
lsgo_journal = lsgo_journal[lsgo_journal['number'] > 0]
print(lsgo_journal)
In [25]:
papers_lsgo = papers_lsgo.sort_values('citations', ascending=False)
papers_lsgo = papers_lsgo[papers_lsgo['citations'] > 0]
for paper_id in papers_lsgo['id'][:10]:
    print(get_scopus_info(paper_id))
Count the number of papers per author.
In [26]:
show_author_list(authors = get_authors_list(papers_lsgo.id))
In [27]:
results_cp = scopus_search_papers("large scale optimization evolutionary", type='cp')
results = pd.DataFrame.from_dict(results_cp)
show_author_list(authors = get_authors_list(results.id))
In [28]:
results_cp = scopus_search_papers("large scale optimization differential evolution", type='cp')
results = pd.DataFrame.from_dict(results_cp)
show_author_list(authors = get_authors_list(results.id))
In [29]:
results = results.sort_values(['citations'], ascending=False)
for paper_id in results.id[:10]:
    print(get_scopus_info(paper_id))