Faraday Institution MSM Fast Start affiliates SCOPUS miner



In [1]:

    
import pandas as pd, json

Affiliate list



In [12]:

    
# Affiliates to look up, one (first name, last name, university) row each.
_member_rows = [
    ('Denes', 'Csala', 'Lancaster University'),
    ('Harry', 'Hoster', 'Lancaster University'),
    ('Gregory', 'Offer', 'Imperial College London'),
    ('Monica', 'Marinescu', 'Imperial College London'),
    ('Billy', 'Wu', 'Imperial College London'),
    ('Aron', 'Walsh', 'Imperial College London'),
    ('Sam', 'Cooper', 'Imperial College London'),
    ('Dhammika', 'Widanage', 'University of Warwick'),
    ('Emma', 'Kendrick', 'University of Birmingham'),
    ('James', 'Marco', 'University of Warwick'),
    ('Charles', 'Monroe', 'University of Oxford'),
    ('David', 'Howey', 'University of Oxford'),
    ('Jon', 'Chapman', 'University of Oxford'),
    ('Colin', 'Please', 'University of Oxford'),
    ('Denis', 'Kramer', 'University of Southampton'),
    ('Chris-Kriton', 'Skylaris', 'University of Southampton'),
    ('Giles', 'Richardson', 'University of Southampton'),
    ('Dan', 'Brett', 'University College London'),
    ('David', 'Scanlon', 'University College London'),
    ('Paul', 'Shearing', 'University College London'),
    ('Saiful', 'Islam', 'University of Bath'),
    ('Benjamin', 'Morgan', 'University of Bath'),
]

# Expand each row into a dict with the short keys the rest of the notebook
# reads: 'f' = first name, 'l' = last name, 'u' = university.
members = list(dict(zip(('f', 'l', 'u'), row)) for row in _member_rows)

Initialize SCOPUS API key



In [13]:

    
# Read the SCOPUS API key from a local file and build the API client.
from pyscopus.scopus import Scopus

# "key" holds the IP-based key used at the university;
# point this at "key2" instead when working from home.
key_path = "key"

# The context manager closes the file handle deterministically.
with open(key_path, "r") as key_file:
    # Strip surrounding whitespace so a trailing newline in the key file
    # does not end up inside the key string handed to the client.
    key = key_file.read().strip()

pyscopus = Scopus(key)

Retrieve SCOPUS author IDs



In [14]:

    
# Resolve every affiliate to a SCOPUS author ID through the author search.
# The ID is written onto the member dict itself under the key 's', so
# `authors` ends up holding the same dict objects as `members`.
authors = []
for member in members:
    print(member)
    query_dict = {'affil': member['u'],
                  'authfirst': member['f'],
                  'authlast': member['l']}
    author_results = pyscopus.search_author(query_dict)
    if len(author_results) > 0:
        # The first hit is taken as the author's profile.
        scopusid = author_results[0]['author_id']
        if len(author_results) > 1:
            # Make the ambiguity visible instead of resolving it silently.
            print('WARNING: %d SCOPUS profiles matched %s %s; using the first (%s)'
                  % (len(author_results), member['f'], member['l'], scopusid))
    else:
        # Placeholder ID stored when the search returns nothing.
        scopusid = '0000'
        print('WARNING: no SCOPUS profile found for %s %s; storing placeholder ID %s'
              % (member['f'], member['l'], scopusid))
    member['s'] = scopusid
    authors.append(member)









    



{'u': 'Lancaster University', 'l': 'Csala', 'f': 'Denes'}
A total number of  1  records for the query.
            affiliation    author_id  document_count         name
0  Lancaster University  56223929700               6  Denes Csala
{'u': 'Lancaster University', 'l': 'Hoster', 'f': 'Harry'}
A total number of  1  records for the query.
            affiliation   author_id  document_count                name
0  Lancaster University  6701895930              87  Harry Ernst Hoster
{'u': 'Imperial College London', 'l': 'Offer', 'f': 'Gregory'}
A total number of  1  records for the query.
               affiliation    author_id  document_count                 name
0  Imperial College London  23098060000              72  Gregory James Offer
{'u': 'Imperial College London', 'l': 'Marinescu', 'f': 'Monica'}
A total number of  1  records for the query.
               affiliation    author_id  document_count                 name
0  Imperial College London  36701631300              17  Monica M. Marinescu
{'u': 'Imperial College London', 'l': 'Wu', 'f': 'Billy'}
A total number of  2  records for the query.
               affiliation    author_id  document_count      name
0  Imperial College London   7403591460              17  Billy Wu
1  Imperial College London  56767341500               2  Billy Wu
{'u': 'Imperial College London', 'l': 'Walsh', 'f': 'Aron'}
A total number of  1  records for the query.
               affiliation    author_id  document_count        name
0  Imperial College London  35315151400             268  Aron Walsh
{'u': 'Imperial College London', 'l': 'Cooper', 'f': 'Sam'}
A total number of  1  records for the query.
               affiliation    author_id  document_count        name
0  Imperial College London  56167913500               7  Sam Cooper
{'u': 'University of Warwick', 'l': 'Widanage', 'f': 'Dhammika'}
A total number of  1  records for the query.
                 affiliation    author_id  document_count  \
0  The University of Warwick  16317933200              26   

                   name  
0  W. Dhammika Widanage  
{'u': 'University of Birmingham', 'l': 'Kendrick', 'f': 'Emma'}
A total number of  1  records for the query.
                affiliation   author_id  document_count           name
0  University of Birmingham  9844777200              53  Emma Kendrick
{'u': 'University of Warwick', 'l': 'Marco', 'f': 'James'}
A total number of  1  records for the query.
                 affiliation    author_id  document_count            name
0  The University of Warwick  16305142700              80  James G. Marco
{'u': 'University of Oxford', 'l': 'Monroe', 'f': 'Charles'}
A total number of  1  records for the query.
            affiliation   author_id  document_count               name
0  University of Oxford  7006243205              60  Charles W. Monroe
{'u': 'University of Oxford', 'l': 'Howey', 'f': 'David'}
A total number of  1  records for the query.
            affiliation    author_id  document_count            name
0  University of Oxford  34267561800              75  David A. Howey
{'u': 'University of Oxford', 'l': 'Chapman', 'f': 'Jon'}
A total number of  2  records for the query.
            affiliation    author_id  document_count                 name
0  University of Oxford   7403045089             143  S. Jonathan Chapman
1  University of Oxford  57196046040               1          Jon Chapman
{'u': 'University of Oxford', 'l': 'Please', 'f': 'Colin'}
A total number of  1  records for the query.
            affiliation   author_id  document_count          name
0  University of Oxford  7003528228             109  Colin Please
{'u': 'University of Southampton', 'l': 'Kramer', 'f': 'Denis'}
A total number of  1  records for the query.
                 affiliation   author_id  document_count          name
0  University of Southampton  7203031715              39  Denis Kramer
{'u': 'University of Southampton', 'l': 'Skylaris', 'f': 'Chris-Kriton'}
A total number of  1  records for the query.
                 affiliation   author_id  document_count  \
0  University of Southampton  6603502266              89   

                    name  
0  Chris Kriton Skylaris  
{'u': 'University of Southampton', 'l': 'Richardson', 'f': 'Giles'}
A total number of  1  records for the query.
                 affiliation   author_id  document_count                 name
0  University of Southampton  7202431002              50  Giles W. Richardson
{'u': 'University College London', 'l': 'Brett', 'f': 'Dan'}
A total number of  2  records for the query.
  affiliation    author_id  document_count            name
0         UCL  13805905800             209  Dan J.L. Brett
1         UCL  57197715236               1  Dan J.L. Brett
{'u': 'University College London', 'l': 'Scanlon', 'f': 'David'}
A total number of  1  records for the query.
  affiliation    author_id  document_count              name
0         UCL  16647397400             145  David O. Scanlon
{'u': 'University College London', 'l': 'Shearing', 'f': 'Paul'}
A total number of  1  records for the query.
  affiliation    author_id  document_count              name
0         UCL  24178516700             145  Paul R. Shearing
{'u': 'University of Bath', 'l': 'Islam', 'f': 'Saiful'}
A total number of  1  records for the query.
          affiliation    author_id  document_count             name
0  University of Bath  55547120924              96  M. Saiful Islam
{'u': 'University of Bath', 'l': 'Morgan', 'f': 'Benjamin'}
A total number of  1  records for the query.
          affiliation    author_id  document_count                  name
0  University of Bath  55174608300              50  Benjamin John Morgan



In [15]:

    
# Display the affiliates as a table: one row per entry in `authors`,
# one column per dict key.
pd.DataFrame(authors)









    Out[15]:






  
    
      
      f
      l
      s
      u
    
  
  
    
      0
      Denes
      Csala
      56223929700
      Lancaster University
    
    
      1
      Harry
      Hoster
      6701895930
      Lancaster University
    
    
      2
      Gregory
      Offer
      23098060000
      Imperial College London
    
    
      3
      Monica
      Marinescu
      36701631300
      Imperial College London
    
    
      4
      Billy
      Wu
      7403591460
      Imperial College London
    
    
      5
      Aron
      Walsh
      35315151400
      Imperial College London
    
    
      6
      Sam
      Cooper
      56167913500
      Imperial College London
    
    
      7
      Dhammika
      Widanage
      16317933200
      University of Warwick
    
    
      8
      Emma
      Kendrick
      9844777200
      University of Birmingham
    
    
      9
      James
      Marco
      16305142700
      University of Warwick
    
    
      10
      Charles
      Monroe
      7006243205
      University of Oxford
    
    
      11
      David
      Howey
      34267561800
      University of Oxford
    
    
      12
      Jon
      Chapman
      7403045089
      University of Oxford
    
    
      13
      Colin
      Please
      7003528228
      University of Oxford
    
    
      14
      Denis
      Kramer
      7203031715
      University of Southampton
    
    
      15
      Chris-Kriton
      Skylaris
      6603502266
      University of Southampton
    
    
      16
      Giles
      Richardson
      7202431002
      University of Southampton
    
    
      17
      Dan
      Brett
      13805905800
      University College London
    
    
      18
      David
      Scanlon
      16647397400
      University College London
    
    
      19
      Paul
      Shearing
      24178516700
      University College London
    
    
      20
      Saiful
      Islam
      55547120924
      University of Bath
    
    
      21
      Benjamin
      Morgan
      55174608300
      University of Bath

Retrieve SCOPUS publication IDs for each author



In [16]:

    
# Attach each author's SCOPUS publication list to the author record under
# the key 'pubs'. The records are updated in place, so `pubs` collects the
# same dict objects that `authors` holds.
pubs = []
for author in authors:
    author.update(pubs=pyscopus.search_author_publication(author['s']))
    pubs.append(author)









    



A toal number of  6  records for author  56223929700
A toal number of  87  records for author  6701895930
A toal number of  72  records for author  23098060000
A toal number of  17  records for author  36701631300
A toal number of  17  records for author  7403591460
A toal number of  268  records for author  35315151400
A toal number of  7  records for author  56167913500
A toal number of  26  records for author  16317933200
A toal number of  53  records for author  9844777200
A toal number of  80  records for author  16305142700
A toal number of  60  records for author  7006243205
A toal number of  75  records for author  34267561800
A toal number of  143  records for author  7403045089
A toal number of  109  records for author  7003528228
A toal number of  39  records for author  7203031715
A toal number of  89  records for author  6603502266
A toal number of  50  records for author  7202431002
A toal number of  209  records for author  13805905800
A toal number of  145  records for author  16647397400
A toal number of  145  records for author  24178516700
A toal number of  96  records for author  55547120924
A toal number of  50  records for author  55174608300

Retrieve abstracts for each publication ID for each author



In [17]:

    
import json



In [15]:

    
# Snapshot the per-author publication lists to disk as JSON.
# `open` in a `with` block replaces the Python-2-only `file()` builtin and
# guarantees the handle is flushed and closed once the write finishes.
with open('f1.json', 'w') as json_out:
    json_out.write(json.dumps(pubs))



In [18]:

    
# For every author, collect title, date, journal and abstract text of each
# publication whose cover-date year passes the `minyear` filter. The result
# is stored on the author record under the key 'abs'.
abstracts = []
# Exclusive lower bound: the comparison below is `>`, so publications from
# 2011 itself are skipped.
minyear = 2011
for i in pubs:
    print('%s %s' % (i['f'], i['l']))
    p = []
    for j in i['pubs']:
        # The first four characters of cover_date are read as the year.
        if int(j['cover_date'][:4]) > minyear:
            record = {'title': j['title'],
                      'date': j['cover_date'],
                      'journal': j['publication_name']}
            try:
                record['abstract'] = pyscopus.retrieve_abstract(
                    j['scopus_id'], show=False)['text']
            except Exception:
                # Best effort: keep the publication with an empty abstract
                # when retrieval fails. Catching Exception (not a bare
                # except) lets KeyboardInterrupt still stop a long run.
                record['abstract'] = ''
            p.append(record)
    i['abs'] = p
    abstracts.append(i)









    



Denes Csala
Fail to find abstract!
Harry Hoster
Fail to find abstract!
Fail to find abstract!
Gregory Offer
Monica Marinescu
Billy Wu
Aron Walsh
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Sam Cooper
Dhammika Widanage
Emma Kendrick
James Marco
Fail to find abstract!
Charles Monroe
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
David Howey
Fail to find abstract!
Fail to find abstract!
Jon Chapman
Fail to find abstract!
Colin Please
Denis Kramer
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Chris-Kriton Skylaris
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Giles Richardson
Dan Brett
Fail to find abstract!
Fail to find abstract!
David Scanlon
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Fail to find abstract!
Paul Shearing
Fail to find abstract!
Saiful Islam
Benjamin Morgan
Fail to find abstract!



In [19]:

    
# Build shallow copies of the author records without the bulky 'pubs'
# entry; the records in `abstracts` themselves are left untouched.
abs2 = []
for record in abstracts:
    trimmed = record.copy()
    del trimmed['pubs']
    abs2.append(trimmed)



In [83]:

    
# Index the trimmed author records by last name, keeping only the first
# record seen for each name and printing every name as it is added.
# NOTE(review): the key is the surname alone, so two affiliates sharing a
# surname would collapse into one entry.
abs3 = {}
for record in abs2:
    surname = record['l']
    if surname in abs3:
        continue
    print(surname)
    abs3[surname] = record









    



Csala
Hoster
Offer
Marinescu
Wu
Walsh
Cooper
Widanage
Kendrick
Marco
Monroe
Howey
Chapman
Please
Kramer
Skylaris
Richardson
Brett
Scanlon
Shearing
Islam
Morgan



In [89]:

    
# Simple WordCloud
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
import random
from wordcloud import WordCloud, STOPWORDS



In [96]:

    
# Collapse each author's abstracts into one text blob keyed by last name.
# Every abstract is prefixed with a single space, so a non-empty blob starts
# with a space and an author without abstracts maps to ''.
abs4 = {}
for surname, record in abs3.items():
    abs4[surname] = ''.join(' ' + entry['abstract'] for entry in record['abs'])



In [131]:

    
# Render one word cloud per affiliate from the concatenated abstract text.

# Extra stopwords added on top of the wordcloud package defaults.
# NOTE(review): the original literal listed 'inf' twice; a second token may
# have been intended — confirm.
sw2 = {'inf'}
# The combined stopword set does not change between iterations, so build it once.
cloud_stopwords = STOPWORDS.union(sw2)

for i in abs4:
    print(i)
    text = abs4[i]
    try:
        wordcloud = WordCloud(relative_scaling=0.1, background_color="white",
                              stopwords=cloud_stopwords).generate(text)
    except ValueError:
        # generate() raises ValueError when no words are left to plot (for
        # example, an affiliate with no abstract text); skip that affiliate
        # instead of aborting the remaining plots.
        print('No usable abstract text for %s; skipping word cloud' % i)
        continue
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

	f	l	s	u
0	Denes	Csala	56223929700	Lancaster University
1	Harry	Hoster	6701895930	Lancaster University
2	Gregory	Offer	23098060000	Imperial College London
3	Monica	Marinescu	36701631300	Imperial College London
4	Billy	Wu	7403591460	Imperial College London
5	Aron	Walsh	35315151400	Imperial College London
6	Sam	Cooper	56167913500	Imperial College London
7	Dhammika	Widanage	16317933200	University of Warwick
8	Emma	Kendrick	9844777200	University of Birmingham
9	James	Marco	16305142700	University of Warwick
10	Charles	Monroe	7006243205	University of Oxford
11	David	Howey	34267561800	University of Oxford
12	Jon	Chapman	7403045089	University of Oxford
13	Colin	Please	7003528228	University of Oxford
14	Denis	Kramer	7203031715	University of Southampton
15	Chris-Kriton	Skylaris	6603502266	University of Southampton
16	Giles	Richardson	7202431002	University of Southampton
17	Dan	Brett	13805905800	University College London
18	David	Scanlon	16647397400	University College London
19	Paul	Shearing	24178516700	University College London
20	Saiful	Islam	55547120924	University of Bath
21	Benjamin	Morgan	55174608300	University of Bath