This workbook parses all of the publications listed on Energy Lancaster's Lancaster University Research Portal page and extracts keywords and topical data from abstracts using natural language processing.
In [1]:
#python dom extension functions to get class and other attributes
def getAttr(dom,cl,attr='class',el='div'):
    toreturn=[]
    divs=dom.getElementsByTagName(el)
    for div in divs:
        clarray=div.getAttribute(attr).split(' ')
        for cli in clarray:
            if cli==cl: toreturn.append(div)
    if toreturn!=[]: return toreturn
    else: return None
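As a quick sanity check, the helper can be exercised on a small in-memory snippet (the test markup below is made up, mirroring the keyword lists scraped later):
In [ ]:
#self-contained check of getAttr on a tiny snippet of markup (test string is illustrative)
import html5lib
testdom = html5lib.parse('<ul class="keywords"><li><a><span>solar</span></a></li></ul>', treebuilder="dom")
print getAttr(testdom,'keywords',el='ul')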
Get number of pages for publications
In [2]:
#open first page, parse html, get number of pages and their links
import html5lib
import urllib2
url="http://www.research.lancs.ac.uk/portal/en/organisations/energy-lancaster/publications.html"
aResp = urllib2.urlopen(url)
t = aResp.read()
dom = html5lib.parse(t, treebuilder="dom")
links=getAttr(dom,'portal_navigator_paging',el='span')[0].childNodes
nr_of_pages=int([i for i in links if i.nodeType==1][::-1][0].childNodes[0].childNodes[0].nodeValue)-1
Extract links to publications, from all pages
In [3]:
#create publist array
publist=[]
#parse publications links on all pages
for pagenr in range(nr_of_pages):
    aResp = urllib2.urlopen(url+'?page='+str(pagenr))
    t = aResp.read()
    dom = html5lib.parse(t, treebuilder="dom")
    #get html list
    htmlpublist=dom.getElementsByTagName('ol')
    #extract pub links
    for i in htmlpublist[0].childNodes:
        if i.nodeType==1:
            if i.childNodes[0].nodeType==1:
                j=i.childNodes[1].childNodes[0].childNodes[0]
                if j.nodeType==1:
                    publist.append(j.getAttribute('href'))
    print 'finished page',pagenr
In [4]:
print len(publist),'publications associated with Energy Lancaster'
In [5]:
#create dictionary
pubdict={i:{"url":i} for i in publist}
Keyword extraction, for each publication
In [7]:
for r in range(len(publist)):
    pub=publist[r]
    aResp = urllib2.urlopen(pub)
    t = aResp.read()
    dom = html5lib.parse(t, treebuilder="dom")
    #get keywords from pub page
    keywords=getAttr(dom,'keywords',el='ul')
    if keywords:
        pubdict[pub]['keywords']=[i.childNodes[0].childNodes[0].nodeValue for i in keywords[0].getElementsByTagName('a')]
    #get title from pub page
    title=getAttr(dom,'title',el='h2')
    if title:
        pubdict[pub]['title']=title[0].childNodes[0].childNodes[0].nodeValue
    #get abstract from pub page
    abstract=getAttr(dom,'rendering_researchoutput_abstractportal',el='div')
    if abstract:
        pubdict[pub]['abstract']=abstract[0].childNodes[0].childNodes[0].nodeValue
    if r%10==0: print 'processed',r,'publications...'
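Each pubdict entry now holds its url plus, where the portal page exposed them, keywords, title and abstract; a quick look at the fields captured for the first publication (which keys appear depends on the page):
In [ ]:
#inspect which fields were captured for one publication
print publist[0]
print sorted(pubdict[publist[0]].keys())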
In [8]:
#save parsed data
import json
file('pubdict.json','w').write(json.dumps(pubdict))
#load if saved previously
#pubdict=json.loads(file('pubdict.json','r').read())
Mine titles and abstracts for topics
In [ ]:
#import dependencies
import pandas as pd
from textblob import TextBlob
#spacy is used below (nlp() is called on titles, abstracts and topics), so load it here
import spacy
nlp = spacy.load('en')
In [ ]:
#run once if you need to download the nltk corpora, ignore otherwise
import nltk
nltk.download()
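If the downloader GUI is inconvenient, the individual corpora that TextBlob's noun_phrases relies on can be fetched directly (corpus names assumed from TextBlob's defaults; adjust if a different extractor is used):
In [ ]:
#download only the corpora assumed necessary for noun phrase extraction
import nltk
nltk.download('brown')
nltk.download('punkt')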
In [31]:
#get topical nouns for title and abstract using natural language processing
for i in range(len(pubdict.keys())):
    if 'title' in pubdict[pubdict.keys()[i]]:
        text=pubdict[pubdict.keys()[i]]['title']
        if text:
            #get topical nouns with textblob
            blob1 = TextBlob(text)
            keywords1=blob1.noun_phrases
            #get topical nouns with spacy
            blob2 = nlp(text)
            keywords2=[]
            for k in blob2.noun_chunks:
                keywords2.append(str(k).decode('utf8').replace(u'\n',' '))
            #create unified, unique set of topical nouns, called keywords here
            keywords=list(set(keywords2).union(set(keywords1)))
            pubdict[pubdict.keys()[i]]['title-nlp']=keywords
    if 'abstract' in pubdict[pubdict.keys()[i]]:
        text=pubdict[pubdict.keys()[i]]['abstract']
        if text:
            #get topical nouns with textblob
            blob1 = TextBlob(text)
            keywords1=blob1.noun_phrases
            #get topical nouns with spacy
            blob2 = nlp(text)
            keywords2=[]
            for k in blob2.noun_chunks:
                keywords2.append(str(k).decode('utf8').replace(u'\n',' '))
            #create unified, unique set of topical nouns, called keywords here
            keywords=list(set(keywords2).union(set(keywords1)))
            pubdict[pubdict.keys()[i]]['abstract-nlp']=keywords
    print i,',',
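To see what each extractor contributes before taking their union, a small standalone check on a made-up sentence (not taken from the portal):
In [ ]:
#compare the two noun extractors on an illustrative sentence
sample = u'Thermal energy storage for low carbon district heating networks'
print TextBlob(sample).noun_phrases
print [str(chunk) for chunk in nlp(sample).noun_chunks]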
In [32]:
#save parsed data
file('pubdict2.json','w').write(json.dumps(pubdict))
#load if saved previously
#pubdict=json.loads(file('pubdict2.json','r').read())
Save output for D3 word cloud
In [41]:
keywords=[j for i in pubdict if 'keywords' in pubdict[i] if pubdict[i]['keywords'] for j in pubdict[i]['keywords']]
titles=[pubdict[i]['title'] for i in pubdict if 'title' in pubdict[i] if pubdict[i]['title']]
abstracts=[pubdict[i]['abstract'] for i in pubdict if 'abstract' in pubdict[i] if pubdict[i]['abstract']]
title_nlp=[j for i in pubdict if 'title-nlp' in pubdict[i] if pubdict[i]['title-nlp'] for j in pubdict[i]['title-nlp']]
abstract_nlp=[j for i in pubdict if 'abstract-nlp' in pubdict[i] if pubdict[i]['abstract-nlp'] for j in pubdict[i]['abstract-nlp']]
kt=keywords+titles
kta=kt+abstracts
kt_nlp=keywords+title_nlp
kta_nlp=kt_nlp+abstract_nlp
file('keywords.json','w').write(json.dumps(keywords))
file('titles.json','w').write(json.dumps(titles))
file('abstracts.json','w').write(json.dumps(abstracts))
file('kt.json','w').write(json.dumps(kt))
file('kta.json','w').write(json.dumps(kta))
file('kt_nlp.json','w').write(json.dumps(kt_nlp))
file('kta_nlp.json','w').write(json.dumps(kta_nlp))
In [37]:
import re
def convert(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1 \2', s1).lower()
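A quick check of the splitter on illustrative inputs; e.g. 'SolarPhotovoltaics' should come out as 'solar photovoltaics' (the example strings are made up):
In [ ]:
#convert splits CamelCase/mixedCase and lowercases the result
print convert('SolarPhotovoltaics')
print convert('energyStorageSystems')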
In [49]:
kc=[convert(i) for i in keywords]
file('kc.json','w').write(json.dumps(kc))
ks=[j for i in kc for j in i.split()]
file('ks.json','w').write(json.dumps(ks))
ktc_nlp=[convert(i) for i in kt_nlp]
file('ktc_nlp.json','w').write(json.dumps(ktc_nlp))
kts_nlp=[j for i in ktc_nlp for j in i.split()]
file('kts_nlp.json','w').write(json.dumps(kts_nlp))
ktac_nlp=[convert(i) for i in kta_nlp]
file('ktac_nlp.json','w').write(json.dumps(ktac_nlp))
ktas_nlp=[j for i in ktac_nlp for j in i.split()]
file('ktas_nlp.json','w').write(json.dumps(ktas_nlp))
Having constructed three project score vectors (without title, with title, and both combined), we sort the projects by score; the highest-scoring ones are the best-matching research projects. We display links to them below, repeating the procedure for each topic.
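Before running the full loop, a toy illustration of the scoring arithmetic used below (all ranks and numbers are made up):
In [ ]:
#a query returns up to `depth` projects; the project at position p scores depth-p,
#and scores are accumulated as squares across queries
depth=100
ranks=[0,4,17]   #positions at which one hypothetical project appeared across three queries
print 'accumulated query score:',sum((depth-p)**2 for p in ranks)
#when the per-topic vectors are merged, position i in a sorted top-30 list adds (top-i)**2
top=30
print 'bonus for position 2 in a top list:',(top-2)**2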
In [ ]:
for topic_id in range(1,len(topics)):
    #select topic
    #topic_id=1
    #use title
    usetitle=True
    verbose=False
    #initiate global DFs
    DF=pd.DataFrame()
    projects1={}
    projects2={}
    projects12={}
    #specify depth (n most relevant projects)
    depth=100
    #get topical nouns with textblob
    blob1 = TextBlob(topics[topic_id].decode('utf8'))
    keywords1=blob1.noun_phrases
    #get topical nouns with spacy
    blob2 = nlp(topics[topic_id].decode('utf8'))
    keywords2=[]
    for i in blob2.noun_chunks:
        keywords2.append(str(i).replace(u'\n',' '))
    #create unified, unique set of topical nouns, called keywords here
    keywords=list(set(keywords2).union(set(keywords1)))
    print '----- started processing topic ', topic_id,'-----'
    print 'topic keywords are:',
    for keyword in keywords: print keyword+', ',
    print ' '
    #construct the search query from title and keywords, then cycle through the keywords
    for keyword in keywords:
        if usetitle:
            if verbose: print 'query for <'+title+keyword+'>'
            query=repr(title+keyword).replace(' ','+')[2:-1]
            u0='http://gtr.rcuk.ac.uk/search/project/csv?term='
            u1='&selectedFacets=&fields='
            u2='pro.gr,pro.t,pro.a,pro.orcidId,per.fn,per.on,per.sn,'
            u3='per.fnsn,per.orcidId,per.org.n,per.pro.t,per.pro.abs,pub.t,pub.a,pub.orcidId,org.n,org.orcidId,'
            u4='acp.t,acp.d,acp.i,acp.oid,kf.d,kf.oid,is.t,is.d,is.oid,col.i,col.d,col.c,col.dept,col.org,col.pc,col.pic,'
            u5='col.oid,ip.t,ip.d,ip.i,ip.oid,pol.i,pol.gt,pol.in,pol.oid,prod.t,prod.d,prod.i,prod.oid,rtp.t,rtp.d,rtp.i,'
            u6='rtp.oid,rdm.t,rdm.d,rdm.i,rdm.oid,stp.t,stp.d,stp.i,stp.oid,so.t,so.d,so.cn,so.i,so.oid,ff.t,ff.d,ff.c,'
            u7='ff.org,ff.dept,ff.oid,dis.t,dis.d,dis.i,dis.oid'
            u8='&type=&fetchSize=50'
            u9='&selectedSortableField=score&selectedSortOrder=DESC'
            url=u0+query+u8+u9
            #query RCUK GtR API
            df=pd.read_csv(url,nrows=depth)
            #record scores
            df['score'] = depth-df.index
            df=df.set_index('ProjectReference')
            DF=pd.concat([DF,df])
            for i in df.index:
                if i not in projects12:projects12[i]=0
                projects12[i]+=df.loc[i]['score']**2
                if i not in projects1:projects1[i]=0
                projects1[i]+=df.loc[i]['score']**2
        if verbose: print 'query for <'+keyword+'>'
        query=repr(keyword).replace(' ','+')[2:-1]
        u0='http://gtr.rcuk.ac.uk/search/project/csv?term='
        u1='&selectedFacets=&fields='
        u2='pro.gr,pro.t,pro.a,pro.orcidId,per.fn,per.on,per.sn,'
        u3='per.fnsn,per.orcidId,per.org.n,per.pro.t,per.pro.abs,pub.t,pub.a,pub.orcidId,org.n,org.orcidId,'
        u4='acp.t,acp.d,acp.i,acp.oid,kf.d,kf.oid,is.t,is.d,is.oid,col.i,col.d,col.c,col.dept,col.org,col.pc,col.pic,'
        u5='col.oid,ip.t,ip.d,ip.i,ip.oid,pol.i,pol.gt,pol.in,pol.oid,prod.t,prod.d,prod.i,prod.oid,rtp.t,rtp.d,rtp.i,'
        u6='rtp.oid,rdm.t,rdm.d,rdm.i,rdm.oid,stp.t,stp.d,stp.i,stp.oid,so.t,so.d,so.cn,so.i,so.oid,ff.t,ff.d,ff.c,'
        u7='ff.org,ff.dept,ff.oid,dis.t,dis.d,dis.i,dis.oid'
        u8='&type=&fetchSize=50'
        u9='&selectedSortableField=score&selectedSortOrder=DESC'
        url=u0+query+u8+u9
        #query RCUK GtR API
        df=pd.read_csv(url,nrows=depth)
        #record scores
        df['score'] = depth-df.index
        df=df.set_index('ProjectReference')
        DF=pd.concat([DF,df])
        for i in df.index:
            if i not in projects12:projects12[i]=0
            projects12[i]+=df.loc[i]['score']**2
            if i not in projects2:projects2[i]=0
            projects2[i]+=df.loc[i]['score']**2
    print '----- finished topic ', topic_id,'-----'
    print ' '
    ###### SORTING #######
    #select top projects
    #sort project vectors
    top=30
    import operator
    sorted_projects1=sorted(projects1.items(), key=operator.itemgetter(1))[::-1][:30]
    sorted_projects2=sorted(projects2.items(), key=operator.itemgetter(1))[::-1][:30]
    sorted_projects12=sorted(projects12.items(), key=operator.itemgetter(1))[::-1][:30]
    #record scores in sorted vector in a master vector
    projects={}
    for i in range(len(sorted_projects1)):
        if sorted_projects1[i][0] not in projects:projects[sorted_projects1[i][0]]=0
        projects[sorted_projects1[i][0]]+=(top-i)**2
    for i in range(len(sorted_projects2)):
        if sorted_projects2[i][0] not in projects:projects[sorted_projects2[i][0]]=0
        projects[sorted_projects2[i][0]]+=(top-i)**2
    for i in range(len(sorted_projects12)):
        if sorted_projects12[i][0] not in projects:projects[sorted_projects12[i][0]]=0
        projects[sorted_projects12[i][0]]+=(top-i)**2
    #save final vector of most relevant projects
    sorted_projects=sorted(projects.items(), key=operator.itemgetter(1))[::-1][:30]
    ###### DISPLAY ########
    #print resulting links to projects
    for i in range(len(sorted_projects)):
        print str(i+1)+'.',DF.loc[sorted_projects[i][0]][u'GTRProjectUrl'].values[0],\
            DF.loc[sorted_projects[i][0]][u'PIFirstName'].values[0],\
            DF.loc[sorted_projects[i][0]][u'PISurname'].values[0]+'|',\
            DF.loc[sorted_projects[i][0]][u'LeadROName'].values[0]+'|',\
            DF.loc[sorted_projects[i][0]][u'StartDate'].values[0][6:]+'-'+\
            DF.loc[sorted_projects[i][0]][u'EndDate'].values[0][6:]+'|',\
            str(int(DF.loc[sorted_projects[i][0]][u'AwardPounds'].values[0])/1000)+'k'
        print DF.loc[sorted_projects[i][0]][u'Title'].values[0]+'\n'
        #print '----------------------------------------------------'