Energy Lancaster publication miner

This workbook parses all of the publications listed on Energy Lancaster's Lancaster University Research Portal page and extracts keywords and topical data from abstracts using natural language processing.


In [1]:
#python dom extension functions to get class and other attributes
#python dom extension functions to get class and other attributes
def getAttr(dom, cl, attr='class', el='div'):
    """Find all <el> elements whose space-separated `attr` value contains `cl`.

    Mirrors getElementsByClassName for minidom trees: `attr` is split on single
    spaces and each token is compared to `cl` exactly.  Returns the matching
    element nodes as a list, or None (not an empty list) when nothing matches.
    A node is appended once per matching token, so a duplicated token in the
    attribute value yields duplicate entries.
    """
    matches = []
    for node in dom.getElementsByTagName(el):
        tokens = node.getAttribute(attr).split(' ')
        hits = tokens.count(cl)
        if hits:
            matches.extend([node] * hits)
    return matches if matches else None

Get number of pages for publications


In [2]:
#open first page, parse html, get number of pages and their links
import html5lib
import urllib2
# landing page of Energy Lancaster's publication list on the research portal
url="http://www.research.lancs.ac.uk/portal/en/organisations/energy-lancaster/publications.html"
aResp = urllib2.urlopen(url)
t = aResp.read()
# html5lib builds a minidom-compatible DOM tree from the raw HTML bytes
dom = html5lib.parse(t, treebuilder="dom")
# the <span class="portal_navigator_paging"> element holds the pager links
links=getAttr(dom,'portal_navigator_paging',el='span')[0].childNodes
# keep element nodes only (nodeType==1 skips whitespace text nodes), take the
# last one ([::-1][0]) and read its numeric label two levels down;
# NOTE(review): the trailing -1 presumably discounts a non-numeric trailing
# pager entry (e.g. a "next" link) -- confirm against the live page markup
nr_of_pages=int([i for i in links if i.nodeType==1][::-1][0].childNodes[0].childNodes[0].nodeValue)-1

Extract links to publications, from all pages


In [3]:
#create publist array
publist=[]
#parse publications links on all pages
for pagenr in range(nr_of_pages):
    # each results page is addressed with a ?page=N query parameter
    aResp = urllib2.urlopen(url+'?page='+str(pagenr))
    t = aResp.read()
    dom = html5lib.parse(t, treebuilder="dom")
    #get html list
    htmlpublist=dom.getElementsByTagName('ol')
    #extract pub links
    # walk the first <ol>; nodeType==1 keeps element nodes (skips whitespace text)
    for i in htmlpublist[0].childNodes:
        if i.nodeType==1:
            if i.childNodes[0].nodeType==1:
                # NOTE(review): fixed child-index path into the <li> markup --
                # fragile if the portal template changes; j is expected to be
                # the publication's <a> element carrying the href
                j=i.childNodes[1].childNodes[0].childNodes[0]
                if j.nodeType==1:
                    publist.append(j.getAttribute('href'))
    # Python 2 print statement: progress indicator per results page
    print 'finished page',pagenr


finished page 0
finished page 1
finished page 2
finished page 3
finished page 4
finished page 5
finished page 6

In [4]:
# report how many publication links were harvested across all pages
print len(publist),'publications associated with Energy Lancaster'


608 publications associated with Energy Lancaster

In [5]:
#create the master dictionary, keyed by publication URL; each record starts
#with just its own url and is enriched by the scraping cells below
pubdict = {}
for pub_url in publist:
    pubdict[pub_url] = {"url": pub_url}

Keyword extraction, for each publication


In [7]:
# visit every publication page and scrape keywords, title and abstract into pubdict
for r in range(len(publist)):
    pub=publist[r]
    aResp = urllib2.urlopen(pub)
    t = aResp.read()
    dom = html5lib.parse(t, treebuilder="dom")
    #get keywords from pub page
    keywords=getAttr(dom,'keywords',el='ul')
    if keywords:
        # each keyword is the text of an <a> inside the first <ul class="keywords">
        pubdict[pub]['keywords']=[i.childNodes[0].childNodes[0].nodeValue for i in keywords[0].getElementsByTagName('a')]
    #get title from pub page
    title=getAttr(dom,'title',el='h2')
    if title:
        pubdict[pub]['title']=title[0].childNodes[0].childNodes[0].nodeValue
    # the abstract lives in a portal-specific rendering <div>
    abstract=getAttr(dom,'rendering_researchoutput_abstractportal',el='div')
    if abstract:
        pubdict[pub]['abstract']=abstract[0].childNodes[0].childNodes[0].nodeValue    
    # Python 2 print statement: progress indicator every 10 publications
    if r%10==0: print 'processed',r,'publications...'


processed 0 publications...
processed 10 publications...
processed 20 publications...
processed 30 publications...
processed 40 publications...
processed 50 publications...
processed 60 publications...
processed 70 publications...
processed 80 publications...
processed 90 publications...
processed 100 publications...
processed 110 publications...
processed 120 publications...
processed 130 publications...
processed 140 publications...
processed 150 publications...
processed 160 publications...
processed 170 publications...
processed 180 publications...
processed 190 publications...
processed 200 publications...
processed 210 publications...
processed 220 publications...
processed 230 publications...
processed 240 publications...
processed 250 publications...
processed 260 publications...
processed 270 publications...
processed 280 publications...
processed 290 publications...
processed 300 publications...
processed 310 publications...
processed 320 publications...
processed 330 publications...
processed 340 publications...
processed 350 publications...
processed 360 publications...
processed 370 publications...
processed 380 publications...
processed 390 publications...
processed 400 publications...
processed 410 publications...
processed 420 publications...
processed 430 publications...
processed 440 publications...
processed 450 publications...
processed 460 publications...
processed 470 publications...
processed 480 publications...
processed 490 publications...
processed 500 publications...
processed 510 publications...
processed 520 publications...
processed 530 publications...
processed 540 publications...
processed 550 publications...
processed 560 publications...
processed 570 publications...
processed 580 publications...
processed 590 publications...
processed 600 publications...

In [8]:
#save parsed data
import json
# Write via a context manager so the handle is flushed and closed
# deterministically; the original `file(...).write(...)` relied on refcount
# garbage collection to close the handle (and file() is a deprecated alias of open()).
with open('pubdict.json','w') as f:
    f.write(json.dumps(pubdict))
#load if saved previously
#with open('pubdict.json','r') as f: pubdict=json.loads(f.read())

Mine titles and abstracts for topics


In [ ]:
#import dependencies
import pandas as pd
from textblob import TextBlob
#import spacy
#nlp = spacy.load('en')

In [ ]:
#run once if you need to download nltk corpora, ignore otherwise
import nltk
nltk.download()  # opens the interactive NLTK corpus downloader


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml

In [31]:
#get topical nouns for title and abstract using natural language processing
for i in range(len(pubdict.keys())):
    if 'title' in pubdict[pubdict.keys()[i]]:
        if text:
            text=pubdict[pubdict.keys()[i]]['title']
            #get topical nouns with textblob
            blob1 = TextBlob(text)
            keywords1=blob1.noun_phrases
            #get topical nouns with spacy
            blob2 = nlp(text)
            keywords2=[]
            for k in blob2.noun_chunks:
                keywords2.append(str(k).decode('utf8').replace(u'\n',' '))
            #create unified, unique set of topical nouns, called keywords here
            keywords=list(set(keywords2).union(set(keywords1)))
            pubdict[pubdict.keys()[i]]['title-nlp']=keywords
    if 'abstract' in pubdict[pubdict.keys()[i]]:
        text=pubdict[pubdict.keys()[i]]['abstract']
        if text:
            #get topical nouns with textblob
            blob1 = TextBlob(text)
            keywords1=blob1.noun_phrases
            #get topical nouns with spacy
            blob2 = nlp(text)
            keywords2=[]
            for k in blob2.noun_chunks:
                keywords2.append(str(k).decode('utf8').replace(u'\n',' '))
            #create unified, unique set of topical nouns, called keywords here
            keywords=list(set(keywords2).union(set(keywords1)))
            pubdict[pubdict.keys()[i]]['abstract-nlp']=keywords
    print i,',',


0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100 , 101 , 102 , 103 , 104 , 105 , 106 , 107 , 108 , 109 , 110 , 111 , 112 , 113 , 114 , 115 , 116 , 117 , 118 , 119 , 120 , 121 , 122 , 123 , 124 , 125 , 126 , 127 , 128 , 129 , 130 , 131 , 132 , 133 , 134 , 135 , 136 , 137 , 138 , 139 , 140 , 141 , 142 , 143 , 144 , 145 , 146 , 147 , 148 , 149 , 150 , 151 , 152 , 153 , 154 , 155 , 156 , 157 , 158 , 159 , 160 , 161 , 162 , 163 , 164 , 165 , 166 , 167 , 168 , 169 , 170 , 171 , 172 , 173 , 174 , 175 , 176 , 177 , 178 , 179 , 180 , 181 , 182 , 183 , 184 , 185 , 186 , 187 , 188 , 189 , 190 , 191 , 192 , 193 , 194 , 195 , 196 , 197 , 198 , 199 , 200 , 201 , 202 , 203 , 204 , 205 , 206 , 207 , 208 , 209 , 210 , 211 , 212 , 213 , 214 , 215 , 216 , 217 , 218 , 219 , 220 , 221 , 222 , 223 , 224 , 225 , 226 , 227 , 228 , 229 , 230 , 231 , 232 , 233 , 234 , 235 , 236 , 237 , 238 , 239 , 240 , 241 , 242 , 243 , 244 , 245 , 246 , 247 , 248 , 249 , 250 , 251 , 252 , 253 , 254 , 255 , 256 , 257 , 258 , 259 , 260 , 261 , 262 , 263 , 264 , 265 , 266 , 267 , 268 , 269 , 270 , 271 , 272 , 273 , 274 , 275 , 276 , 277 , 278 , 279 , 280 , 281 , 282 , 283 , 284 , 285 , 286 , 287 , 288 , 289 , 290 , 291 , 292 , 293 , 294 , 295 , 296 , 297 , 298 , 299 , 300 , 301 , 302 , 303 , 304 , 305 , 306 , 307 , 308 , 309 , 310 , 311 , 312 , 313 , 314 , 315 , 316 , 317 , 318 , 319 , 320 , 321 , 322 , 323 , 324 , 325 , 326 , 327 , 328 , 329 , 330 , 331 , 332 , 333 , 334 , 335 , 336 , 337 , 338 , 339 , 340 , 341 , 342 , 343 , 344 , 345 , 346 , 347 , 348 , 349 , 350 , 351 
, 352 , 353 , 354 , 355 , 356 , 357 , 358 , 359 , 360 , 361 , 362 , 363 , 364 , 365 , 366 , 367 , 368 , 369 , 370 , 371 , 372 , 373 , 374 , 375 , 376 , 377 , 378 , 379 , 380 , 381 , 382 , 383 , 384 , 385 , 386 , 387 , 388 , 389 , 390 , 391 , 392 , 393 , 394 , 395 , 396 , 397 , 398 , 399 , 400 , 401 , 402 , 403 , 404 , 405 , 406 , 407 , 408 , 409 , 410 , 411 , 412 , 413 , 414 , 415 , 416 , 417 , 418 , 419 , 420 , 421 , 422 , 423 , 424 , 425 , 426 , 427 , 428 , 429 , 430 , 431 , 432 , 433 , 434 , 435 , 436 , 437 , 438 , 439 , 440 , 441 , 442 , 443 , 444 , 445 , 446 , 447 , 448 , 449 , 450 , 451 , 452 , 453 , 454 , 455 , 456 , 457 , 458 , 459 , 460 , 461 , 462 , 463 , 464 , 465 , 466 , 467 , 468 , 469 , 470 , 471 , 472 , 473 , 474 , 475 , 476 , 477 , 478 , 479 , 480 , 481 , 482 , 483 , 484 , 485 , 486 , 487 , 488 , 489 , 490 , 491 , 492 , 493 , 494 , 495 , 496 , 497 , 498 , 499 , 500 , 501 , 502 , 503 , 504 , 505 , 506 , 507 , 508 , 509 , 510 , 511 , 512 , 513 , 514 , 515 , 516 , 517 , 518 , 519 , 520 , 521 , 522 , 523 , 524 , 525 , 526 , 527 , 528 , 529 , 530 , 531 , 532 , 533 , 534 , 535 , 536 , 537 , 538 , 539 , 540 , 541 , 542 , 543 , 544 , 545 , 546 , 547 , 548 , 549 , 550 , 551 , 552 , 553 , 554 , 555 , 556 , 557 , 558 , 559 , 560 , 561 , 562 , 563 , 564 , 565 , 566 , 567 , 568 , 569 , 570 , 571 , 572 , 573 , 574 , 575 , 576 , 577 , 578 , 579 , 580 , 581 , 582 , 583 , 584 , 585 , 586 , 587 , 588 , 589 , 590 , 591 , 592 , 593 , 594 , 595 , 596 , 597 , 598 , 599 , 600 , 601 , 602 , 603 , 604 , 605 , 606 , 607 ,

In [32]:
#save parsed data
# Context manager guarantees the JSON is flushed and the handle closed;
# the original `file(...).write(...)` left closing to refcount GC.
with open('pubdict2.json','w') as f:
    f.write(json.dumps(pubdict))
#load if saved previously
#with open('pubdict2.json','r') as f: pubdict=json.loads(f.read())

Save output for D3 word cloud


In [41]:
#flatten per-publication fields into plain lists, skipping missing or empty entries
keywords=[j for i in pubdict if 'keywords' in pubdict[i] if pubdict[i]['keywords'] for j in pubdict[i]['keywords']]
titles=[pubdict[i]['title'] for i in pubdict if 'title' in pubdict[i] if pubdict[i]['title']]
abstracts=[pubdict[i]['abstract'] for i in pubdict if 'abstract' in pubdict[i] if pubdict[i]['abstract']]
title_nlp=[j for i in pubdict if 'title-nlp' in pubdict[i] if pubdict[i]['title-nlp'] for j in pubdict[i]['title-nlp']]
abstract_nlp=[j for i in pubdict if 'abstract-nlp' in pubdict[i] if pubdict[i]['abstract-nlp'] for j in pubdict[i]['abstract-nlp']]
#combined corpora for the D3 word cloud
kt=keywords+titles
kta=kt+abstracts
kt_nlp=keywords+title_nlp
# BUG FIX: was `kt+abstract_nlp`, which mixed the raw titles into the NLP
# corpus; the parallel naming (kt -> kta) indicates kt_nlp + abstract_nlp
# was intended here.
kta_nlp=kt_nlp+abstract_nlp

def _dump_json(path, obj):
    # write one JSON document, closing the handle deterministically
    # (the original file(...).write(...) relied on refcount GC)
    with open(path, 'w') as f:
        f.write(json.dumps(obj))

_dump_json('keywords.json', keywords)
_dump_json('titles.json', titles)
_dump_json('abstracts.json', abstracts)
_dump_json('kt.json', kt)
_dump_json('kta.json', kta)
_dump_json('kt_nlp.json', kt_nlp)
_dump_json('kta_nlp.json', kta_nlp)

In [37]:
import re

# precompiled patterns: first split "CamelCase" -> "Camel Case", then split
# a lowercase-or-digit/uppercase boundary such as "camel2Case" -> "camel2 Case"
_CAMEL_FIRST = re.compile(r'(.)([A-Z][a-z]+)')
_CAMEL_SECOND = re.compile(r'([a-z0-9])([A-Z])')

def convert(name):
    """Split a CamelCase keyword into lower-cased, space-separated words."""
    spaced = _CAMEL_FIRST.sub(r'\1 \2', name)
    return _CAMEL_SECOND.sub(r'\1 \2', spaced).lower()

In [49]:
#split CamelCase keywords into space-separated words, then into single words,
#and save each corpus for the D3 word cloud
def _save_json(path, obj):
    # context-managed write so the handle is always closed
    # (the original file(...).write(...) relied on refcount GC)
    with open(path, 'w') as f:
        f.write(json.dumps(obj))

kc=[convert(i) for i in keywords]
_save_json('kc.json', kc)
ks=[j for i in kc for j in i.split()]
_save_json('ks.json', ks)
ktc_nlp=[convert(i) for i in kt_nlp]
_save_json('ktc_nlp.json', ktc_nlp)
kts_nlp=[j for i in ktc_nlp for j in i.split()]
_save_json('kts_nlp.json', kts_nlp)
ktac_nlp=[convert(i) for i in kta_nlp]
_save_json('ktac_nlp.json', ktac_nlp)
ktas_nlp=[j for i in ktac_nlp for j in i.split()]
_save_json('ktas_nlp.json', ktas_nlp)

In [47]:



Out[47]:
[u'sustainability',
 u'human-centered',
 u'computing',
 u'interaction',
 u'design',
 u'hydrodynamics',
 u'sediment',
 u'transport',
 u'morphodynamics',
 u'groyne',
 u'scour',
 u'coastal',
 u'structures',
 u'noble',
 u'gas',
 u'halogen',
 u'subduction',
 u'pore',
 u'fluid',
 u'volatile',
 u'recycling',
 u'fluvial',
 u'suspended',
 u'sediment',
 u'phosphorus',
 u'diffuse',
 u'pollution',
 u'water',
 u'quality',
 u'headwater',
 u'connectivity',
 u'grassland',
 u'spectra',
 u'\u03b3',
 u'rays',
 u'digital',
 u'discrimination',
 u'frequency',
 u'gradient',
 u'analysis',
 u'neutron',
 u'organic',
 u'scintillators',
 u'pulse',
 u'gradient',
 u'analysis',
 u'time',
 u'of',
 u'flight',
 u'photocatalytically',
 u'initiated',
 u'electroless',
 u'deposition',
 u'modeling',
 u'and',
 u'control',
 u'of',
 u'agriculture',
 u'plant',
 u'factories',
 u'optimal',
 u'control',
 u'in',
 u'agriculture',
 u'distributed',
 u'control',
 u'of',
 u'environmental',
 u'systems',
 u'modeling',
 u'and',
 u'identification',
 u'parameter',
 u'estimation',
 u'system',
 u'identification',
 u'nonlinear',
 u'model',
 u'genetic',
 u'algorithm',
 u'mathematical',
 u'modeling',
 u'gomos',
 u'changepoint',
 u'analysis',
 u'penalised',
 u'likelihood',
 u'hindcast',
 u'time',
 u'series',
 u'significant',
 u'wave',
 u'height.',
 u'hollow',
 u'nanoparticles',
 u'nanorods',
 u'li',
 u'storage',
 u'cobalt',
 u'phosphides',
 u'oil',
 u'phase',
 u'synthesis',
 u'improved',
 u'electrochemical',
 u'performance',
 u'negative-electrode',
 u'thermal-decomposition',
 u'conversion',
 u'reactions',
 u'nickel',
 u'phosphide',
 u'syringe',
 u'pump',
 u'nanoparticles',
 u'temperature',
 u'nanowires',
 u'nanorods',
 u'internet',
 u'of',
 u'things',
 u'modules',
 u'and',
 u'interfaces',
 u'pervasive',
 u'computing',
 u'software',
 u'architectures',
 u'multi-objective',
 u'evolutionary',
 u'optimisation',
 u'wave',
 u'energy',
 u'converter',
 u'principal',
 u'component',
 u'analysis',
 u'(pca)',
 u'feature',
 u'extraction',
 u'condition',
 u'monitoring',
 u'wind',
 u'turbine',
 u'distributed',
 u'generation',
 u'nondestructive',
 u'testing',
 u'eddy',
 u'current',
 u'electromagnetic',
 u'induction',
 u'imaging',
 u'condition',
 u'monitoring',
 u'power',
 u'plant',
 u'reliability',
 u'condition',
 u'monitoring',
 u'operation',
 u'and',
 u'maintenance',
 u'real-time',
 u'simulation',
 u'wind',
 u'turbines',
 u'changepoint',
 u'oceanography',
 u'search',
 u'method',
 u'scanning-tunneling-microscopy',
 u'augmented-wave',
 u'method',
 u'pyrolytic-graphite',
 u'surface',
 u'oxygen',
 u'ag(111)',
 u'adsorption',
 u'assemblies',
 u'molecule',
 u'pyridine',
 u'electrochemical',
 u'analysis',
 u'etching',
 u'tungsten',
 u'nitrate',
 u'cycling',
 u'riverbed',
 u'sediment',
 u'water',
 u'quality',
 u'pollution',
 u'hyporheic',
 u'zone',
 u'groundwater',
 u'pore',
 u'water',
 u'det',
 u'semiconductors',
 u'localization',
 u'quantum',
 u'dots',
 u'level-transient',
 u'spectroscopy',
 u'dementia',
 u'mouse',
 u'dynamics',
 u'keystroke',
 u'dynamics',
 u'data',
 u'mining',
 u'medical',
 u'informatics',
 u'lithium',
 u'insertion',
 u'li-ion',
 u'framework',
 u'cathode',
 u'fe-2(so4)(3)',
 u'storage',
 u'energy',
 u'model-driven',
 u'software',
 u'development',
 u'education',
 u'parameter',
 u'estimation',
 u'system',
 u'identification',
 u'nonlinear',
 u'model',
 u'multi-objective',
 u'genetic',
 u'algorithm',
 u'mathematical',
 u'modeling',
 u'inverse',
 u'theory',
 u'data',
 u'fusion',
 u'markov-chain',
 u'monte-',
 u'carlo',
 u'trans-dimensional',
 u'electrical',
 u'resistivity',
 u'tomography',
 u'laterally',
 u'constrained',
 u'inversion',
 u'marine',
 u'seismic',
 u'ava',
 u'maximum-entropy',
 u'experimental-design',
 u'joint',
 u'inversion',
 u'water-content',
 u'csem',
 u'data',
 u'model',
 u'resistivity',
 u'algorithms',
 u'hydrogen',
 u'adsorption',
 u'nitrogen',
 u'adsorption',
 u'hydrogen',
 u'storage',
 u'mof',
 u'structure-property',
 u'relationship',
 u'breathing',
 u'structure',
 u'user-driven',
 u'creative',
 u'workshop',
 u'design',
 u'future',
 u'problem',
 u'small',
 u'hydro',
 u'plants',
 u'hydro',
 u'project',
 u'cost',
 u'electro-mechanical',
 u'equipment',
 u'cost',
 u'turbine',
 u'cost',
 u'cloud',
 u'computing',
 u'flooding',
 u'stakeholder',
 u'engagement',
 u'rural',
 u'land',
 u'management',
 u'local',
 u'ev',
 u'op',
 u'flooding',
 u'tool',
 u'(left)',
 u'scanning-tunneling-microscopy',
 u'augmented-wave',
 u'method',
 u'metal-surfaces',
 u'pyrolytic-graphite',
 u'cu',
 u'assemblies',
 u'networks',
 u'pyridine',
 u'ag(100)',
 u'cu(100)',
 u'agriculture',
 u'climate',
 u'eutrophication',
 u'waste',
 u'water',
 u'windermere',
 u'robot',
 u'arms',
 u'robot',
 u'programming',
 u'robotic',
 u'equipment',
 u'automation',
 u'anomalous',
 u'diffusion',
 u'solute',
 u'transport',
 u'simulation',
 u'laplacian',
 u'delivery',
 u'dynamics',
 u'model',
 u'mitigation',
 u'strategy',
 u'phosphorus',
 u'delivery',
 u'fuzzy',
 u'modelling',
 u'constructed',
 u'wetlands',
 u'riparian',
 u'buffer',
 u'zones',
 u'water',
 u'quality',
 u'bimetallic',
 u'surface',
 u'surface',
 u'alloy',
 u'platinum',
 u'monolayer',
 u'electrocatalysis',
 u'oxygen',
 u'reduction',
 u'reaction',
 u'scanning',
 u'probe',
 u'microscopy',
 u'ruthenium',
 u'ad-atoms',
 u'heterogeneous',
 u'catalysis',
 u'electronic-properties',
 u'ammonia-synthesis',
 u'carbon-monoxide',
 u'single-crystal',
 u'metal-surfaces',
 u'co',
 u'adsorption',
 u'platinum',
 u'electrocatalysts',
 u'hydropower',
 u'impulse',
 u'turbines',
 u'pelton',
 u'turbine',
 u'turgo',
 u'turbine',
 u'computational',
 u'fluid',
 u'dynamics',
 u'optimisation',
 u'ion-selective',
 u'electrodes',
 u'membrane',
 u'electrodes',
 u'derivatives',
 u'ionophores',
 u'sites',
 u'digital',
 u'signage,',
 u'hci,',
 u'long-term',
 u'deployments,',
 u'public',
 u'display',
 u'synaptic',
 u'plasticity',
 u'long',
 u'term',
 u'depression',
 u'dominant',
 u'sub-processes',
 u'discrete-time',
 u'transfer',
 u'function',
 u'models',
 u'relaxation',
 u'induced',
 u'polarization',
 u'measurements',
 u'parameters',
 u'washington',
 u'rocks',
 u'shaly',
 u'sands',
 u'resistivity',
 u'zone',
 u'groundwater',
 u'electrical-properties',
 u'parameter',
 u'estimation',
 u'system',
 u'identification',
 u'nonlinear',
 u'model',
 u'genetic',
 u'algorithm',
 u'mathematical',
 u'modeling',
 u'domain-induced',
 u'polarization',
 u'clay-rocks',
 u'electrolyte',
 u'solution',
 u'cole-cole',
 u'parameters',
 u'colloidal',
 u'particles',
 u'sands',
 u'time-domain',
 u'frequency',
 u'dielectric-dispersion',
 u'porous-media',
 u'electrical-properties',
 u'resolution',
 u'tunnel-junctions',
 u'scattering',
 u'spectroscopy',
 u'transition-edge',
 u'sensors',
 u'microcalorimeters',
 u'metals',
 u'magnetic',
 u'calorimeters',
 u'detectors',
 u'performance',
 u'wind',
 u'turbines',
 u'supervisory',
 u'control',
 u'and',
 u'data',
 u'acquisition',
 u'(scada)',
 u'data',
 u'parallel',
 u'factor',
 u'analysis',
 u'k-means',
 u'clustering',
 u'condition',
 u'monitoring',
 u'metal-surfaces',
 u'organic-molecules',
 u'pyrolytic-graphite',
 u'adsorption',
 u'assemblies',
 u'complexes',
 u'substrate',
 u'cu(110)',
 u'model',
 u'predictive',
 u'control',
 u'non-',
 u'minimal',
 u'state',
 u'space',
 u'optimal',
 u'controller',
 u'tuning',
 u'decoupling',
 u'random',
 u'field',
 u'locally',
 u'stationary',
 u'local',
 u'autocovariance',
 u'ls2',
 u'w',
 u'texture',
 u'analysis',
 u'non-decimated',
 u'wavelets',
 u'r.',
 u'energy,',
 u'heat',
 u'policy',
 u'high-capacity',
 u'nanostructured',
 u'materials',
 u'energy-conversion',
 u'nanowire',
 u'arrays',
 u'storage',
 u'devices',
 u'oxide',
 u'co3',
 u'o4',
 u'foam',
 u'challenges',
 u'identification',
 u'state-dependent',
 u'parameter',
 u'robotics',
 u'non-minimal',
 u'state',
 u'space',
 u'dots',
 u'molecular-beam',
 u'epitaxy',
 u'segregation',
 u'sodium-ion',
 u'batteries',
 u'polyoxometalates',
 u'cluster',
 u'electrodes',
 u'sodium-ion',
 u'battery',
 u'anodes',
 u'hybrid',
 u'electrode',
 u'materials',
 u'lasers',
 u'photoluminescence',
 u'localization',
 u'temperature',
 u'dependence',
 u'wells',
 u'copper',
 u'electric',
 u'power',
 u'supplies',
 u'electrochemical',
 u'techniques',
 u'electrodes',
 u'ions',
 u'lithium',
 u'nanostructures',
 u'vanadium',
 u'compounds',
 u'eroi',
 u'e',
 u'ro',
 u'ei',
 u'photovoltaic',
 u'energy',
 u'insolation',
 u'levels',
 u'switzerland',
 u'germany',
 u'incentive',
 u'system',
 u'adjustment',
 u'factor',
 u'impulse',
 u'turbine',
 u'injectors',
 u'pelton-',
 u'turgo',
 u'runner',
 u'hydraulic',
 u'efficiency',
 u'numerical',
 u'modelling',
 u'computational',
 u'fluid',
 u'dynamics',
 u'hydropower',
 u'spear',
 u'valve',
 u'design',
 u'commercial',
 u'cell',
 u'modification',
 u'current',
 u'density',
 u'distribution',
 u'internal',
 u'temperature',
 u'local',
 u'potential',
 u'measurements',
 u'state',
 u'of',
 u'charge',
 u'(soc)',
 u'inhomogeneities',
 u'starting',
 u'work',
 u'parental',
 u'influence',
 u'britain',
 u'twentieth',
 u'century',
 u'polyethylene',
 u'collimator',
 u'ej-426',
 u'detector',
 u'ultra-fast',
 u'mixed-field',
 u'analyser',
 u'real-time',
 u'digital',
 u'imaging',
 u'analysis',
 u'electrochemical',
 u'capacitors',
 u'nanowire',
 u'arrays',
 u'energy-storage',
 u'batteries',
 u'geophysics',
 u'peatlands',
 u'carbon',
 u'cycle',
 u'ground-penetrating',
 u'radar',
 u'northern',
 u'peatlands',
 u'carbon-dioxide',
 u'raised',
 u'bogs',
 u'methane',
 u'accumulation',
 u'minnesota',
 u'bubbles',
 u'velocity',
 u'complex',
 u'templating',
 u'synthesis',
 u'iron',
 u'oxide',
 u'nanoparticles',
 u'lyotropic',
 u'mesophases',
 u'nitrate',
 u'solutions',
 u'nanosized',
 u'particles',
 u'goethite',
 u'hematite',
 u'precipitation',
 u'assemblies',
 u'hydroxides',
 u'glycol',
 u'gamma',
 u'river\u2013groundwater',
 u'interaction',
 u'temperature',
 u'time',
 u'series',
 u'analysis',
 u'seepage',
 u'flux',
 u'walking',
 u'cycling',
 u'sustainable',
 u'travel',
 u'household',
 u'constraints',
 u'stainless-steel',
 u'nitric-acid',
 u'corrosion',
 u'grow-cell',
 u'closed-environment',
 u'growing',
 u'system',
 u'state-dependent',
 u'parameter',
 u'spatial',
 u'zones',
 u'thermal',
 u'modelling',
 u'hammerstein',
 u'model',
 u'environmental',
 u'chamber',
 u'transfer',
 u'function',
 u'model',
 u'logistic',
 u'growth',
 u'function',
 u'transport',
 u'history',
 u'policy',
 u'anthraquinone',
 u'\u03c0-conjugation',
 u'mechanically',
 u'controlled',
 u'break',
 u'junction',
 u'single-molecule',
 u'conductance',
 u'relaxation',
 u'breakthrough',
 u'curves',
 u'surface-reactions',
 u'advection-dispersion',
 u'solute',
 u'transport',
 u'fractional',
 u'dispersion',
 u'nuclear-magnetic-resonance',
 u'nmr',
 u'tracer',
 u'tests',
 u'porous-media',
 u'disruption',
 u'travel',
 u'resilience',
 u'diaries',
 u'weather',
 u'family',
 u'risk',
 u'bootstrapping',
 u'fabric',
 u'analysis',
 u'lattice',
 u'processes',
 u'non-stationarity',
 u'random',
 u'field',
 u'global',
 u'navigation',
 u'satellite',
 u'system',
 u'navigation',
 u'signal',
 u'simulator',
 u'multi-constellation',
 u'reconfigurable',
 u'platform',
 u'walking',
 u'cycling',
 u'greenhouse',
 u'gas',
 u'emissions',
 u'ethnographies',
 u'complexity',
 u'electron-transport',
 u'tetracyanoethylene',
 u'conductivity',
 u'derivatives',
 u'wires',
 u'tetrathiafulvalene',
 u'resistance',
 u'chemistry',
 u'thiophene',
 u'binding',
 u'decolorization',
 u'semiconductor',
 u'acid',
 u'red',
 u'17',
 u'pollutants',
 u'nanoparticles',
 u'titanium',
 u'dioxide',
 u'photosensitized',
 u'degradation',
 u'textile',
 u'effluent',
 u'reactive',
 u'red',
 u'241',
 u'tio2',
 u'thin-films',
 u'visible-light',
 u'photodegradation',
 u'dispersions',
 u'reduction',
 u'textile',
 u'dye',
 u'radiation',
 u'risk',
 u'based',
 u'policy',
 u'livestock',
 u'disease',
 u'control',
 u'cultural',
 u'theory',
 u'expert',
 u'discourse',
 u'sociotechnical',
 u'hazards',
 u'condition',
 u'monitoring',
 u'distributed',
 u'generation',
 u'wind',
 u'turbine',
 u'lipschitz',
 u'exponent',
 u'feature',
 u'extraction',
 u'data',
 u'mining',
 u'and',
 u'fusion',
 u'stream',
 u'water',
 u'interface',
 u'photocatalysis',
 u'electroless',
 u'deposition',
 u'titanium',
 u'dioxide',
 u'quartz',
 u'crystal',
 u'microbalance',
 u'palladium',
 u'membranes',
 u'tio2',
 u'films',
 u'copper',
 u'water',
 u'pd',
 u'pscad/emtdc',
 u'crowbar',
 u'protection',
 u'doubly',
 u'fed',
 u'induction',
 u'generators',
 u'(dfig)',
 u'fault',
 u'ride-through',
 u'(frt)',
 u'tidal',
 u'current',
 u'turbine',
 u'phosphatase',
 u'mobilisation',
 u'transport',
 u'temporal',
 u'review',
 u'hydrolysis',
 u'hyporheic',
 u'river',
 u'restoration',
 u'riffle-pool',
 u'groundwater',
 u'pumping',
 u'exchange',
 u'scale',
 u'waste',
 u'management',
 u'nuclear',
 u'power',
 u'decommissioning',
 u'wetland',
 u'hydrogeophysics',
 u'temperature',
 u'resistivity',
 u'biomimetics',
 u'fish',
 u'swimming',
 u'undulating',
 u'propulsion',
 u'orthogonal',
 u'experiment',
 u'design',
 u'nuclear',
 u'plutonium',
 u'dioxide',
 u'water',
 u'adsorption',
 u'qcm',
 u'cerium',
 u'oxide',
 u'structure-',
 u'property',
 u'relationships',
 u'characterization',
 u'tools',
 u'nanocrystals',
 u'stimuli-',
 u'responsive',
 u'materials',
 u'data',
 u'storage',
 u'ultrasonic',
 u'force',
 u'microscopy',
 u'ufm',
 u'subsurface',
 u'3',
 u'd',
 u'hyporheic',
 u'nitrate',
 u'hydrological',
 u'pathways',
 u'groundwater-fed',
 u'rivers',
 u'water',
 u'quality',
 u'pollution',
 u'wave',
 u'energy',
 u'converter',
 u'(wec)',
 u'multi',
 u'axis',
 u'wave',
 u'energy',
 u'damping,',
 u'wave',
 u'tank',
 u'experiment',
 u'state-dependent',
 u'parameter',
 u'non-minimal',
 u'state',
 u'space',
 u'pole',
 u'assignment',
 u'nuclear',
 u'decommissioning',
 u'robot',
 u'hydraulic',
 u'manipulators',
 u'pole',
 u'assignment',
 u'mean',
 u'value',
 u'engine',
 u'model',
 u'multivariable',
 u'decoupling',
 u'control',
 u'engine',
 u'control',
 u'internal',
 u'combustion',
 u'engine',
 u'mvem',
 u'non',
 u'minimal',
 u'state',
 u'space',
 u'sliding',
 u'mode',
 u'control',
 u'dilute',
 u'nitrides',
 u'multiple',
 u'quantum',
 u'wells',
 u'magneto-photoluminescence',
 u'excitons',
 u'interaction',
 u'design',
 u'thermal',
 u'comfort',
 u'heating',
 u'cooling',
 u'gallium',
 u'arsenide',
 u'gallium',
 u'compounds',
 u'iii-v',
 u'semiconductors',
 u'infrared',
 u'spectra',
 u'nanostructured',
 u'materials',
 u'semiconductor',
 u'quantum',
 u'dots',
 u'solar',
 ...]

Having constructed three project score vectors (without title, with title, both), we sort the projects based on high scores. These are the best-matching research projects. We display a link to each of them below. Repeat for each topic.


In [ ]:
# For each topic, extract its noun phrases, query the RCUK Gateway to Research
# project-search API once per phrase (with and without the title prefixed),
# accumulate squared rank scores per project, and print the top matches.
# NOTE(review): depends on `topics`, `title` and `nlp` from cells not shown in
# this view; `title` appears to be leftover kernel state from the scraping
# loop above -- confirm it holds the intended search-title string.
for topic_id in range(1,len(topics)):
    #select topic
    #topic_id=1
    #use title
    usetitle=True
    verbose=False
    #initiate global DFs
    DF=pd.DataFrame()
    # projects1: scores from title+keyword queries; projects2: keyword-only
    # queries; projects12: both combined. Values are sums of squared rank scores.
    projects1={}
    projects2={}
    projects12={}
    #specify depth (n most relevant projects)
    depth=100
    #get topical nouns with textblob
    blob1 = TextBlob(topics[topic_id].decode('utf8'))
    keywords1=blob1.noun_phrases
    #get topical nouns with spacy
    blob2 = nlp(topics[topic_id].decode('utf8'))
    keywords2=[]
    for i in blob2.noun_chunks:
        keywords2.append(str(i).replace(u'\n',' '))
    #create unified, unique set of topical nouns, called keywords here
    keywords=list(set(keywords2).union(set(keywords1)))
    print '----- started processing topic ', topic_id,'-----'
    print 'topic keywords are:',
    for keyword in keywords: print keyword+', ',
    print ' '
    #construct search query from title and keywords, the cycle through the keywords
    for keyword in keywords:
        if usetitle:
            if verbose: print 'query for <'+title+keyword+'>'
            # repr(...)[2:-1] strips the u'...' wrapper from the unicode repr;
            # spaces become '+' to form a URL query term
            query=repr(title+keyword).replace(' ','+')[2:-1]
            u0='http://gtr.rcuk.ac.uk/search/project/csv?term='
            # NOTE(review): u1-u7 are assembled but never used below
            # (url=u0+query+u8+u9), so the fields filter is not applied
            u1='&selectedFacets=&fields='
            u2='pro.gr,pro.t,pro.a,pro.orcidId,per.fn,per.on,per.sn,'
            u3='per.fnsn,per.orcidId,per.org.n,per.pro.t,per.pro.abs,pub.t,pub.a,pub.orcidId,org.n,org.orcidId,'
            u4='acp.t,acp.d,acp.i,acp.oid,kf.d,kf.oid,is.t,is.d,is.oid,col.i,col.d,col.c,col.dept,col.org,col.pc,col.pic,'
            u5='col.oid,ip.t,ip.d,ip.i,ip.oid,pol.i,pol.gt,pol.in,pol.oid,prod.t,prod.d,prod.i,prod.oid,rtp.t,rtp.d,rtp.i,'
            u6='rtp.oid,rdm.t,rdm.d,rdm.i,rdm.oid,stp.t,stp.d,stp.i,stp.oid,so.t,so.d,so.cn,so.i,so.oid,ff.t,ff.d,ff.c,'
            u7='ff.org,ff.dept,ff.oid,dis.t,dis.d,dis.i,dis.oid'
            u8='&type=&fetchSize=50'
            u9='&selectedSortableField=score&selectedSortOrder=DESC'
            url=u0+query+u8+u9
            #query RCUK GtR API
            df=pd.read_csv(url,nrows=depth)
            #record scores
            # rank-based score: first row gets `depth`, last kept row gets the lowest
            df['score'] = depth-df.index
            df=df.set_index('ProjectReference')
            DF=pd.concat([DF,df])
        # NOTE(review): this accumulation loop sits OUTSIDE `if usetitle:` --
        # if usetitle were False, `df` would be undefined on the first keyword
        # (NameError). It only works because usetitle is hardcoded True above.
        for i in df.index:
            if i not in projects12:projects12[i]=0
            projects12[i]+=df.loc[i]['score']**2
            if i not in projects1:projects1[i]=0
            projects1[i]+=df.loc[i]['score']**2
        if verbose: print 'query for <'+keyword+'>'
        # second pass: keyword-only query (no title prefix), same URL recipe
        query=repr(keyword).replace(' ','+')[2:-1]
        u0='http://gtr.rcuk.ac.uk/search/project/csv?term='
        u1='&selectedFacets=&fields='
        u2='pro.gr,pro.t,pro.a,pro.orcidId,per.fn,per.on,per.sn,'
        u3='per.fnsn,per.orcidId,per.org.n,per.pro.t,per.pro.abs,pub.t,pub.a,pub.orcidId,org.n,org.orcidId,'
        u4='acp.t,acp.d,acp.i,acp.oid,kf.d,kf.oid,is.t,is.d,is.oid,col.i,col.d,col.c,col.dept,col.org,col.pc,col.pic,'
        u5='col.oid,ip.t,ip.d,ip.i,ip.oid,pol.i,pol.gt,pol.in,pol.oid,prod.t,prod.d,prod.i,prod.oid,rtp.t,rtp.d,rtp.i,'
        u6='rtp.oid,rdm.t,rdm.d,rdm.i,rdm.oid,stp.t,stp.d,stp.i,stp.oid,so.t,so.d,so.cn,so.i,so.oid,ff.t,ff.d,ff.c,'
        u7='ff.org,ff.dept,ff.oid,dis.t,dis.d,dis.i,dis.oid'
        u8='&type=&fetchSize=50'
        u9='&selectedSortableField=score&selectedSortOrder=DESC'
        url=u0+query+u8+u9
        #query RCUK GtR API
        df=pd.read_csv(url,nrows=depth)
        #record scores
        df['score'] = depth-df.index
        df=df.set_index('ProjectReference')
        DF=pd.concat([DF,df])
        for i in df.index:
            if i not in projects12:projects12[i]=0
            projects12[i]+=df.loc[i]['score']**2
            if i not in projects2:projects2[i]=0
            projects2[i]+=df.loc[i]['score']**2
    print '----- finished topic ', topic_id,'-----'
    print ' '

    ###### SORTING #######
    #select top projects
    #sort project vectors
    top=30
    import operator
    # sort ascending by score, reverse, keep the 30 best
    # NOTE(review): the [:30] slices hardcode the value of `top` defined above
    sorted_projects1=sorted(projects1.items(), key=operator.itemgetter(1))[::-1][:30]
    sorted_projects2=sorted(projects2.items(), key=operator.itemgetter(1))[::-1][:30]
    sorted_projects12=sorted(projects12.items(), key=operator.itemgetter(1))[::-1][:30]
    #record scores in sorted vector in a master vector
    # each of the three ranked lists contributes (top - position)^2 per project
    projects={}
    for i in range(len(sorted_projects1)):
            if sorted_projects1[i][0] not in projects:projects[sorted_projects1[i][0]]=0
            projects[sorted_projects1[i][0]]+=(top-i)**2
    for i in range(len(sorted_projects2)):
            if sorted_projects2[i][0] not in projects:projects[sorted_projects2[i][0]]=0
            projects[sorted_projects2[i][0]]+=(top-i)**2
    for i in range(len(sorted_projects12)):
            if sorted_projects12[i][0] not in projects:projects[sorted_projects12[i][0]]=0
            projects[sorted_projects12[i][0]]+=(top-i)**2
    #save final vector of most relevant projects
    sorted_projects=sorted(projects.items(), key=operator.itemgetter(1))[::-1][:30]

    ###### DISPLAY ########
    #print resulting links to projects
    # one line per project: rank. url PI-name | lead org | years | award in k
    # NOTE(review): .values[0] takes the first row when DF holds duplicate
    # index entries for the same ProjectReference; int(...)/1000 is Python 2
    # integer division, so awards are truncated to whole thousands
    for i in range(len(sorted_projects)):
        print str(i+1)+'.',DF.loc[sorted_projects[i][0]][u'GTRProjectUrl'].values[0],\
                DF.loc[sorted_projects[i][0]][u'PIFirstName'].values[0],\
                DF.loc[sorted_projects[i][0]][u'PISurname'].values[0]+'|',\
                DF.loc[sorted_projects[i][0]][u'LeadROName'].values[0]+'|',\
                DF.loc[sorted_projects[i][0]][u'StartDate'].values[0][6:]+'-'+\
                DF.loc[sorted_projects[i][0]][u'EndDate'].values[0][6:]+'|',\
                str(int(DF.loc[sorted_projects[i][0]][u'AwardPounds'].values[0])/1000)+'k'
        print DF.loc[sorted_projects[i][0]][u'Title'].values[0]+'\n'

    #print '----------------------------------------------------'