In [78]:
import re
import copy
import furl
import requests
import tldextract
from lxml import etree

In [79]:
# URL of the ARPHA web-services page that this notebook downloads and scrapes.
arpha = "http://arphahub.com/about/webservices"

In [80]:
# Download the raw bytes of the ARPHA web-services page.
# An explicit timeout is required: requests otherwise waits indefinitely on a
# stalled connection, which would hang the notebook on "Run All".
response = requests.get(arpha, timeout=30)
# Fail here on a 4xx/5xx reply instead of silently parsing an error page below.
response.raise_for_status()
content = response.content

In [81]:
# Parse the downloaded bytes with lxml's HTML parser; `parsed` is the element
# that the XPath queries in the following cells are run against.
parsed = etree.HTML(content)

In [82]:
# Select every <p> whose inline style attribute is exactly "margin-left: 40px;".
# NOTE(review): presumably these are the indented per-journal web-service
# entries; the selector depends on the page's inline styling, so confirm it
# still matches if the page layout changes.
INDENTED_PARAGRAPH_XPATH = '//p[@style="margin-left: 40px;"]'
uri_elements = parsed.xpath(INDENTED_PARAGRAPH_XPATH)

In [83]:
# Collect the link text of every paragraph labelled 'OAI-PMH - oai_dc:'.
# The result is a list of URL strings, in document order.
oai_pmh_uris = []
for uri_part in uri_elements:
    # lxml reports .text as None when a <p> has no text before its first
    # child, so normalise to '' before stripping instead of crashing.
    label = (uri_part.text or '').strip()
    if label != 'OAI-PMH - oai_dc:':
        continue

    link = uri_part.find('a')
    # A labelled paragraph without a usable <a> means the page layout changed.
    # Fail loudly rather than drop the entry, because later cells pair these
    # URLs with journal names by position.
    if link is None or not link.text:
        raise ValueError(
            "paragraph labelled 'OAI-PMH - oai_dc:' has no link text to read the URL from"
        )
    oai_pmh_uris.append(link.text)

In [84]:
# Display the collected OAI-PMH URLs for a visual sanity check.
oai_pmh_uris


Out[84]:
['http://bdj.pensoft.net/oai.php?verb=ListRecords&set=bdj&metadataPrefix=oai_dc',
 'http://zookeys.pensoft.net/oai.php?verb=ListRecords&set=zookeys&metadataPrefix=oai_dc',
 'http://phytokeys.pensoft.net/oai.php?verb=ListRecords&set=phytokeys&metadataPrefix=oai_dc',
 'http://mycokeys.pensoft.net/oai.php?verb=ListRecords&set=mycokeys&metadataPrefix=oai_dc',
 'http://biorisk.pensoft.net/oai.php?verb=ListRecords&set=biorisk&metadataPrefix=oai_dc',
 'http://compcytogen.pensoft.net/oai.php?verb=ListRecords&set=compcytogen&metadataPrefix=oai_dc',
 'http://ijm.pensoft.net/oai.php?verb=ListRecords&set=ijm&metadataPrefix=oai_dc',
 'http://jhr.pensoft.net/oai.php?verb=ListRecords&set=jhr&metadataPrefix=oai_dc',
 'http://natureconservation.pensoft.net/oai.php?verb=ListRecords&set=natureconservation&metadataPrefix=oai_dc',
 'http://neobiota.pensoft.net/oai.php?verb=ListRecords&set=neobiota&metadataPrefix=oai_dc',
 'http://subtbiol.pensoft.net/oai.php?verb=ListRecords&set=subtbiol&metadataPrefix=oai_dc',
 'http://dez.pensoft.net/oai.php?verb=ListRecords&set=dez&metadataPrefix=oai_dc',
 'http://zse.pensoft.net/oai.php?verb=ListRecords&set=zse&metadataPrefix=oai_dc']

In [85]:
# Grab the text of every <strong> that is a direct child of a <p>.
# NOTE(review): this selector is not scoped to the journal list, so any other
# bold paragraph text on the page would also be returned — confirm the result
# contains journal names only.
JOURNAL_NAME_XPATH = '//p/strong/text()'
journal_names = parsed.xpath(JOURNAL_NAME_XPATH)

In [86]:
# Display the scraped journal names for a visual sanity check.
journal_names


Out[86]:
['Biodiversity Data Journal',
 'ZooKeys',
 'PhytoKeys',
 'MycoKeys',
 'BioRisk',
 'Comparative Cytogenetics',
 'International Journal of Myriapodology',
 'Journal of Hymenoptera Research',
 'Nature Conservation',
 'NeoBiota',
 'Subterranean Biology',
 'Deutsche Entomologische Zeitschrift',
 'Zoosystematics and Evolution']

In [87]:
# Build one dict per journal with three keys:
#   'source'  - journal name taken from journal_names
#   'baseurl' - the OAI URL up to and including the '?' that starts the query
#   'set'     - value of the URL's `set` query parameter
pensoft_journals = []

# Match from the scheme up to the FIRST '?'. A greedy '.*' would run to the
# last '?' and pull query text into the base URL if a parameter value ever
# contained one.
OAI_BASE_REGEX = re.compile(r'https?://[^?]*\?')

# Names and URLs come from two independent XPath queries and are paired purely
# by position, so a length mismatch means the pairing cannot be trusted.
if len(journal_names) != len(oai_pmh_uris):
    raise ValueError(
        'cannot pair journals with OAI URLs: {} names vs {} URLs'.format(
            len(journal_names), len(oai_pmh_uris)
        )
    )

for source, url in zip(journal_names, oai_pmh_uris):
    base_match = OAI_BASE_REGEX.search(url)
    if base_match is None:
        # Without this guard the failure would be an opaque AttributeError on None.
        raise ValueError('URL has no query string to split a base URL from: {!r}'.format(url))

    pensoft_journals.append({
        'source': source,
        'baseurl': base_match.group(),
        'set': furl.furl(url).args['set'],
    })

In [88]:
# Display the assembled per-journal dicts ('source', 'baseurl', 'set').
pensoft_journals


Out[88]:
[{'baseurl': 'http://bdj.pensoft.net/oai.php?',
  'set': 'bdj',
  'source': 'Biodiversity Data Journal'},
 {'baseurl': 'http://zookeys.pensoft.net/oai.php?',
  'set': 'zookeys',
  'source': 'ZooKeys'},
 {'baseurl': 'http://phytokeys.pensoft.net/oai.php?',
  'set': 'phytokeys',
  'source': 'PhytoKeys'},
 {'baseurl': 'http://mycokeys.pensoft.net/oai.php?',
  'set': 'mycokeys',
  'source': 'MycoKeys'},
 {'baseurl': 'http://biorisk.pensoft.net/oai.php?',
  'set': 'biorisk',
  'source': 'BioRisk'},
 {'baseurl': 'http://compcytogen.pensoft.net/oai.php?',
  'set': 'compcytogen',
  'source': 'Comparative Cytogenetics'},
 {'baseurl': 'http://ijm.pensoft.net/oai.php?',
  'set': 'ijm',
  'source': 'International Journal of Myriapodology'},
 {'baseurl': 'http://jhr.pensoft.net/oai.php?',
  'set': 'jhr',
  'source': 'Journal of Hymenoptera Research'},
 {'baseurl': 'http://natureconservation.pensoft.net/oai.php?',
  'set': 'natureconservation',
  'source': 'Nature Conservation'},
 {'baseurl': 'http://neobiota.pensoft.net/oai.php?',
  'set': 'neobiota',
  'source': 'NeoBiota'},
 {'baseurl': 'http://subtbiol.pensoft.net/oai.php?',
  'set': 'subtbiol',
  'source': 'Subterranean Biology'},
 {'baseurl': 'http://dez.pensoft.net/oai.php?',
  'set': 'dez',
  'source': 'Deutsche Entomologische Zeitschrift'},
 {'baseurl': 'http://zse.pensoft.net/oai.php?',
  'set': 'zse',
  'source': 'Zoosystematics and Evolution'}]

In [ ]: