In [10]:
import slate
import urllib
import re
In [3]:
furl = urllib.urlopen('http://www.nature.com/ismej/journal/v10/n1/pdf/ismej2015100a.pdf')
with open( '/tmp/asdfasdfasdf.pdf', 'wb' ) as ftempfile :  # write the PDF in binary mode
    ftempfile.write( furl.read() )
with open( '/tmp/asdfasdfasdf.pdf', 'rb' ) as f :
    doc = slate.PDF(f)
In this example, we are looking for a link to some source code:
http://prodege.jgi-psf.org//downloads/src
However, in the PDF the URL is line-wrapped, so the trailing src is lost.
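For reference, here is a toy illustration of the failure mode (contrived text, not the actual slate output): the [^\s]+ part of the regex stops matching at the first whitespace character, so anything after the wrap point is dropped.
# Contrived example of a line-wrapped URL (not the real slate extraction):
# the regex stops at the first whitespace character, so the tail ('src')
# of the URL is dropped.
wrapped_text = 'source code at http://prodege.jgi-psf.org//downloads/\nsrc for details'
urlre = re.compile( '(?P<url>https?://[^\s]+)' )
print urlre.findall( wrapped_text )    # ['http://prodege.jgi-psf.org//downloads/']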
In [20]:
urlre = re.compile( '(?P<url>https?://[^\s]+)' )
for page in doc :
    print urlre.findall( page )
In [19]:
urlre = re.compile( '(?P<url>https?://[^\s]+)' )
for page in doc :
    print urlre.findall( page.replace('\n','') )
At this point, the author elects to flip a table.
Let's try looking at the HTML version. I'll swipe some code from Dive into Python here, because finding URLs in an HTML document is what is known as a "Solved Problem."
In [1]:
from sgmllib import SGMLParser

class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k=='href']
        if href:
            self.urls.extend(href)

def get_urls_from(url):
    import urllib
    usock = urllib.urlopen(url)
    parser = URLLister()
    parser.feed(usock.read())
    usock.close()
    parser.close()
    url_list = [item for item in parser.urls if item.startswith(('http', 'ftp', 'www'))]
    return url_list
Here are all the URLs in the document...
In [3]:
urls = get_urls_from('http://www.nature.com/ismej/journal/v10/n1/full/ismej2015100a.html')
urls
Out[3]:
Bleh. That is mostly reference links, ads and navigation cruft from the journal's content mismanagement system. Because their system is heinously ad hoc, there is no base URL to filter on, so we're forced to use an ad hoc exclusion list.
In [4]:
excluded = [ 'http://www.nature.com',
             'http://dx.doi.org',
             'http://www.ncbi.nlm.nih.gov',
             'http://creativecommons.org',
             'https://s100.copyright.com',
             'http://mts-isme.nature.com',
             'http://www.isme-microbes.org',
             'http://ad.doubleclick.net',
             'http://mse.force.com',
             'http://links.isiglobalnet2.com',
             'http://www.readcube.com',
             'http://chemport.cas.org',
             'http://publicationethics.org/',
             'http://www.natureasia.com/'
           ]

def novel_url( url ) :
    for excluded_url in excluded :
        if url.startswith( excluded_url ) :
            return False
    return True

filter( novel_url, urls )
Out[4]:
Much better. Now, let's see if these exist...
In [5]:
import requests

for url in filter( novel_url, urls ) :
    request = requests.get( url )
    if request.status_code == 200:
        print 'Good : ', url
    else:
        print 'Fail : ', url
Looks like this will work, though we'll need to make a hand-curated list of excluded URLs. Otherwise, the counts of dead links could be badly skewed by any issues within the journal's content mismanagement system, ad servers and other irrelevant crud.
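One way to keep that hand-curated list manageable is to store it outside the notebook; a minimal sketch, assuming a hypothetical plain-text file named excluded_urls.txt with one URL prefix per line:
# Sketch: load the hand-curated exclusion list from a plain-text file
# (hypothetical 'excluded_urls.txt', one URL prefix per line; blank lines
# and '#' comments are ignored) so it can be edited without touching the code.
def load_excluded( path='excluded_urls.txt' ) :
    with open( path ) as f :
        return [ line.strip() for line in f
                 if line.strip() and not line.startswith('#') ]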
In [8]:
from pyzotero import zotero

api_key = open( 'zotero_api_key.txt' ).read().strip()
library_id = open( 'zotero_api_userID.txt' ).read().strip()
library_type = 'group'
group_id = '405341' # microBE.net group ID

zot = zotero.Zotero(group_id, library_type, api_key)
items = zot.top(limit=5)

# we've retrieved the latest five top-level items in our library
# we can print each item's key and title
for item in items:
    print item['data']['key'], ':', item['data']['title']
So far so good. Let's have a look at the url attribute...
In [47]:
for item in items:
    print item['data']['key'], ':', item['data']['url']
Well, it looks like not all resources have URLs. Let's try looping over some of these and extracting links...
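Before that, one small precaution (a sketch): using dict.get means an item whose 'url' field is missing entirely, rather than just empty, won't raise a KeyError.
# Sketch: count how many of the retrieved items carry a usable URL.
# dict.get returns '' instead of raising a KeyError if the field is absent.
with_url = [ item for item in items if item['data'].get( 'url', '' ).startswith( 'http' ) ]
print len( with_url ), 'of', len( items ), 'items have a URL'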
In [9]:
for item in items:
    paper_url = item['data']['url']
    if paper_url.startswith( 'http' ) :
        link_urls = get_urls_from( paper_url )
        print item['data']['key']
        for url in filter( novel_url, link_urls ) :
            print ' ', url
Clearly, we need to expand the excluded URL list. And we need to match domains, not URLs.
In [22]:
excluded = [ 'nature.com',
'doi.org',
'ncbi.nlm.nih.gov',
'creativecommons.org',
'copyright.com',
'isme-microbes.org',
'doubleclick.net',
'force.com',
'isiglobalnet2.com',
'readcube.com',
'cas.org',
'publicationethics.org',
'natureasia.com',
'uq.edu.au',
'edx.org',
'facebook.com',
'instagram.com',
'youtube.com',
'flickr.com',
'twitter.com',
'go8.edu.au',
'google.com',
'vimeo.com',
'peerj.com',
'mendeley.com',
'cloudfront.net',
'webofknowledge.com',
'sciencedirect.com',
'aol.com',
'pinterest.com',
'scopus.com',
'live.com',
'exlibrisgroup.com',
'usyd.edu.au',
'academicanalytics.com',
'microbiomedigest.com',
'ask.com',
'sogou.com',
'ou.com',
'du.edu',
'ru.nl',
'freshdesk.com',
'caltech.edu',
'traackr.com',
'adobe.com',
'linkedin.com',
'feedly.com',
'google.co.uk',
'glgoo.org',
'library.wisc.edu',
'lib.fsu.edu',
'library.illinois.edu',
'exchange.ou.edu',
'lib.noaa.gov',
'innocentive.com',
'sfx.kcl.ac.uk',
'sfx.unimi.it',
'lib.utexas.edu',
'orcid.org',
]
def novel_url( url ) :
    for excluded_url in excluded :
        if excluded_url in url :
            return False
    return True
This excluded list is getting sloppy as the author slowly lapses into a vegetative state, but we'll push on anyway.
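A slightly less sloppy approach (a sketch, not what's used below) would be to parse out the hostname with urlparse and match it against the excluded domains, instead of substring-matching the whole URL:
from urlparse import urlparse

# Sketch: match on the parsed hostname rather than a substring of the whole
# URL, so an excluded domain appearing in a path or query string doesn't
# cause a false exclusion.
def novel_url_by_domain( url ) :
    hostname = urlparse( url ).netloc.lower()
    for domain in excluded :
        if hostname == domain or hostname.endswith( '.' + domain ) :
            return False
    return True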
In [23]:
for item in items:
    paper_url = item['data']['url']
    if paper_url.startswith( 'http' ) :
        try :
            link_urls = get_urls_from( paper_url )
            print item['data']['key']
            for url in list(set(filter( novel_url, link_urls ))) :
                print ' ', url
        except IOError :
            print item['data']['key'], 'FAILED'
Some journals aggressively ban and throttle IPs, so this process gets slow and awful, but it works. Let's check these for dead links...
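To be a little gentler on those servers (a sketch; the one-second delay and ten-second timeout are arbitrary choices), we could pause between requests and give up on connections that hang:
import time

# Sketch: sleep between requests so we don't hammer any one server, and pass
# a timeout to requests.get so a hanging connection doesn't stall the loop.
def polite_get( url, delay=1.0, timeout=10 ) :
    time.sleep( delay )
    return requests.get( url, timeout=timeout )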
In [25]:
for item in items:
    paper_url = item['data']['url']
    if paper_url.startswith( 'http' ) :
        try :
            link_urls = get_urls_from( paper_url )
            print item['data']['key']
            for url in list(set(filter( novel_url, link_urls ))) :
                request = requests.get( url )
                if request.status_code == 200:
                    print ' Good : ', url
                else:
                    print ' Fail : ', url
        except IOError :
            print item['data']['key'], 'FAILED'
I guess that'll do for a proof of concept.
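For what it's worth, the whole pipeline boils down to something like this (a sketch stitched together from the cells above, not tested as a single function):
# Sketch: the steps above rolled into one function. Returns a dict mapping
# each outbound link on a paper's landing page to its HTTP status code,
# or an empty dict if the landing page itself can't be fetched.
def check_paper_links( item ) :
    paper_url = item['data'].get( 'url', '' )
    if not paper_url.startswith( 'http' ) :
        return {}
    try :
        link_urls = get_urls_from( paper_url )
    except IOError :
        return {}
    return dict( ( url, requests.get( url ).status_code )
                 for url in set( filter( novel_url, link_urls ) ) )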