In [7]:
#from os import path
import re
import requests
from bs4 import BeautifulSoup

In [8]:
# publication year to check
year = 2011
#year = 2010

In [9]:
if year == 2011:
    start_page = "http://fermi.gsfc.nasa.gov/ssc/library/pubs/articles.html"
elif year == 2010:
    start_page = "http://fermi.gsfc.nasa.gov/ssc/library/pubs/2010"
regex = re.compile(r'.*/')
match = regex.match(start_page)
if year == 2011: 
    base = match.group(0)
elif year = 2010:
    base = start_page + '/'
print(base)


http://fermi.gsfc.nasa.gov/ssc/library/pubs/

In [10]:
response = requests.get(start_page)

#print(response.text)

soup = BeautifulSoup(response.text, 'lxml')

In [16]:
if year == 2011:
    page_pattern = r'2011/'
elif year == 2010:
    page_pattern = r'2010-'
sgr_pattern = r'.*sgr\sa.*'
sgr_regex = re.compile(sgr_pattern, re.IGNORECASE)

# for some reason this doesn't seem to match galactic centre ?
gc_pattern = r'.*galactic\scent.*'

gc_regex = re.compile(gc_pattern, re.IGNORECASE)
ref_regex = re.compile(r'.*href="(.+?)".*')

find_ref = False
for link in soup.find_all('a'):
    path_str = link.get('href')
    if re.match(page_pattern, path_str):
        link_url = base + path_str
        page = requests.get(link_url)
        #print(link_url)
        for line in (page.text).split('\n'):
            if sgr_regex.match(line) or gc_regex.match(line):
                print(line)
                find_ref = True
            if find_ref and ref_regex.match(line):
                match = ref_regex.search(line)
                print(match.group(1))
                find_ref = False


bubbles are the remnants of a large-scale wide-angle outflow from Sgr A<sup>*</sup>, the
but bright accretion event on to Sgr A<sup>*</sup> if it happened concurrently with the
into stars, while the rest accreted onto Sgr A<sup>*</sup>. One interpretation of this is a
reduced star formation efficiency inside the Sgr A<sup>*</sup> accretion disc due to
http://arXiv.org/abs/1104.5443
50&deg; above and below the galactic centre, which presumably originated in
http://arXiv.org/abs/1104.3585
</div>
http://arXiv.org/abs/1103.4545
</div>
http://arXiv.org/abs/1103.2128

In [12]:
tst_string = 'gyih frenk go'
if re.match(r'.*frenk.*',tst_string):
    print('yo')


yo

In [13]:
#next_link = soup.find('a', id='next-link').get('href')