In [7]:
#from os import path
import re
import requests
from bs4 import BeautifulSoup
In [8]:
# Publication year to check. The cells below branch on this value and only
# handle 2011 and 2010; swap the active assignment to switch years.
year = 2011
#year = 2010
In [9]:
# Pick the listing page for the selected publication year and derive `base`,
# the URL prefix that the relative links found on that page are appended to.
if year == 2011:
    start_page = "http://fermi.gsfc.nasa.gov/ssc/library/pubs/articles.html"
elif year == 2010:
    start_page = "http://fermi.gsfc.nasa.gov/ssc/library/pubs/2010"
else:
    # Fail early with a clear message instead of a NameError further down.
    raise ValueError("unsupported publication year: {!r}".format(year))

# Greedy '.*/' matches everything up to and including the last '/'.
regex = re.compile(r'.*/')
match = regex.match(start_page)
if year == 2011:
    # start_page ends in a file name, so the base is its parent directory.
    base = match.group(0)
elif year == 2010:
    # start_page has no trailing slash here, so add one to use it as the base.
    base = start_page + '/'
print(base)
In [10]:
# Download the listing page for the selected year and parse it.
# The timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(start_page, timeout=30)
# Stop here on an HTTP error rather than silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
In [16]:
# Scan every page linked from the listing page for the selected year and
# print each line mentioning Sgr A or the Galactic Centre/Center, followed by
# the first href found on that line or on a later line of the same page.

# Relative links on the listing page are followed only if they start with
# this prefix.
if year == 2011:
    page_pattern = r'2011/'
elif year == 2010:
    page_pattern = r'2010-'
else:
    # Fail early with a clear message instead of a NameError inside the loop.
    raise ValueError("unsupported publication year: {!r}".format(year))

sgr_pattern = r'.*sgr\sa.*'
sgr_regex = re.compile(sgr_pattern, re.IGNORECASE)
# NOTE(review): the original author observed that this does not seem to match
# "galactic centre". Unconfirmed guess: `\s` matches exactly one whitespace
# character and the text is scanned line by line, so a phrase wrapped across
# lines (or joined by `&nbsp;`) cannot match.
gc_pattern = r'.*galactic\scent.*'
gc_regex = re.compile(gc_pattern, re.IGNORECASE)
# The leading greedy '.*' makes group(1) the last href on the line.
ref_regex = re.compile(r'.*href="(.+?)".*')

find_ref = False
# href=True skips anchors without an href attribute; for those,
# link.get('href') returns None and re.match would raise TypeError.
for link in soup.find_all('a', href=True):
    path_str = link.get('href')
    if not re.match(page_pattern, path_str):
        continue
    link_url = base + path_str
    # The timeout keeps one unresponsive page from hanging the whole scan.
    page = requests.get(link_url, timeout=30)
    # Start each page with no pending match, so a keyword hit that was never
    # followed by an href does not print the first href of an unrelated page.
    find_ref = False
    for line in page.text.split('\n'):
        if sgr_regex.match(line) or gc_regex.match(line):
            print(line)
            find_ref = True
        if find_ref:
            # Checked on the matching line too: the href may share that line.
            match = ref_regex.match(line)
            if match:
                print(match.group(1))
                find_ref = False
In [12]:
# Scratch check: a '.*word.*' pattern used with match() finds the word
# anywhere in a single-line string.
tst_string = 'gyih frenk go'
frenk_hit = re.compile(r'.*frenk.*').match(tst_string)
if frenk_hit is not None:
    print('yo')
In [13]:
#next_link = soup.find('a', id='next-link').get('href')