In [12]:
import os
import time
import pickle
In [10]:
import requests
from bs4 import BeautifulSoup as bs
In [11]:
# project root: one directory up from where this notebook runs
fd = os.path.abspath('..')
In [13]:
raw_data_path = os.path.join(fd, "data", "raw")
int_data_path = os.path.join(fd, "data", "interim")
In [3]:
urls = ["https://www.cia.gov/library/readingroom/collection/presidents-daily-brief-1969-1977?page=1",
"https://www.cia.gov/library/readingroom/collection/presidents-daily-brief-1969-1977?page=99",
"https://www.cia.gov/library/readingroom/collection/presidents-daily-brief-1969-1977?page=126"]
In [8]:
pages = []
for url in urls:
    page = requests.get(url)
    print(page.status_code)     # expect 200 on success
    pages.append(page.content)
    time.sleep(1)               # pause between requests to be polite to the server
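The bare loop above works for three pages but gives no recovery on a failed request. For a longer crawl, a small fetch helper with a status check and retry is a safer pattern; a sketch, with illustrative retry settings:
In [ ]:
def fetch(url, retries=3, pause=1.0):
    """GET a URL, retrying on non-200 responses; returns page bytes or None."""
    for attempt in range(retries):
        resp = requests.get(url)
        if resp.status_code == 200:
            return resp.content
        time.sleep(pause * (attempt + 1))   # back off a little more on each retry
    return None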
In [14]:
# cache the raw HTML under data/interim so the pages need not be re-fetched
with open(os.path.join(int_data_path, "rrPages.pkl"), 'wb') as f1:
    pickle.dump(pages, f1)
In [19]:
with open(os.path.join(int_data_path, "rrPages.pkl"), 'rb') as f1:
    pages = pickle.load(f1)
In [20]:
len(pages)
Out[20]:
3
In [21]:
pages[0][:100]
Out[21]:
In [22]:
soups = [bs(page, 'html.parser') for page in pages]
In [23]:
soup = soups[0]
In [35]:
contents = soup.body.find_all('div', {'class' : "views-row"})
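Each search result on the listing page sits in a div with class views-row; the same selection can be written with a CSS selector, which is equivalent here since the rows only appear inside the body:
In [ ]:
contents = soup.select("div.views-row")   # CSS-selector form of the find_all above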
In [36]:
len(contents)
Out[36]:
In [38]:
contents[0]
Out[38]:
In [39]:
contents[1]
Out[39]:
In [56]:
def parse_content_row(content_element_soup):
    """Pull document metadata out of a single 'views-row' result div."""
    doc_page_match = ('h4', {'class': "field-content"})
    doc_pdf_match = ('tr', {'class': "odd"})
    doc_pages_match = ('div', {'class': "field-content"})
    doc_page_info = content_element_soup.find(doc_page_match[0], doc_page_match[1])
    doc_pdf_info = content_element_soup.find(doc_pdf_match[0], doc_pdf_match[1])
    doc_pages_info = content_element_soup.find(doc_pages_match[0], doc_pages_match[1])
    title = doc_page_info.text.strip()
    doc_page_url = doc_page_info.find('a')['href']
    doc_id = doc_page_url.split("/")[-1].strip()   # document id is the last URL segment
    doc_pdf_url = doc_pdf_info.find('a')['href']
    pages = doc_pages_info.text.strip()
    return {"doc_id": doc_id,
            "title": title,
            "info_url": doc_page_url,
            "pdf_url": doc_pdf_url,
            "n_pages": pages}
In [57]:
info = [parse_content_row(c) for c in contents]
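parse_content_row handles one result row at a time; to index every fetched listing page at once, flatten across all three soups. A sketch, assuming each listing page uses the same markup:
In [ ]:
all_info = [parse_content_row(c)
            for s in soups
            for c in s.body.find_all('div', {'class': "views-row"})]
len(all_info)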
In [66]:
info[10]
Out[66]:
In [59]:
import sys
In [60]:
sys.path.append(os.path.join(fd, "src"))
In [61]:
import library_page_process
In [63]:
di = library_page_process.extract_docs_from_page(soups[0])
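The src module itself is not shown in this notebook. Based on the cells above, extract_docs_from_page presumably wraps the row selection and parse_content_row into one call; a minimal sketch (the real implementation in src/library_page_process.py may differ):
In [ ]:
# illustrative only; see src/library_page_process.py for the actual code
def extract_docs_from_page(page_soup):
    rows = page_soup.body.find_all('div', {'class': "views-row"})
    return [parse_content_row(row) for row in rows]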