In [12]:
import os
import time
import pickle

In [10]:
import requests
from bs4 import BeautifulSoup as bs

In [11]:
# Project root: the parent of the current working directory, as an absolute path.
# (os.path.join with a single argument returns it unchanged, so it is dropped.)
fd = os.path.abspath('..')

In [13]:
# Data directories under the project root. Components are passed separately
# so os.path.join inserts the platform's own separator.
raw_data_path = os.path.join(fd, "data", "raw")
int_data_path = os.path.join(fd, "data", "interim")

Main Pages


In [3]:
# Listing pages of the reading-room collection to fetch. The pages differ only
# in the `page` query parameter, so the base URL is written once.
collection_url = "https://www.cia.gov/library/readingroom/collection/presidents-daily-brief-1969-1977"
sample_page_numbers = (1, 99, 126)
urls = ["{}?page={}".format(collection_url, n) for n in sample_page_numbers]

In [8]:
# Download each listing page and keep the raw response bytes for caching.
pages = []
for url in urls:
    # requests applies no timeout by default; without one a stalled
    # connection would hang this cell indefinitely.
    response = requests.get(url, timeout=30)
    print(response.status_code)
    # Stop here on an HTTP error instead of caching an error page as data.
    response.raise_for_status()
    pages.append(response.content)
    time.sleep(1)  # wait one second between requests


200
200
200

In [14]:
# Cache the downloaded page bytes on disk so they can be reloaded without
# hitting the site again.
# int_data_path is already absolute (it is built from fd), so fd is not joined
# a second time.
os.makedirs(int_data_path, exist_ok=True)  # open() below fails if the directory is missing
with open(os.path.join(int_data_path, "rrPages.pkl"), 'wb') as f1:
    pickle.dump(pages, f1)

Investigate


In [19]:
# Reload the cached pages. pickle.load can execute arbitrary code, so it should
# only be used on files this notebook wrote itself, as is the case here.
# int_data_path is already absolute (it is built from fd), so fd is not joined
# a second time.
with open(os.path.join(int_data_path, "rrPages.pkl"), 'rb') as f1:
    pages = pickle.load(f1)

In [20]:
# Sanity check: one cached page per requested URL is expected.
len(pages)


Out[20]:
3

In [21]:
# Peek at the first 100 bytes of the first page to see what was downloaded.
pages[0][:100]


Out[21]:
b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN"\n  "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.'

In [22]:
# Parse every cached page with BeautifulSoup's built-in 'html.parser' backend.
soups = [bs(page, 'html.parser') for page in pages]

In [23]:
# Use the first listing page while exploring the markup.
soup = soups[0]

In [35]:
# Collect every <div> in the page body whose class list includes "views-row";
# each one holds the listing entry for a single document.
contents = soup.body.find_all('div', class_="views-row")

In [36]:
# Number of document rows found on this listing page.
len(contents)


Out[36]:
20

In [38]:
# Show the raw markup of the first row to identify the fields worth extracting.
contents[0]


Out[38]:
<div class="views-row views-row-1 views-row-odd views-row-first">
<div> <p class="docType"><span></span></p> </div>
<div class="views-field views-field-title"> <h4 class="field-content"><a href="/library/readingroom/document/0005976614">THE PRESIDENT'S DAILY BRIEF 12 FEBRUARY 1969</a></h4> </div>
<div class="views-field views-field-field-document-number"> <span class="views-label views-label-field-taxonomy-doc-type">Document Number: </span> <span class="field-content">0005976614</span> </div>
<div class="views-field views-field-field-page-count"> <span class="views-label views-label-field-page-count">Pages: </span> <div class="field-content">23</div> </div>
<div class="views-field views-field-field-file-1"> <div class="field-content"><table class="sticky-enabled">
<thead><tr><th>Attachment</th><th>Size</th> </tr></thead>
<tbody>
<tr class="odd"><td><span class="file"><img alt="PDF icon" class="file-icon" src="/library/readingroom/modules/file/icons/application-pdf.png" title="application/pdf"/> <a href="https://www.cia.gov/library/readingroom/docs/DOC_0005976614.pdf" type="application/pdf; length=576780">DOC_0005976614.pdf</a></span></td><td>563.26 KB</td> </tr>
</tbody>
</table>
</div> </div>
<div class="views-field views-field-edit-node"> <span class="field-content"></span> </div>
<div class="views-field views-field-field-files-s3-2"> <div class="field-content"></div> </div> </div>

In [39]:
# Show a second row to compare its structure with the first.
contents[1]


Out[39]:
<div class="views-row views-row-2 views-row-even">
<div> <p class="docType"><span></span></p> </div>
<div class="views-field views-field-title"> <h4 class="field-content"><a href="/library/readingroom/document/0005976616">THE PRESIDENT'S DAILY BRIEF 13 FEBRUARY 1969</a></h4> </div>
<div class="views-field views-field-field-document-number"> <span class="views-label views-label-field-taxonomy-doc-type">Document Number: </span> <span class="field-content">0005976616</span> </div>
<div class="views-field views-field-field-page-count"> <span class="views-label views-label-field-page-count">Pages: </span> <div class="field-content">22</div> </div>
<div class="views-field views-field-field-file-1"> <div class="field-content"><table class="sticky-enabled">
<thead><tr><th>Attachment</th><th>Size</th> </tr></thead>
<tbody>
<tr class="odd"><td><span class="file"><img alt="PDF icon" class="file-icon" src="/library/readingroom/modules/file/icons/application-pdf.png" title="application/pdf"/> <a href="https://www.cia.gov/library/readingroom/docs/DOC_0005976616.pdf" type="application/pdf; length=681127">DOC_0005976616.pdf</a></span></td><td>665.16 KB</td> </tr>
</tbody>
</table>
</div> </div>
<div class="views-field views-field-edit-node"> <span class="field-content"></span> </div>
<div class="views-field views-field-field-files-s3-2"> <div class="field-content"></div> </div> </div>

In [56]:
def _first_link_href(element):
    """Return the href of the first <a> inside element, or None if unavailable."""
    if element is None:
        return None
    link = element.find('a')
    if link is None:
        return None
    return link.get('href')


def parse_content_row(content_element_soup):
    """Extract document metadata from one "views-row" element of a listing page.

    Args:
        content_element_soup: Parsed element for a single listing row, e.g. one
            item of ``soup.body.find_all('div', {'class': "views-row"})``.

    Returns:
        dict with the keys below. Each value is a string, or None when the
        corresponding markup is missing from the row (previously such rows
        raised AttributeError/TypeError):
            doc_id:   last path segment of the document page URL
            title:    stripped text of the title heading
            info_url: href of the document page link, exactly as written in
                      the page (site-relative in the sample rows)
            pdf_url:  href of the first link in the first <tr class="odd">
            n_pages:  page count text as displayed (not converted to int)
    """
    title_tag = content_element_soup.find('h4', {'class': "field-content"})
    # In the sample rows the attachment table's first row is <tr class="odd">.
    attachment_row = content_element_soup.find('tr', {'class': "odd"})
    # Relies on the page count being the first <div class="field-content"> in
    # the row; in the sample rows the document number uses a <span> instead.
    page_count_tag = content_element_soup.find('div', {'class': "field-content"})

    title = title_tag.text.strip() if title_tag is not None else None
    doc_page_url = _first_link_href(title_tag)
    if doc_page_url is not None:
        doc_id = doc_page_url.split("/")[-1].strip()
    else:
        doc_id = None
    doc_pdf_url = _first_link_href(attachment_row)
    n_pages = page_count_tag.text.strip() if page_count_tag is not None else None

    return {"doc_id": doc_id,
            "title": title,
            "info_url": doc_page_url,
            "pdf_url": doc_pdf_url,
            "n_pages": n_pages}

In [57]:
# Parse every row of the listing page into a metadata dict.
info = [parse_content_row(c) for c in contents]

In [66]:
# Spot-check one parsed record.
info[10]


Out[66]:
{'doc_id': '0005976634',
 'info_url': '/library/readingroom/document/0005976634',
 'n_pages': '11',
 'pdf_url': 'https://www.cia.gov/library/readingroom/docs/DOC_0005976634.pdf',
 'title': "THE PRESIDENT'S DAILY BRIEF 25 FEBRUARY 1969"}

Test Code


In [59]:
import sys

In [60]:
# Make the project's src/ directory importable. The membership check keeps
# re-running this cell from appending duplicate entries to sys.path.
src_path = os.path.join(fd, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

In [61]:
import library_page_process

In [63]:
# Run the project module's extractor on the first parsed listing page.
# NOTE(review): presumably this packages the row parsing prototyped in
# parse_content_row above — confirm against src/library_page_process.py.
di = library_page_process.extract_docs_from_page(soups[0])

In [ ]: