In [1]:
import os

In [2]:
import pickle

In [25]:
import textract

In [43]:
import ipyparallel as ipp
from ipyparallel import depend, require, dependent

In [ ]:
# /home/immersinn/.virtualenvs/cia_lib/bin/ipcluster start --n=8

In [4]:
fd = os.path.abspath(os.path.join('..'))

In [22]:
raw_data_path = os.path.join(fd, "data/raw")
int_data_path = os.path.join(fd, "data/interim")
pdf_docs_path = os.path.join(raw_data_path, "presidents-daily-brief-1969-1977_pdfs")
int_docs_path = os.path.join(int_data_path, "presidents-daily-brief-1969-1977_dats")

In [19]:
file_names = [fn for fn in os.listdir(pdf_docs_path) if fn.endswith(".pdf")]

In [28]:
len(file_names)


Out[28]:
2506

In [82]:
rc = ipp.Client()
print(rc[:])
lv = rc[:]


<DirectView [0, 1, 2, 3,...]>

In [88]:
def process_and_save(fn):
    text = textract.process(os.path.join(pdf_docs_path, fn), method="tesseract", language="eng")
    dn = fn.strip(".pdf")
    with open(os.path.join(int_docs_path, dn + '.bin'), 'wb') as f1:
        pickle.dump(text, f1)
    return(1)

In [90]:
lv.push(dict(raw_data_path = raw_data_path,
             int_data_path = int_data_path,
             pdf_docs_path = pdf_docs_path,
             int_docs_path = int_docs_path,
             process_and_save = process_and_save));

In [91]:
with lv.sync_imports():
    import os
    import pickle
    import textract


importing os on engine(s)
importing pickle on engine(s)
importing textract on engine(s)

In [92]:
%%time

results = lv.map_sync(lambda x: process_and_save(x), file_names)


CPU times: user 40.5 s, sys: 2min 55s, total: 3min 35s
Wall time: 4h 31min 16s

Test Load


In [98]:
doc_id = "0005976599"

In [99]:
with open(os.path.join(int_docs_path, "DOC_" + doc_id + '.bin'), 'rb') as f1:
    text = f1.read()

In [100]:
text[:1500]


Out[100]:
b'\x80\x03B\xe6P\x00\x00Dec\xe2\x80\x98assmed In Pan - Sanlllled Copy Approved for Release 2016/04/27 : CIA-RDP79TDDBSSADDS7Q00280001-3\n/ K / \\\n94/\n9/\n\n \n\n \n\nT/oe sz\'dmt\xe2\x80\x99y Daily Brief\n\n4 February 1969\n\n18\n\n50x1\n2\n\nDec\xe2\x80\x98assmed In Pal\xe2\x80\x9c \xc2\xbb Sanmxed Copy Approved for Release 2018/04/27 V C|A-RDP79TUUQSSADDS7OOZSOOO1-3\n\nDeclass\xe2\x80\x98lied in Pan - sanitized Copy Approved for Release 2016/04/27 : CIA-RDP79T00936A006700280001-3 _\n\n \n\n:2 /7\n\n@252\xe2\x80\x9d:\n\n \n\n   \n   \n    \n  \n\n \n\nLEBANON\n\nEevul\n\nDxmascus\n\nMediterranean Sea\n\n__, /\namp\n\n/Ymmuk n\n\n   \n  \n \n  \n\n    \n \n   \n   \n      \n \n   \n\nISRAEL r\\\n[/3\nW wssr\n"\xe2\x80\x9c. \xe2\x80\x98 \\amx \xc2\xa7 .n\n_J,_ Am.\xe2\x80\x9d .~ 732-\nEr\nJew am.\xe2\x80\x9c\nand\nmu my; / 5"\n\xe2\x80\x9c3.534 msm\n.2\n\nJORDAN\n\nSINAI\nPENINSULA\n\nqg, ~25-\n\nARABIA\n\n\'Shan-n =an\nShiykh\n\nHugh-a: \\\n\n \n\n \n\n \n\n \n\nmu m\n\nDeclassmed In Part - sanitized Copy Approved for Release 2016/04/27 V CIA-RDP79T00936A006700250001-3\n\n \n\nW\n\na H,\xe2\x80\x99\n\n.. c.r\xe2\x80\x98,.ec\n\nDec\xe2\x80\x98assi\xef\xac\x81ed in Pan - sanitized Cnpy Approved (or Release 2016/04/27 : CIA-RDP79T00936A006700280001-3\n\nFOR THE PRESIDENT ONLY\n\n5 February 1969\n\nLATE NOTES FOR THE PRESIDENT\'S DAILY BRIEF 0F\nyFEBRUARY 1969\n\nI. MAJOR PROBLEMS\n\nMIDDLE EAST\n\nThere is nothing of significance to reportl\n\nEUROPE\n\nThere is nothing of significance to report.\n\nSOVIET AFFAIRS\n\nThere is nothing of significance to rEport.\n\nVIETNAM\n\nVice President Ky has told Ambassador Lodge that he in-\ntends to return to Saigon on Saturday, 8 February. His pur-\npose is to put more pressure on president Thieu to reorgan\xe2\x80\x94\nize the South Vietnamese go'

In [101]:
str(text).lower().find('late notes')


Out[101]:
1318

In [102]:
str(text)[1300:1500]


Out[102]:
" February 1969\\n\\nLATE NOTES FOR THE PRESIDENT\\'S DAILY BRIEF 0F\\nyFEBRUARY 1969\\n\\nI. MAJOR PROBLEMS\\n\\nMIDDLE EAST\\n\\nThere is nothing of significance to reportl\\n\\nEUROPE\\n\\nThere is nothing of sig"

In [ ]: