In [1]:
import sys

In [2]:
# Make the project's local modules (utils, mysql_utils) importable.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable or relative to the notebook so other checkouts can run this.
SRC_DIR = '/home/immersinn/gits/cia_library/src/'
if SRC_DIR not in sys.path:  # guard: re-running the cell must not stack duplicates
    sys.path.append(SRC_DIR)

In [3]:
import utils
import mysql_utils

In [4]:
import mysql.connector
from mysql.connector.cursor import MySQLCursor
from mysql.connector.errors import IntegrityError

In [5]:
import requests
from bs4 import BeautifulSoup as bs

Pull Doc Info from MySQL


In [8]:
# Load one record per document, requesting only the doc_id field.
# NOTE(review): limit=0 evidently does not mean "zero rows" (2507 records come
# back below); presumably it disables the limit -- confirm in mysql_utils.
docs = mysql_utils.docinfoFromMySQL(
    fields=['doc_id'],
    limit=0,
)

In [9]:
# Sanity check: how many document records the query returned.
len(docs)


Out[9]:
2507

In [10]:
# Eyeball the first five records to confirm the row layout
# (each one is a dict with a single 'doc_id' key in the output below).
for record in docs[:5]:
    print(record)


{'doc_id': '0005976614'}
{'doc_id': '0005976616'}
{'doc_id': '0005976618'}
{'doc_id': '0005976620'}
{'doc_id': '0005976623'}

Pull PDFs from Site


In [6]:
# Base URL under which the reading-room PDFs are served; build_pdf_url()
# appends "DOC_<doc_id>.pdf" to it, so the trailing slash is required.
pdf_root_url = "https://www.cia.gov/library/readingroom/docs/"

In [30]:
def build_pdf_url(doc_id, root_url=None):
    """Return the download URL of the PDF for a single document.

    Parameters
    ----------
    doc_id : str
        Document identifier, used verbatim. It must be a string: the ids seen
        in this notebook carry leading zeros, and plain concatenation is used.
    root_url : str, optional
        Base URL to prepend. When omitted, the module-level ``pdf_root_url``
        is looked up at call time, which is the original behaviour.

    Returns
    -------
    str
        ``<root_url>DOC_<doc_id>.pdf``

    Raises
    ------
    TypeError
        If ``doc_id`` (or ``root_url``) is not a string.
    """
    if root_url is None:
        # Resolved lazily so a later re-assignment of the global is honoured.
        root_url = pdf_root_url
    return root_url + "DOC_" + doc_id + '.pdf'

In [31]:
# One PDF URL per document record, in the same order as `docs`.
pdf_urls = list(map(lambda record: build_pdf_url(record['doc_id']), docs))

In [32]:
# Spot-check the first five generated URLs.
pdf_urls[:5]


Out[32]:
['https://www.cia.gov/library/readingroom/docs/DOC_0005976614.pdf',
 'https://www.cia.gov/library/readingroom/docs/DOC_0005976616.pdf',
 'https://www.cia.gov/library/readingroom/docs/DOC_0005976618.pdf',
 'https://www.cia.gov/library/readingroom/docs/DOC_0005976620.pdf',
 'https://www.cia.gov/library/readingroom/docs/DOC_0005976623.pdf']

In [33]:
# Direct (non-proxied) download of the first PDF.
# - timeout: requests waits indefinitely by default; 30 s is an arbitrary cap.
# - raise_for_status: fail loudly on 4xx/5xx instead of storing an error page
#   in `pdf`.
# - try/finally: the response is released even if the status check raises.
req = requests.get(pdf_urls[0], timeout=30)
try:
    req.raise_for_status()
    pdf = req.content  # raw bytes of the response body
finally:
    req.close()

In [34]:
# Peek at the first 100 bytes; a real PDF starts with the b'%PDF-' signature.
pdf[:100]


Out[34]:
b'%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<</DecodeParms<</Rows 4204/K -1/Columns 2528>>/Type/XObject/ColorSpace/Device'

In [38]:
from urllib import request
import socks
from sockshandler import SocksiPyHandler

In [39]:
# urllib opener whose traffic is routed through a SOCKS5 proxy on localhost.
# NOTE(review): port 9050 is Tor's default SOCKS port, so this presumably
# targets a local Tor daemon -- confirm.
socks_handler = SocksiPyHandler(socks.SOCKS5, "127.0.0.1", 9050)
opener = request.build_opener(socks_handler)

In [40]:
# Same download as the direct fetch, but through the SOCKS-proxied opener.
# The context manager closes the response (the original left it open), and the
# timeout keeps a stalled proxy from hanging the kernel; 30 s is arbitrary.
with opener.open(pdf_urls[0], timeout=30) as response:
    pdf = response.read()

In [41]:
# Confirm the proxied download also begins with the b'%PDF-' signature.
pdf[:100]


Out[41]:
b'%PDF-1.4\n%\xe2\xe3\xcf\xd3\n1 0 obj\n<</DecodeParms<</Rows 4204/K -1/Columns 2528>>/Type/XObject/ColorSpace/Device'

In [48]:
# NOTE(review): presumably the number of proxied openers to rotate across;
# in the visible cells it only feeds the scratch division that follows. Confirm.
n_openers = 4

In [53]:
# Scratch: floor division of an index by n_openers (evaluates to 0 here).
1 // n_openers


Out[53]:
0

In [46]:
# Scratch: another floor-division check with literals (also 0).
2 // 5


Out[46]:
0

In [54]:
import itertools

In [55]:
# Scratch: a bare reference only echoes the class; it has no other effect.
itertools.cycle


Out[55]:
itertools.cycle

In [56]:
# Endless iterator repeating 1, 2, 3. It is stateful: every cell that consumes
# it advances it, so re-run this cell to start again from 1.
cy = itertools.cycle([1,2,3])

In [57]:
# Eight-character sample sequence to pair against the cycle.
let = 'abcdefgh'

In [58]:
# Round-robin demo: zip stops when `let` runs out, so each letter gets paired
# with the next value drawn from the repeating cycle. The number and the letter
# are printed on separate lines.
for pair in zip(cy, let):
    print(*pair, sep='\n')


1
a
2
b
3
c
1
d
2
e
3
f
1
g
2
h

In [59]:
from importlib import reload

In [62]:
# Re-import utils so edits to the module are picked up without a kernel restart.
reload(utils)


Out[62]:
<module 'utils' from '/home/immersinn/gits/cia_library/src/utils.py'>

In [63]:
# Hand the downloaded bytes and the first document's id to utils.writePDF.
# NOTE(review): presumably this writes the PDF to disk; the destination and
# file name are decided inside utils and are not visible here -- confirm.
first_doc_id = docs[0]['doc_id']
utils.writePDF(pdf, first_doc_id)

In [64]:
# Re-import mysql_utils so edits to the module are picked up without a kernel restart.
reload(mysql_utils)


Out[64]:
<module 'mysql_utils' from '/home/immersinn/gits/cia_library/src/mysql_utils.py'>

In [ ]: