In [1]:
import sys
In [2]:
# Directory added to the import search path; presumably where the local
# `utils` and `mysql_utils` modules imported below live.
# NOTE(review): machine-specific absolute path — adjust per checkout.
src_dir = '/home/immersinn/gits/cia_library/src/'
# Guard the append so re-running this cell does not pile duplicate entries
# onto sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)
In [3]:
import utils
import mysql_utils
In [4]:
import mysql.connector
from mysql.connector.cursor import MySQLCursor
from mysql.connector.errors import IntegrityError
In [5]:
import requests
from bs4 import BeautifulSoup as bs
In [8]:
# Load document records from MySQL, requesting only the doc_id field.
# NOTE(review): limit=0 presumably means "no limit" — confirm in mysql_utils.
docs = mysql_utils.docinfoFromMySQL(
    limit=0,
    fields=['doc_id'],
)
In [9]:
# Show how many records were loaded.
len(docs)
Out[9]:
In [10]:
# Eyeball the first five records to see what each row looks like.
for doc in docs[:5]:
    print(doc)
In [6]:
# Base URL that build_pdf_url() prefixes to "DOC_<doc_id>.pdf". The trailing
# slash matters: the pieces are joined by plain string concatenation.
pdf_root_url = "https://www.cia.gov/library/readingroom/docs/"
In [30]:
def build_pdf_url(doc_id, root_url=None):
    """Return the download URL for a document's PDF.

    Args:
        doc_id: Document identifier. It is formatted with str(), so
            non-string ids (e.g. ints) no longer raise TypeError.
        root_url: Base URL to prepend. Defaults to the notebook-level
            ``pdf_root_url``. No separator is inserted, so it should end
            with '/'.

    Returns:
        str: ``<root_url>DOC_<doc_id>.pdf``.
    """
    if root_url is None:
        # Looked up at call time so a later change to pdf_root_url is honoured.
        root_url = pdf_root_url
    return "{}DOC_{}.pdf".format(root_url, doc_id)
In [31]:
# One download URL per record, in the same order as `docs`.
pdf_urls = [
    build_pdf_url(record['doc_id'])
    for record in docs
]
In [32]:
# Display the first five generated URLs as a sanity check.
pdf_urls[:5]
Out[32]:
In [33]:
# Fetch the first PDF directly with requests.
# - The timeout keeps the cell from hanging forever on a stalled connection
#   (requests applies no timeout unless one is given).
# - raise_for_status() surfaces HTTP errors instead of silently storing an
#   error page in `pdf`.
# - The with-block closes the connection even if one of the above raises.
with requests.get(pdf_urls[0], timeout=60) as req:
    req.raise_for_status()
    pdf = req.content
In [34]:
# Display the first 100 items of the downloaded content to inspect what came back.
pdf[:100]
Out[34]:
In [38]:
from urllib import request
import socks
from sockshandler import SocksiPyHandler
In [39]:
# urllib opener whose requests are handled by a SOCKS5 handler pointed at
# 127.0.0.1:9050.
# NOTE(review): 9050 is presumably a local Tor client — confirm.
opener = request.build_opener(
    SocksiPyHandler(socks.SOCKS5, "127.0.0.1", 9050)
)
In [40]:
# Fetch the first PDF through the SOCKS opener. The with-block closes the
# response (and its underlying socket) as soon as the body has been read,
# rather than leaving that to garbage collection.
with opener.open(pdf_urls[0]) as resp:
    pdf = resp.read()
In [41]:
# Display the first 100 items of the content fetched through the opener.
pdf[:100]
Out[41]:
In [48]:
# NOTE(review): presumably the number of openers to rotate requests across;
# only used in the scratch arithmetic in this part of the notebook — confirm.
n_openers = 4
In [53]:
# Scratch check of floor division with the opener count.
1 // n_openers
Out[53]:
In [46]:
# Scratch check: floor division of a smaller int by a larger one yields 0.
2 // 5
Out[46]:
In [54]:
import itertools
In [55]:
# Bare expression: only displays the itertools.cycle object; nothing is called.
itertools.cycle
Out[55]:
In [56]:
# Endless iterator yielding 1, 2, 3, 1, 2, 3, ... It is stateful, so it keeps
# its position between cells unless this cell is re-run.
cy = itertools.cycle([1,2,3])
In [57]:
# Eight sample items to pair against the cycling numbers.
let = 'abcdefgh'
In [58]:
# zip() stops at the shorter input, so the endless cycle is cut off once the
# letters run out. Each pair is printed as the number, then the letter, on
# separate lines.
for n, l in zip(cy, let):
    print(n, l, sep='\n')
In [59]:
from importlib import reload
In [62]:
# Re-execute the already-imported utils module so on-disk edits take effect
# without restarting the kernel.
reload(utils)
Out[62]:
In [63]:
# Pass the downloaded content and the first record's doc_id to the project's
# writePDF helper.
# NOTE(review): where and under what name it is written is decided inside
# utils, not visible here.
utils.writePDF(pdf, docs[0]['doc_id'])
In [64]:
# Re-execute the already-imported mysql_utils module so on-disk edits take
# effect without restarting the kernel.
reload(mysql_utils)
Out[64]:
In [ ]: