In [ ]:
import asyncio
import datetime
import os
import sqlite3
import time

import aiohttp
import bs4
import requests
In [ ]:
# EDGAR daily-index and archive base URLs
INDEX_URL = 'https://www.sec.gov/Archives/edgar/daily-index/'
ARCHIVE_URL = 'https://www.sec.gov/Archives/'
# Base delay (seconds) used to throttle requests to the SEC
DELAY = 2
YEARS = list(range(2000, 2018))
QUARTERS = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
HOME = os.path.expanduser('~')
HOOVER_FOLDER = os.path.join(HOME, '.hoover')
# SEC's fair-access policy asks automated clients to identify
# themselves; replace the placeholder with your own contact details
HEADERS = {'User-Agent': 'hoover-notebook your.email@example.com'}
DEBUG = True
if DEBUG:
    # Limit the crawl to a single quarter while testing
    QUARTERS = ['QTR1']
    YEARS = [2015]
In [ ]:
# Yields [('2015', 'QTR1')] in DEBUG mode; a full run yields
# [('2000', 'QTR1'), ('2000', 'QTR2'), ..., ('2017', 'QTR4')]
period_tuples = [
    (str(year), quarter)
    for year in YEARS
    for quarter in QUARTERS
]
# One daily-index URL per (year, quarter) period
quarterly_urls = [
    INDEX_URL + year + '/' + quarter + '/'
    for year, quarter
    in period_tuples
]
# Maps each period tuple, e.g. ('2000', 'QTR1'), to an
# (initially empty) list of index filenames
index_dict = {
    tuple_: []
    for tuple_
    in period_tuples
}
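A quick sanity check of the URL construction never hurts before hitting the network; this cell is optional and not part of the pipeline:
In [ ]:
# Optional: inspect the first few generated daily-index URLs
for url in quarterly_urls[:3]:
    print(url)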
In [ ]:
def populate_index_dict():
    for tuple_, url in zip(period_tuples, quarterly_urls):
        time.sleep(DELAY / 10)
        r = requests.get(url, headers=HEADERS)
        if r.status_code == 404:
            continue
        html = r.text
        soup = bs4.BeautifulSoup(html, 'html.parser')
        # The misspelled 'heding' is the literal summary attribute on
        # the SEC's daily-index listing pages; the typo is theirs
        main_div = soup.find('table', {'summary': 'heding'})
        if main_div is None:
            continue
        for tag in main_div.find_all('a'):
            text = tag['href']
            index_dict[tuple_].append(text)
populate_index_dict()
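The scrape can be spot-checked before moving on. With the DEBUG settings above, index_dict should hold one list of daily index filenames under ('2015', 'QTR1'):
In [ ]:
# Optional: peek at the first few filenames collected per period
for period, filenames in index_dict.items():
    print(period, filenames[:3])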
In [ ]:
def prep_db_data():
    db_data = []
    for key, value in index_dict.items():
        year, quarter = key
        if value:
            for filename in value:
                # Keep only the uncompressed master index files,
                # e.g. 'master.20150102.idx'
                if 'master' not in filename or '.gz' in filename:
                    continue
                link_text = ''.join([
                    INDEX_URL,
                    year,
                    '/',
                    quarter,
                    '/',
                    filename
                ])
                # The middle component of the filename is a
                # YYYYMMDD datestamp
                datestamp = filename.split('.')[1]
                dt = datetime.datetime(
                    int(datestamp[:4]),
                    int(datestamp[4:6]),
                    int(datestamp[6:])
                )
                db_data.append(
                    (int(year), quarter, filename, dt, link_text)
                )
    return db_data
db_data = prep_db_data()
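Each prepared row matches the column order of the insert below: year, quarter, filename, date, and the full index URL. An optional peek at the first row:
In [ ]:
# Optional: show the shape of a prepared database row
print(db_data[:1])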
In [ ]:
def create_hoover_db():
    if not os.path.exists(HOOVER_FOLDER):
        os.mkdir(HOOVER_FOLDER)
    db_path = os.path.join(HOOVER_FOLDER, 'hoover_db.sqlite3')
    conn = sqlite3.connect(db_path)
    with conn:
        cursor = conn.cursor()
        # One row per daily master index file
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS index_table ( '
            '    index_table_id INTEGER PRIMARY KEY,'
            '    year INTEGER NOT NULL,'
            '    quarter TEXT NOT NULL,'
            '    filename TEXT NOT NULL,'
            '    input_dt NUMERIC NOT NULL,'
            '    link_text TEXT NOT NULL,'
            '    index_data TEXT'
            ');'
        )
        # One row per filing listed in the master index files
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS company_table ( '
            '    company_table_id INTEGER PRIMARY KEY,'
            '    cik TEXT NOT NULL,'
            '    name TEXT NOT NULL,'
            '    form_type TEXT NOT NULL,'
            '    filed_dt NUMERIC NOT NULL,'
            '    filename TEXT NOT NULL,'
            '    form_text TEXT'
            ');'
        )
    return db_path
db_path = create_hoover_db()
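An optional check that both tables made it into the new database:
In [ ]:
# Optional: list the tables just created
conn = sqlite3.connect(db_path)
print(conn.execute(
    "SELECT name FROM sqlite_master WHERE type='table';"
).fetchall())
conn.close()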
In [ ]:
def populate_hoover_db(db_path):
    conn = sqlite3.connect(db_path)
    with conn:
        cursor = conn.cursor()
        # The NULLs let SQLite assign the primary key and leave
        # index_data empty until the download step fills it
        cursor.executemany((
            'INSERT INTO index_table '
            'VALUES (NULL, ?, ?, ?, ?, ?, NULL)'
        ),
            db_data
        )
populate_hoover_db(db_path)
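Optionally confirm the insert: the count should equal the number of daily index files found for the selected period(s).
In [ ]:
# Optional: one row per daily master index file
conn = sqlite3.connect(db_path)
print(conn.execute('SELECT COUNT(*) FROM index_table;').fetchone())
conn.close()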
In [ ]:
async def data_from_link(link):
    # A fresh session per request keeps the helper self-contained; the
    # shared HEADERS identify the client per SEC's fair-access policy
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        async with session.get(link) as response:
            byte_data = await response.read()
    text_data = byte_data.decode('utf-8', errors='replace')
    return text_data
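Opening a new ClientSession per request costs a fresh connection each time. If that churn matters, one session can be shared across a whole batch; this is a sketch of the alternative, not something the pipeline below uses (batch_from_links and its nested fetch helper are illustrative names):
In [ ]:
# Sketch only: share one ClientSession across a batch of links
async def batch_from_links(links):
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        async def fetch(link):
            async with session.get(link) as response:
                byte_data = await response.read()
            return byte_data.decode('utf-8', errors='replace')
        return await asyncio.gather(*[fetch(link) for link in links])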
In [ ]:
def dl_text_and_insert(db_path):
    conn = sqlite3.connect(db_path)
    with conn:
        cursor = conn.cursor()
        cursor.execute(
            'SELECT index_table_id, link_text '
            'FROM index_table;'
        )
        results = cursor.fetchall()
    with conn:
        loop = asyncio.get_event_loop()
        cursor = conn.cursor()
        while results:
            time.sleep(DELAY * 10)
            # Download in batches of at most 10 links
            number_to_pull = min(10, len(results))
            list_of_items = [
                results.pop(0)
                for _
                in range(number_to_pull)
            ]
            list_of_links = [item[1] for item in list_of_items]
            list_of_indices = [item[0] for item in list_of_items]
            while True:
                try:
                    time.sleep(DELAY)
                    # Coroutines cannot be awaited twice, so rebuild
                    # the task list on every retry
                    tasks = [
                        data_from_link(link)
                        for link
                        in list_of_links
                    ]
                    text = loop.run_until_complete(asyncio.gather(*tasks))
                    break
                except aiohttp.ClientConnectorError:
                    print('X', end='')
            # Drop the header above the dashed separator line
            clean_text = [
                text_data.partition('-----\n')[2]
                for text_data
                in text
            ]
            cursor.executemany(
                'UPDATE index_table '
                'SET index_data=? '
                'WHERE index_table_id=?;',
                zip(clean_text, list_of_indices)
            )
            print('.', end='')
    # VACUUM cannot run inside a transaction, so commit first
    conn.commit()
    cursor.execute('VACUUM')
print('Downloading indices')
dl_text_and_insert(db_path)
print('\nComplete.')
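After the download pass, every row of index_table should carry its index text; a nonzero count here means some downloads failed and the cell above can simply be rerun:
In [ ]:
# Optional: rows still missing index text after the download pass
conn = sqlite3.connect(db_path)
print(conn.execute(
    'SELECT COUNT(*) FROM index_table WHERE index_data IS NULL;'
).fetchone())
conn.close()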
In [ ]:
def populate_company_table(db_path):
    conn = sqlite3.connect(db_path)
    with conn:
        cursor1 = conn.cursor()
        cursor2 = conn.cursor()
        for index, row in enumerate(
            cursor1.execute('SELECT index_data FROM index_table;')
        ):
            text = row[0]
            if not text:
                continue
            lines = text.splitlines()
            # Each line reads CIK|company|form type|YYYYMMDD|filename
            entries = [
                tuple([cik,
                       company,
                       form,
                       datetime.datetime.strptime(datestamp, '%Y%m%d'),
                       link])
                for cik, company, form, datestamp, link
                in [
                    line.split('|')
                    for line
                    in lines
                    if line.strip()
                ]
            ]
            cursor2.executemany(
                'INSERT INTO company_table '
                'VALUES (NULL, ?, ?, ?, ?, ?, NULL);',
                entries
            )
            # One dot per index file processed
            print('.', end='')
    # VACUUM cannot run inside a transaction, so commit first
    conn.commit()
    cursor1.execute('VACUUM')
print('Populating table')
populate_company_table(db_path)
print('\nComplete')
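With company_table populated, simple aggregate queries become possible; an optional look at the most common form types in the sample:
In [ ]:
# Optional: the five most common form types collected
conn = sqlite3.connect(db_path)
for form, n in conn.execute(
    'SELECT form_type, COUNT(*) FROM company_table '
    'GROUP BY form_type ORDER BY COUNT(*) DESC LIMIT 5;'
):
    print(form, n)
conn.close()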
In [ ]:
def dl_filings(db_path):
    conn = sqlite3.connect(db_path)
    with conn:
        cursor1 = conn.cursor()
        cursor2 = conn.cursor()
        loop = asyncio.get_event_loop()
        cursor1.execute('SELECT company_table_id, filename '
                        'FROM company_table;')
        while True:
            batch = cursor1.fetchmany(10)
            if not batch:
                break
            # Inverse zip: split (id, path) pairs into two sequences
            indices, paths = zip(*batch)
            # Plain concatenation: os.path.join is platform-dependent
            # and unsafe for URLs
            links = [
                ARCHIVE_URL + path
                for path
                in paths
            ]
            while True:
                try:
                    # Coroutines cannot be awaited twice, so rebuild
                    # the task list on every retry
                    tasks = [
                        data_from_link(link)
                        for link
                        in links
                    ]
                    time.sleep(DELAY)
                    raw_text = loop.run_until_complete(asyncio.gather(*tasks))
                    break
                except aiohttp.ClientConnectorError:
                    print('X', end='')
            entries = zip(raw_text, indices)
            print('.', end='')
            cursor2.executemany(
                'UPDATE company_table '
                'SET form_text=? '
                'WHERE company_table_id=?;',
                entries
            )
    # VACUUM cannot run inside a transaction, so commit first
    conn.commit()
    cursor1.execute('VACUUM')
print('Downloading forms')
dl_filings(db_path)
print('\nComplete')
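Finally, a filing can be pulled back out of the database to confirm the whole pipeline worked end to end; this optional cell prints the first 200 characters of one stored form:
In [ ]:
# Optional: retrieve one stored filing from the database
conn = sqlite3.connect(db_path)
row = conn.execute(
    'SELECT name, form_type, form_text FROM company_table '
    'WHERE form_text IS NOT NULL LIMIT 1;'
).fetchone()
if row:
    name, form_type, form_text = row
    print(name, form_type, form_text[:200])
conn.close()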