In [ ]:
import asyncio
import datetime
import os
import sqlite3
import time

import aiohttp
import bs4
import requests

In [ ]:
# Daily index URL
INDEX_URL = 'https://www.sec.gov/Archives/edgar/daily-index/'
ARCHIVE_URL = 'https://www.sec.gov/Archives/'
DELAY = 2
# EDGAR's fair-access policy asks automated clients to send a
# descriptive User-Agent; substitute your own name and contact address.
HEADERS = {'User-Agent': 'hoover notebook your.name@example.com'}
YEARS = list(range(2000, 2018))
QUARTERS = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
HOME = os.path.expanduser('~')
HOOVER_FOLDER = os.path.join(HOME, '.hoover')
DEBUG = True
if DEBUG:
    QUARTERS = ['QTR1']
    YEARS = [2015]

In [ ]:
# Comprehension yields one (year, quarter) tuple per period:
# [('2000', 'QTR1'), ('2000', 'QTR2'), ..., ('2017', 'QTR4')]
period_tuples = [
    (str(year), quarter)
    for year in YEARS
    for quarter in QUARTERS
]

# URLS
quarterly_urls = [
    INDEX_URL + year + '/' + quarter + '/'
    for year, quarter
    in period_tuples
]

# Maps each period tuple to an empty list, e.g. ('2015', 'QTR1'): []
index_dict = {
    tuple_: []
    for tuple_
    in period_tuples
}
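
A quick sanity check, not part of the pipeline itself: with the DEBUG settings above, this should print the single 2015 Q1 period, its URL, and an empty bucket.

In [ ]:
print(period_tuples)       # expect [('2015', 'QTR1')]
print(quarterly_urls[0])   # expect .../daily-index/2015/QTR1/
print(index_dict)          # expect {('2015', 'QTR1'): []}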

In [ ]:
def populate_index_dict():
    for tuple_, url in zip(period_tuples, quarterly_urls):
        time.sleep(DELAY / 10)
        r = requests.get(url, headers=HEADERS)
        if r.status_code == 404:
            continue
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        # 'heding' (sic) is how EDGAR's own HTML spells the listing
        # table's summary attribute.
        listing_table = soup.find('table', {'summary': 'heding'})
        if listing_table is None:
            continue
        for tag in listing_table.find_all('a'):
            index_dict[tuple_].append(tag['href'])


populate_index_dict()
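
The hrefs collected above should include the daily master index files; a quick peek (assuming the DEBUG period) confirms it.

In [ ]:
# First few links scraped for the DEBUG period.
for filename in index_dict[('2015', 'QTR1')][:5]:
    print(filename)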

In [ ]:
def prep_db_data():
    db_data = []
    for (year, quarter), filenames in index_dict.items():
        for filename in filenames:
            # Keep only uncompressed daily master files,
            # e.g. 'master.20150102.idx'.
            if 'master' not in filename or filename.endswith('.gz'):
                continue
            link_text = ''.join([
                INDEX_URL,
                year,
                '/',
                quarter,
                '/',
                filename
            ])
            # The middle chunk of the filename is a YYYYMMDD datestamp.
            datestamp = filename.split('.')[1]
            dt = datetime.datetime(
                int(datestamp[:4]),
                int(datestamp[4:6]),
                int(datestamp[6:])
            )
            db_data.append((int(year), quarter, filename, dt, link_text))
    return db_data


db_data = prep_db_data()
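
Each db_data row pairs one daily master file with its parsed date and full URL; inspecting a single row is a cheap way to verify the parsing.

In [ ]:
# One (year, quarter, filename, datetime, link_text) tuple per file.
print(len(db_data))
if db_data:
    print(db_data[0])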

In [ ]:
def create_hoover_db():
    if not os.path.exists(HOOVER_FOLDER):
        os.mkdir(HOOVER_FOLDER)
    db_path = os.path.join(HOOVER_FOLDER, 'hoover_db.sqlite3')
    conn = sqlite3.connect(db_path)
    with conn:
        cursor = conn.cursor()
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS index_table ( '
            '    index_table_id INTEGER PRIMARY KEY,'
            '    year           INTEGER NOT NULL,'
            '    quarter        TEXT    NOT NULL,'
            '    filename       TEXT    NOT NULL,'
            '    input_dt       NUMERIC NOT NULL,'
            '    link_text      TEXT    NOT NULL,'
            '    index_data     TEXT'
            ');'
        )
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS company_table ( '
            '    company_table_id INTEGER PRIMARY KEY,'
            '    cik       TEXT    NOT NULL,'
            '    name      TEXT    NOT NULL,'
            '    form_type TEXT    NOT NULL,'
            '    filed_dt  NUMERIC NOT NULL,'
            '    filename  TEXT    NOT NULL,'
            '    form_text TEXT'
            ');'
        )
    return db_path


db_path = create_hoover_db()
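
To confirm the schema came out as intended, SQLite's PRAGMA table_info lists one row per column (cid, name, type, notnull, default, pk).

In [ ]:
conn = sqlite3.connect(db_path)
with conn:
    for column in conn.execute('PRAGMA table_info(index_table);'):
        print(column)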

In [ ]:
def populate_hoover_db(db_path):
    conn = sqlite3.connect(db_path)
    with conn:
        cursor = conn.cursor()
        cursor.executemany(
            'INSERT INTO index_table '
            'VALUES (NULL, ?, ?, ?, ?, ?, NULL);',
            db_data
        )


populate_hoover_db(db_path)
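
The row count should now match len(db_data). Note that re-running the insert would duplicate rows, since nothing here deduplicates.

In [ ]:
conn = sqlite3.connect(db_path)
with conn:
    count = conn.execute('SELECT COUNT(*) FROM index_table;').fetchone()[0]
print(count, len(db_data))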

In [ ]:
async def data_from_link(link):
    # One short-lived session per request keeps this helper
    # self-contained, at the cost of connection reuse.
    async with aiohttp.ClientSession(headers=HEADERS) as session:
        async with session.get(link) as response:
            byte_data = await response.read()
            return byte_data.decode('utf-8', errors='replace')
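
data_from_link can be exercised on its own; this fetches one quarterly listing page. A sketch, assuming the pre-asyncio.run run_until_complete style used throughout this notebook.

In [ ]:
loop = asyncio.get_event_loop()
sample = loop.run_until_complete(data_from_link(quarterly_urls[0]))
print(sample[:200])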

In [ ]:
def dl_text_and_insert(db_path):
    conn = sqlite3.connect(db_path)
    with conn:
        cursor = conn.cursor()
        cursor.execute(
            'SELECT index_table_id, link_text '
            'FROM index_table;'
        )
        results = cursor.fetchall()
    with conn:
        loop = asyncio.get_event_loop()
        cursor = conn.cursor()
        while results:
            time.sleep(DELAY * 10)
            number_to_pull = min(10, len(results))
            list_of_items = [
                results.pop(0)
                for _
                in range(number_to_pull)
            ]
            list_of_links = [item[1] for item in list_of_items]
            list_of_indices = [item[0] for item in list_of_items]
            tasks = [
                data_from_link(link)
                for link
                in list_of_links
            ]
            while True:
                try:
                    time.sleep(DELAY)
                    text = loop.run_until_complete(asyncio.gather(*tasks))
                    break
                except aiohttp.ClientConnectorError:
                    print('X', end='')
            # Everything up to the dashed separator line is a
            # human-readable header; keep only the rows after it.
            clean_text = [
                text_data.partition('-----\n')[2]
                for text_data
                in text
            ]
            cursor.executemany(
                'UPDATE index_table '
                'SET index_data=? '
                'WHERE index_table_id=?;',
                zip(clean_text, list_of_indices)
            )
            print('.', end='')
            conn.commit()
        cursor.execute('VACUUM')


print('Downloading indices')
dl_text_and_insert(db_path)
print('\nComplete.')
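
Each stored index_data blob is pipe-delimited, one filing per line: CIK|Company Name|Form Type|Date Filed|Filename. A quick look at the first stored blob:

In [ ]:
conn = sqlite3.connect(db_path)
with conn:
    row = conn.execute(
        'SELECT index_data FROM index_table LIMIT 1;'
    ).fetchone()
if row and row[0]:
    print(row[0].splitlines()[:3])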

In [ ]:
def populate_company_table(db_path):
    conn = sqlite3.connect(db_path)
    with conn:
        cursor1 = conn.cursor()
        cursor2 = conn.cursor()
        for row in cursor1.execute('SELECT index_data FROM index_table;'):
            text = row[0]
            # Rows whose download failed have NULL index_data.
            if not text:
                continue
            entries = [
                (cik,
                 company,
                 form,
                 datetime.datetime.strptime(datestamp, '%Y%m%d'),
                 link)
                for cik, company, form, datestamp, link
                in (
                    line.split('|')
                    # A valid row has exactly five pipe-delimited fields.
                    for line in text.splitlines()
                    if line.count('|') == 4
                )
            ]
            cursor2.executemany(
                'INSERT INTO company_table '
                'VALUES (NULL, ?, ?, ?, ?, ?, NULL);',
                entries
            )
            print('.', end='')
        # Commit first: VACUUM cannot run inside an open transaction.
        conn.commit()
        cursor1.execute('VACUUM')


print('Populating table')
populate_company_table(db_path)
print('\nComplete')
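
With company_table populated, ordinary SQL works for exploration; for example, the most common form types:

In [ ]:
conn = sqlite3.connect(db_path)
with conn:
    rows = conn.execute(
        'SELECT form_type, COUNT(*) AS n FROM company_table '
        'GROUP BY form_type ORDER BY n DESC LIMIT 10;'
    )
    for form_type, n in rows:
        print(n, form_type)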

In [ ]:
def dl_filings(db_path):
    conn = sqlite3.connect(db_path)
    with conn:
        cursor1 = conn.cursor()
        cursor2 = conn.cursor()
        loop = asyncio.get_event_loop()
        cursor1.execute('SELECT company_table_id, filename '
                        'FROM company_table;')
        while True:
            batch = cursor1.fetchmany(10)
            if not batch:
                break
            else:
                while True:
                    try:
                        # Inverse zip: split the batch into parallel
                        # tuples of row ids and file paths.
                        indices, paths = zip(*batch)
                        links = [
                            ARCHIVE_URL + path
                            for path
                            in paths
                        ]
                        tasks = [
                            data_from_link(link)
                            for link
                            in links
                        ]
                        time.sleep(DELAY)
                        raw_text = loop.run_until_complete(asyncio.gather(*tasks))
                        break
                    except aiohttp.ClientConnectorError:
                        print('X', end='')
                entries = zip(raw_text, indices)
                print('.', end='')
                cursor2.executemany(
                    'UPDATE company_table '
                    'SET form_text=? '
                    'WHERE company_table_id=?;',
                    entries
                )
        # Commit first: VACUUM cannot run inside an open transaction.
        conn.commit()
        cursor1.execute('VACUUM')


print('Downloading forms')
dl_filings(db_path)
print('\nComplete')
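
Finally, a stored filing can be pulled straight back out of the database; this grabs the first row whose form_text was actually downloaded.

In [ ]:
conn = sqlite3.connect(db_path)
with conn:
    row = conn.execute(
        'SELECT name, form_type, form_text FROM company_table '
        'WHERE form_text IS NOT NULL LIMIT 1;'
    ).fetchone()
if row:
    name, form_type, form_text = row
    print(name, form_type)
    print(form_text[:300])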