In [1]:
import sys
import os

In [4]:
# Resolve the repository root, taken to be two directory levels above the
# current working directory, and make its `ncga` directory importable.
main_repo_dir = os.path.abspath(os.path.join(os.pardir, os.pardir))
ncga_dir = os.path.join(main_repo_dir, 'ncga')
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if ncga_dir not in sys.path:
    sys.path.append(ncga_dir)

In [5]:
import extract_billpage_content as ebc

In [6]:
from importlib import reload

In [7]:
from bs4 import BeautifulSoup as bs

In [8]:
# Load the saved bill pages as raw bytes, one entry in `pages` per name in
# `file_names` (each name maps to "<name>.txt" in the test-data directory).
file_names = ['bill_0', 'bill_1']
billpage_dir = os.path.join(main_repo_dir, 'ncga', "tests/test_data/billpage_html")
pages = []
for fn in file_names:
    page_path = os.path.join(billpage_dir, fn + '.txt')
    # Binary mode: the bytes are kept undecoded in `pages`.
    with open(page_path, 'rb') as f:
        raw_page = f.read()
    pages.append(raw_page)

In [9]:
# Parse each non-empty page with the 'html.parser' backend. Empty reads are
# dropped, so positions in `soups` match `pages` only when no page was empty.
soups = [bs(page, 'html.parser') for page in filter(None, pages)]

In [10]:
# Pick the second parsed page for the checks below. Because `soups` skips empty
# pages, index 1 is presumably the 'bill_1' file only if 'bill_0' was non-empty
# -- verify when changing the inputs.
soup = soups[1]

In [11]:
# Show the links pulled from the bill page. For this page the recorded output
# is a list of dicts with 'html', 'label' and 'pdf' keys, one per listed
# version ('Filed', 'Edition 1', ...).
ebc.extract_links(soup)


Out[11]:
[{'html': '/Sessions/2017/Bills/House/HTML/H2v0.html',
  'label': 'Filed',
  'pdf': '/Sessions/2017/Bills/House/PDF/H2v0.pdf'},
 {'html': '/Sessions/2017/Bills/House/HTML/H2v1.html',
  'label': 'Edition 1',
  'pdf': '/Sessions/2017/Bills/House/PDF/H2v1.pdf'},
 {'html': '/Sessions/2017/Bills/House/HTML/H2v2.html',
  'label': 'Edition 2',
  'pdf': '/Sessions/2017/Bills/House/PDF/H2v2.pdf'},
 {'html': '/Sessions/2017/Bills/House/HTML/H2v3.html',
  'label': 'Edition 3',
  'pdf': '/Sessions/2017/Bills/House/PDF/H2v3.pdf'},
 {'html': '/Sessions/2017/Bills/House/HTML/H2v4.html',
  'label': 'Edition 4',
  'pdf': '/Sessions/2017/Bills/House/PDF/H2v4.pdf'}]

In [14]:
# Show the metadata pulled from the bill page. For this page the recorded
# output is a dict with 'Counties', 'Keywords', 'Sponsors' and 'Statutes' keys.
# NOTE(review): the recorded output lists 'PROPERTY' twice under 'Keywords' and
# has a 'Statutes' entry that still contains '; ' -- possibly a splitting issue
# in extract_meta; confirm against the source HTML.
ebc.extract_meta(soup)


Out[14]:
{'Counties': None,
 'Keywords': ['ARMED FORCES',
  'EMERGENCY SERVICES',
  'ENVIRONMENT',
  'FIREFIGHTERS & FIREFIGHTING',
  'FORESTRY',
  'LOCAL GOVERNMENT',
  'PERSONNEL',
  'PROPERTY',
  'PUBLIC',
  'REAL ESTATE',
  'TAX EXEMPTIONS',
  'TAXATION',
  'TAXES',
  'PROPERTY',
  'VETERANS'],
 'Sponsors': [{'chamber': 'H', 'userid': '489'},
  {'chamber': 'H', 'userid': '632'},
  {'chamber': 'H', 'userid': '645'},
  {'chamber': 'H', 'userid': '658'},
  {'chamber': 'H', 'userid': '688'},
  {'chamber': 'H', 'userid': '661'},
  {'chamber': 'H', 'userid': '690'},
  {'chamber': 'H', 'userid': '691'},
  {'chamber': 'H', 'userid': '582'},
  {'chamber': 'H', 'userid': '716'},
  {'chamber': 'H', 'userid': '613'},
  {'chamber': 'H', 'userid': '463'},
  {'chamber': 'H', 'userid': '709'},
  {'chamber': 'H', 'userid': '504'},
  {'chamber': 'H', 'userid': '720'},
  {'chamber': 'H', 'userid': '650'},
  {'chamber': 'H', 'userid': '560'},
  {'chamber': 'H', 'userid': '598'},
  {'chamber': 'H', 'userid': '595'},
  {'chamber': 'H', 'userid': '723'},
  {'chamber': 'H', 'userid': '667'},
  {'chamber': 'H', 'userid': '635'},
  {'chamber': 'H', 'userid': '497'},
  {'chamber': 'H', 'userid': '393'},
  {'chamber': 'H', 'userid': '706'},
  {'chamber': 'H', 'userid': '727'},
  {'chamber': 'H', 'userid': '696'},
  {'chamber': 'H', 'userid': '630'},
  {'chamber': 'H', 'userid': '728'},
  {'chamber': 'H', 'userid': '721'}],
 'Statutes': ['105 (Chapter); 105-277.1C',
  '105-277.1E',
  '105-282.1 (Sections)']}

In [ ]: