In [1]:
import sys
import os

In [2]:
from importlib import reload

In [3]:
sys.path.append(os.path.join(os.path.abspath('../..'), 'src'))

In [4]:
import mysql_utils
import mongo_utils

In [5]:
with mysql_utils.curWith("SELECT * FROM bills WHERE session = '2017' LIMIT 200") as cur:
    bill_info = mysql_utils.dfDocsFromCursor(cur)

In [6]:
bill_info.head()


Out[6]:
id bill_id session date_filed title link page_scraped
0 1 H1 2017 2017-01-11 2017 House Temporary Rules. /gascripts/BillLookUp/BillLookUp.pl?Session=20... 1
1 2 S1 2017 2017-01-11 2017 Senate Permanent Rules. /gascripts/BillLookUp/BillLookUp.pl?Session=20... 1
2 3 S2 2017 2017-01-11 Adjourn Organizational Session. /gascripts/BillLookUp/BillLookUp.pl?Session=20... 1
3 4 H2 2017 2017-01-25 Provide Certain Property Tax Relief. /gascripts/BillLookUp/BillLookUp.pl?Session=20... 1
4 5 H3 2017 2017-01-25 Eminent Domain. /gascripts/BillLookUp/BillLookUp.pl?Session=20... 1

In [7]:
bill_ids = [int(i) for i in list(bill_info.id)]

In [8]:
with mongo_utils.collWith('ncga', 'bill_pages') as coll:
    bill_pages = mongo_utils.dfDocsFromCursor(coll.find({'bill_id' : {'$in' : bill_ids}}, {'_id' : 0}))

In [9]:
bill_pages.head()


Out[9]:
bill_id html
0 1 <!doctype html>\n<html>\n<head>\n\t<meta name=...
1 2 b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
2 3 b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
3 4 b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
4 5 b'<!doctype html>\n<html>\n<head>\n\t<meta nam...

In [10]:
from bs4 import BeautifulSoup as bs

In [11]:
import billpage_proc

In [12]:
bill_pages['soup'] = bill_pages.html.apply(lambda x: bs(x, 'html.parser'))

In [18]:
bill_pages['meta'] = bill_pages.soup.apply(billpage_proc.get_meta)

In [19]:
bill_pages.head()


Out[19]:
bill_id html soup meta
0 1 <!doctype html>\n<html>\n<head>\n\t<meta name=... <!DOCTYPE doctype html> <html> <head> <meta c... [{'label': 'Last Action', 'content': 'Adopted ...
1 2 b'<!doctype html>\n<html>\n<head>\n\t<meta nam... <!DOCTYPE doctype html> <html> <head> <meta c... [{'label': 'Last Action', 'content': 'Adopted ...
2 3 b'<!doctype html>\n<html>\n<head>\n\t<meta nam... <!DOCTYPE doctype html> <html> <head> <meta c... [{'label': 'Last Action', 'content': 'Ch. Res ...
3 4 b'<!doctype html>\n<html>\n<head>\n\t<meta nam... <!DOCTYPE doctype html> <html> <head> <meta c... [{'label': 'Last Action', 'content': 'Re-ref C...
4 5 b'<!doctype html>\n<html>\n<head>\n\t<meta nam... <!DOCTYPE doctype html> <html> <head> <meta c... [{'label': 'Last Action', 'content': 'Ref To C...

In [21]:
bill_pages['ses_and_ed'] = bill_pages.soup.apply(billpage_proc.get_session_and_editions)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-21-311ff59ae86d> in <module>()
----> 1 bill_pages['ses_and_ed'] = bill_pages.soup.apply(billpage_proc.get_session_and_editions)

/home/immersinn/.virtualenvs/ncga/lib/python3.5/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
   2292             else:
   2293                 values = self.asobject
-> 2294                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   2295 
   2296         if len(mapped) and isinstance(mapped[0], Series):

pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:66124)()

/home/immersinn/gits/ncga/src/billpage_proc.py in get_session_and_editions(soup)
     88                     'content' : bill_summaries.find('a')['href']})
     89 
---> 90     header = entries[1]
     91     header_values = [th.text for th in header.find_all('th')]
     92     edition_content = get_edition_content(entries[2:], header_values)

IndexError: list index out of range

In [ ]: