notebook.community

Edit and run



In [1]:

    
import sys
import os



In [2]:

    
from importlib import reload



In [3]:

    
# Make the project's `src` directory (two levels above the working directory)
# importable so the project-local modules imported below can be found.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
SRC_DIR = os.path.join(os.path.abspath('../..'), 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)



In [4]:

    
import mysql_utils
import mongo_utils



In [5]:

    
# Load up to 200 rows of the `bills` table for the 2017 session.
# NOTE(review): dfDocsFromCursor presumably returns a pandas DataFrame
# (the result is displayed as one below) -- confirm in mysql_utils.
bills_query = "SELECT * FROM bills WHERE session = '2017' LIMIT 200"
with mysql_utils.curWith(bills_query) as cur:
    bill_info = mysql_utils.dfDocsFromCursor(cur)



In [6]:

    
# Preview the first rows of the loaded bills to check the returned columns.
bill_info.head()









    Out[6]:






  
    
      
      id
      bill_id
      session
      date_filed
      title
      link
      page_scraped
    
  
  
    
      0
      1
      H1
      2017
      2017-01-11
      2017 House Temporary Rules.
      /gascripts/BillLookUp/BillLookUp.pl?Session=20...
      1
    
    
      1
      2
      S1
      2017
      2017-01-11
      2017 Senate Permanent Rules.
      /gascripts/BillLookUp/BillLookUp.pl?Session=20...
      1
    
    
      2
      3
      S2
      2017
      2017-01-11
      Adjourn Organizational Session.
      /gascripts/BillLookUp/BillLookUp.pl?Session=20...
      1
    
    
      3
      4
      H2
      2017
      2017-01-25
      Provide Certain Property Tax Relief.
      /gascripts/BillLookUp/BillLookUp.pl?Session=20...
      1
    
    
      4
      5
      H3
      2017
      2017-01-25
      Eminent Domain.
      /gascripts/BillLookUp/BillLookUp.pl?Session=20...
      1



In [7]:

    
# Collect the MySQL row ids (`id`, not the 'H1'/'S1'-style `bill_id`) as
# built-in ints; they feed the `$in` filter of the Mongo query.
# NOTE(review): the int() cast is presumably there because the column holds
# numpy integers that the Mongo driver cannot encode -- confirm.
bill_ids = [int(row_id) for row_id in bill_info['id']]



In [8]:

    
# Fetch the scraped pages whose `bill_id` is one of the ids gathered above.
# The second argument to find() is {'_id': 0}; the resulting frame shows only
# `bill_id` and `html`, i.e. Mongo's internal `_id` field is left out.
page_filter = {'bill_id' : {'$in' : bill_ids}}
page_fields = {'_id' : 0}
with mongo_utils.collWith('ncga', 'bill_pages') as coll:
    page_docs = coll.find(page_filter, page_fields)
    bill_pages = mongo_utils.dfDocsFromCursor(page_docs)



In [9]:

    
# Preview the fetched pages.
# NOTE(review): in the displayed output the `html` value of row 0 is shown as
# plain text while rows 1-4 are shown with a b'...' prefix -- the collection
# may hold a mix of str and bytes documents; confirm how the scraper stores them.
bill_pages.head()









    Out[9]:






  
    
      
      bill_id
      html
    
  
  
    
      0
      1
      <!doctype html>\n<html>\n<head>\n\t<meta name=...
    
    
      1
      2
      b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
    
    
      2
      3
      b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
    
    
      3
      4
      b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
    
    
      4
      5
      b'<!doctype html>\n<html>\n<head>\n\t<meta nam...



In [10]:

    
from bs4 import BeautifulSoup as bs



In [11]:

    
import billpage_proc



In [12]:

    
# Parse every stored page with BeautifulSoup's built-in 'html.parser' backend
# and keep the parsed tree next to the raw markup in a new `soup` column.
bill_pages['soup'] = bill_pages['html'].apply(
    lambda markup: bs(markup, features='html.parser')
)



In [18]:

    
# Run billpage_proc.get_meta over each parsed page and store the result in a
# `meta` column (the output below shows lists of {'label': ..., 'content': ...} dicts).
parsed_pages = bill_pages['soup']
bill_pages['meta'] = parsed_pages.apply(billpage_proc.get_meta)



In [19]:

    
# Inspect the frame again now that the `soup` and `meta` columns have been added.
bill_pages.head()









    Out[19]:






  
    
      
      bill_id
      html
      soup
      meta
    
  
  
    
      0
      1
      <!doctype html>\n<html>\n<head>\n\t<meta name=...
      <!DOCTYPE doctype html>

<html>
<head>
<meta c...
      [{'label': 'Last Action', 'content': 'Adopted ...
    
    
      1
      2
      b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
      <!DOCTYPE doctype html>

<html>
<head>
<meta c...
      [{'label': 'Last Action', 'content': 'Adopted ...
    
    
      2
      3
      b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
      <!DOCTYPE doctype html>

<html>
<head>
<meta c...
      [{'label': 'Last Action', 'content': 'Ch. Res ...
    
    
      3
      4
      b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
      <!DOCTYPE doctype html>

<html>
<head>
<meta c...
      [{'label': 'Last Action', 'content': 'Re-ref C...
    
    
      4
      5
      b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
      <!DOCTYPE doctype html>

<html>
<head>
<meta c...
      [{'label': 'Last Action', 'content': 'Ref To C...



In [21]:

    
# Apply billpage_proc.get_session_and_editions to each parsed page.
# NOTE(review): this cell currently fails. The traceback below shows an
# IndexError raised inside get_session_and_editions at `header = entries[1]`,
# i.e. at least one page produces fewer than two `entries`. The fix presumably
# belongs in billpage_proc (guard against short/empty `entries`) -- identify
# which bill page triggers it before relying on this column.
bill_pages['ses_and_ed'] = bill_pages.soup.apply(billpage_proc.get_session_and_editions)









    



---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-21-311ff59ae86d> in <module>()
----> 1 bill_pages['ses_and_ed'] = bill_pages.soup.apply(billpage_proc.get_session_and_editions)

/home/immersinn/.virtualenvs/ncga/lib/python3.5/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
   2292             else:
   2293                 values = self.asobject
-> 2294                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   2295 
   2296         if len(mapped) and isinstance(mapped[0], Series):

pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:66124)()

/home/immersinn/gits/ncga/src/billpage_proc.py in get_session_and_editions(soup)
     88                     'content' : bill_summaries.find('a')['href']})
     89 
---> 90     header = entries[1]
     91     header_values = [th.text for th in header.find_all('th')]
     92     edition_content = get_edition_content(entries[2:], header_values)

IndexError: list index out of range



In [ ]:

	id	bill_id	session	date_filed	title	link	page_scraped
0	1	H1	2017	2017-01-11	2017 House Temporary Rules.	/gascripts/BillLookUp/BillLookUp.pl?Session=20...	1
1	2	S1	2017	2017-01-11	2017 Senate Permanent Rules.	/gascripts/BillLookUp/BillLookUp.pl?Session=20...	1
2	3	S2	2017	2017-01-11	Adjourn Organizational Session.	/gascripts/BillLookUp/BillLookUp.pl?Session=20...	1
3	4	H2	2017	2017-01-25	Provide Certain Property Tax Relief.	/gascripts/BillLookUp/BillLookUp.pl?Session=20...	1
4	5	H3	2017	2017-01-25	Eminent Domain.	/gascripts/BillLookUp/BillLookUp.pl?Session=20...	1

	bill_id	html
0	1	<!doctype html>\n<html>\n<head>\n\t<meta name=...
1	2	b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
2	3	b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
3	4	b'<!doctype html>\n<html>\n<head>\n\t<meta nam...
4	5	b'<!doctype html>\n<html>\n<head>\n\t<meta nam...

	bill_id	html	soup	meta
0	1	<!doctype html>\n<html>\n<head>\n\t<meta name=...	<!DOCTYPE doctype html> <html> <head> <meta c...	[{'label': 'Last Action', 'content': 'Adopted ...
1	2	b'<!doctype html>\n<html>\n<head>\n\t<meta nam...	<!DOCTYPE doctype html> <html> <head> <meta c...	[{'label': 'Last Action', 'content': 'Adopted ...
2	3	b'<!doctype html>\n<html>\n<head>\n\t<meta nam...	<!DOCTYPE doctype html> <html> <head> <meta c...	[{'label': 'Last Action', 'content': 'Ch. Res ...
3	4	b'<!doctype html>\n<html>\n<head>\n\t<meta nam...	<!DOCTYPE doctype html> <html> <head> <meta c...	[{'label': 'Last Action', 'content': 'Re-ref C...
4	5	b'<!doctype html>\n<html>\n<head>\n\t<meta nam...	<!DOCTYPE doctype html> <html> <head> <meta c...	[{'label': 'Last Action', 'content': 'Ref To C...