In [1]:
import json
import igraph
import os
import re
import time
import logging
import sqlite3
import pandas as pd
import numpy as np
from lxml import objectify

In [2]:
# structure is this:
# -congress term
#     -votes
#          -year (2 possible)
#               -billno (h31 for house, s31 for senate)
#                    *data.json
#                         result: "Passed"
#                         bill:
#                              congress:
#                              number:
#                              type:
#                         date:
#                         requires: "1/2"
#                         category:
#                         session: yyyy
#                         type: ?
#                         subject: short description
#                         number: ?
#                         votes:
#                              Not_Voting: [{display_name: , party: , ID: , state: }]
#                              Nay: ...
#                              Present: ...
#                              Yay: ...
#                         chamber: 'h' or 's'
#                         congress: 104
#                         question: short description again
#                         vote_id: 'h10-104.1995'

In [2]:
data_dir = "/home/matt/Datasets/Congress/"
os.chdir(data_dir)

Create and init the db


In [3]:
output_db = 'congress.db'
con = sqlite3.connect(output_db)
cur = con.cursor()

In [5]:
bill_schema = ['location', 'TEXT',
               'session', 'INTEGER',
               'roll', 'INTEGER',
               'year', 'INTEGER',
               'quarter', 'INTEGER',
               'month', 'INTEGER',
               'aye', 'INTEGER',
               'nay', 'INTEGER',
               'present', 'INTEGER',
               'nv','INTEGER',
               'required','TEXT',
               'result','TEXT',
               'category','TEXT',
               'type','TEXT',
               'question','TEXT']
bill_fields = bill_schema[0::2]
bill_keys = bill_fields[0:4]
print(bill_fields)
print(bill_keys)


['location', 'session', 'roll', 'year', 'quarter', 'month', 'aye', 'nay', 'present', 'nv', 'required', 'result', 'category', 'type', 'question']
['location', 'session', 'roll', 'year']

In [6]:
vote_schema = ['location','TEXT',
               'session','INTEGER',
               'roll','INTEGER',
               'year','INTEGER',
               'quarter', 'INTEGER',
               'month', 'INTEGER',
               'id','TEXT',
               'vote','TEXT',
               'value','TEXT',
               'state','TEXT']
vote_fields = vote_schema[0::2]
vote_keys = vote_fields[0:4] + vote_fields[6:7]
print(vote_fields)
print(vote_keys)


['location', 'session', 'roll', 'year', 'quarter', 'month', 'id', 'vote', 'value', 'state']
['location', 'session', 'roll', 'year', 'id']

In [7]:
bills_command = (
   "create table bills ("+
    "{} {}, "*len(bill_fields)+
    "PRIMARY KEY ("+
    ','.join(bill_keys)+
    ") )").format(*bill_schema)
votes_command = (
   "create table votes ("+
    "{} {}, "*len(vote_fields)+
    "PRIMARY KEY ("+
    ','.join(vote_keys)+
    ") )").format(*vote_schema)

In [8]:
print(votes_command)
print(bills_command)


create table votes (location TEXT, session INTEGER, roll INTEGER, year INTEGER, quarter INTEGER, month INTEGER, id TEXT, vote TEXT, value TEXT, state TEXT, PRIMARY KEY (location,session,roll,year,id) )
create table bills (location TEXT, session INTEGER, roll INTEGER, year INTEGER, quarter INTEGER, month INTEGER, aye INTEGER, nay INTEGER, present INTEGER, nv INTEGER, required TEXT, result TEXT, category TEXT, type TEXT, question TEXT, PRIMARY KEY (location,session,roll,year) )

In [9]:
cur.execute(votes_command)
cur.execute(bills_command)
con.commit()

Read the data to the db


In [10]:
# all the data will come from here
os.chdir('congress')

regexes we'll need


In [11]:
congress_num = re.compile('[0-9]{1,3}')
congress_bill = re.compile('(s|h)[0-9]{1,5}')
date_regex = re.compile(r'([0-9]{4}(-|/)[0-9]{1,2}\2[0-9]{1,2})')

functions we'll need


In [17]:
def insert_rows(cur,table,data,fields,how='ignore'):
    command = ("insert or {} into {} VALUES ("+','.join(["?"]*len(fields))+")").format(how,table)
    cur.executemany(command,data)

def get_bill_data(root):
    datetime = pd.to_datetime(root.attrib['datetime'])
    year = int(datetime.year)
    quarter = 4*year+int(datetime.quarter)
    month = 12*year+int(datetime.month)
    
    counts = [int(get_attr(root,kind,0)) for kind in ['aye','nay','present','nv']]
    text_attrs = [get_node_text(root,tag,'') for tag in ['required','result','category','type','question']]
    
    return [root.attrib['where'],int(root.attrib['session']),int(root.attrib['roll']),
            year, quarter, month] + counts + text_attrs
    
def get_attr(obj,attr,default):
    return obj.attrib.get(attr,default)

def get_node_text(obj,tag,default):
    return obj.__dict__.get(tag,objectify.StringElement(default)).text

This shows that every bill has an xml; we'll use these because the json is a mess


In [14]:
congresses = sorted([name for name in os.listdir() if re.match(congress_num,name)],key=lambda s: int(s))

# should be no output if every bill has an xml file
for congress in congresses:
    vote_dir = os.path.join(congress,'votes')
    years = sorted(os.listdir(vote_dir))
    
    for year in years:
        bills = os.listdir(os.path.join(vote_dir,year))
        bills = sorted([num for num in bills if re.match(congress_bill,num)])
        
        for bill in bills:
            bill_dir = os.path.join(vote_dir,year,bill)
            files = os.listdir(bill_dir)
            files = [ file for file in files if file.endswith('.xml')]
            if len(files)==0:
                print(bill_dir)

Loop through the bills and store them with the votes


In [18]:
congresses = sorted([name for name in os.listdir() if re.match(congress_num,name)],key=lambda s: int(s))

#c=0
# for every session
for congress in congresses:
#     c+=1
#     if c>2:
#         break
    
    vote_dir = os.path.join(congress,'votes')
    years = sorted(os.listdir(vote_dir))
    
    # for every year of that session
    for year in years:
        bills = os.listdir(os.path.join(vote_dir,year))
        bills = sorted([num for num in bills if re.match(congress_bill,num)], 
                       key = lambda s: int(s[1:]))
        
        bill_data = []
        # for every bill in that year
        for bill in bills:
            # list the files in the bill dir and parse the xml
            bill_dir = os.path.join(vote_dir,year,bill)
            files = os.listdir(bill_dir)
            xmlfiles = [file for file in files if file.endswith('.xml')]
            
            with open(os.path.join(vote_dir,year,bill,xmlfiles[0]), 'r') as infile:
                # get the xml object representing all the votes
                data = objectify.parse(infile).getroot()
                
            # get the bill data from the xml object
            new_bill = get_bill_data(data)
            
            # append to the list for that year
            bill_data.append(tuple(new_bill))
            
            # get the vote data from the xml object
            voters = data.voter
            vote_data = [tuple(new_bill[0:6] + [voter.attrib.get(attr,None) for attr in vote_fields[6:]]) for voter in voters]
            
            if np.random.rand() > 0.9997:
                print(new_bill)
                print(vote_data[np.random.randint(0,len(vote_data))])
            
            # insert the vote data into the db for each bill
            insert_rows(cur,'votes',vote_data,vote_fields,'ignore')
        
        # insert the bill data into the db for each year
        insert_rows(cur,'bills',bill_data,bill_fields,'ignore')
        
        # commit the changes
        con.commit()
        
    print("congress {} completed".format(congress))


congress 1 completed
congress 2 completed
congress 3 completed
congress 4 completed
congress 5 completed
congress 6 completed
congress 7 completed
congress 8 completed
congress 9 completed
congress 10 completed
['senate', 11, 126, 1811, 7245, 21733, 23, 7, 0, 4, 'unknown', 'unknown', 'unknown', 'On Passage', 'TO PASS THE SENATE BILL, AS AMENDED, AND APPROPRIATE $100,000 THEREFORE.']
('senate', 11, 126, 1811, 7245, 21733, '411899', '+', 'Yea', 'OH')
congress 11 completed
congress 12 completed
congress 13 completed
congress 14 completed
congress 15 completed
congress 16 completed
congress 17 completed
congress 18 completed
congress 19 completed
congress 20 completed
congress 21 completed
congress 22 completed
['house', 23, 27, 1834, 7337, 22010, 107, 88, 0, 41, 'unknown', 'unknown', 'unknown', 'TO TABLE THE RESOLUTION PROVIDING THAT THE COMMITTEE ON INDIAN AFFAIRS BE INSTRUCTED TO INQUIRE WHETHER THE PROVISIONS OF THE TREATY OF MARCH 1832, WITH THE CREEK TRIBE OF INDIANS, IN ALABAMA, BE INCONSISTENT WITH THE SOVEREIGN RIGHT OF JURISDICTION OF SAID STATE WITHIN ITS LIMITS, AND WHETHER THE EXECUTION OF THE TREATY CONFLICTS WITH LAWS OF SAID STATE.  (P. 175-3).', 'TO TABLE THE RESOLUTION PROVIDING THAT THE COMMITTEE ON INDIAN AFFAIRS BE INSTRUCTED TO INQUIRE WHETHER THE PROVISIONS OF THE TREATY OF MARCH 1832, WITH THE CREEK TRIBE OF INDIANS, IN ALABAMA, BE INCONSISTENT WITH THE SOVEREIGN RIGHT OF JURISDICTION OF SAID STATE WITHIN ITS LIMITS, AND WHETHER THE EXECUTION OF THE TREATY CONFLICTS WITH LAWS OF SAID STATE.  (P. 175-3).']
('house', 23, 27, 1834, 7337, 22010, '401280', '0', 'Not Voting', 'PA')
congress 23 completed
congress 24 completed
congress 25 completed
['house', 26, 114, 1840, 7361, 22082, 0, 0, 0, 60, 'unknown', 'unknown', 'unknown', 'TO ELECT A CHAPLAIN.', 'TO ELECT A CHAPLAIN.']
('house', 26, 114, 1840, 7361, 22082, '409809', 'BRAXTON', 'BRAXTON', 'NH')
congress 26 completed
congress 27 completed
congress 28 completed
congress 29 completed
congress 30 completed
congress 31 completed
congress 32 completed
congress 33 completed
congress 34 completed
['house', 35, 416, 1859, 7437, 22309, 86, 85, 0, 65, 'unknown', 'unknown', 'unknown', 'TO SUSPEND RULES IN ORDER TO MOVE THAT COMM. OF THE WHOLE HOUSE BE DISCHARGED FROM FURTHER CONSIDERATION OF H.R. 348, BILL TO PROVIDE FOR RELIEF OF EDWIN M. CHAFFEE, INVENTOR OF PROCESS TO GRIND RATHER THAN DISSOLVE RAW MATERIALS USED IN MANUFACTURE OF INDIA RUBBER GOODS.  (P. 699).', 'TO SUSPEND RULES IN ORDER TO MOVE THAT COMM. OF THE WHOLE HOUSE BE DISCHARGED FROM FURTHER CONSIDERATION OF H.R. 348, BILL TO PROVIDE FOR RELIEF OF EDWIN M. CHAFFEE, INVENTOR OF PROCESS TO GRIND RATHER THAN DISSOLVE RAW MATERIALS USED IN MANUFACTURE OF INDIA RUBBER GOODS.  (P. 699).']
('house', 35, 416, 1859, 7437, 22309, '405649', '0', 'Not Voting', 'OH')
congress 35 completed
['senate', 36, 327, 1860, 7442, 22326, 17, 23, 0, 25, 'unknown', 'unknown', 'unknown', 'On the Amendment', "TO AMEND H.R. 304, BY STRIKING OUT SECTION 2, WHICH AUTHORIZES THE POSTMASTER GENERAL TO AGREE WITH THE CONTRACTORS ON ROUTE 12578, FROM ST. LOUIS TO SAN FRANCISCO, FOR SUCH MODIFICATION OF THEIR CONTRACT, SO AS TO CARRY THE MAIL FROM VICKSBURG OR NEW ORLEANS, INSTEAD OF ST. LOUIS, AND PROVIDING FOR A TRI-WEEKLY SERVICE IN 20 DAYS' TIME, AT THE SAME TRAVEL SUPPLYING INTERMEDIATE OFFICERS ON THE ROUTE WITHOUT ADDITIONAL COST TO THE GOVERNMENT.  (P. 3147-2)."]
('senate', 36, 327, 1860, 7442, 22326, '404883', '-', 'Nay', 'CA')
congress 36 completed
['senate', 37, 560, 1862, 7452, 22356, 26, 14, 0, 7, 'unknown', 'unknown', 'unknown', 'On Passage', 'TO PASS THE HOUSE CONCURRENT RESOLUTION, AS AMENDED, WHICH PROVIDES THAT WHEN THE TWO HOUSES ADJOURN TODAY, THEY MEET AGAIN ON THE FIRST MONDAY IN JANUARY NEXT.  (P. 170-3).']
('senate', 37, 560, 1862, 7452, 22356, '411739', '-', 'Nay', 'MA')
['house', 37, 569, 1863, 7453, 22358, 57, 101, 0, 21, 'unknown', 'unknown', 'unknown', 'On the Amendment', 'TO AMEND S. 511 BY STRIKING OUT FROM SECTION 25 THE PROVI- SIONS FOR ARREST.  (P.1291-2).']
('house', 37, 569, 1863, 7453, 22358, '403481', '-', 'Nay', 'NY')
congress 37 completed
congress 38 completed
congress 39 completed
congress 40 completed
['senate', 41, 102, 1870, 7481, 22441, 23, 35, 0, 7, 'unknown', 'unknown', 'unknown', 'On the Amendment', 'TO AMEND S. J. RES. 85 BY PREDICATING SUCH REPRESENTATION ON AGREEMENT OF THE STATE CONSTITUTION WITH THE PROPOSED 15TH AMENDMENT OF THE UNITED STATES CONSTITUTION, GRANTING EQUAL VOTING RIGHTS TO WHITE AND COLORED CITIZENS. (P. 463-1).']
('senate', 41, 102, 1870, 7481, 22441, '402398', '0', 'Not Voting', 'NJ')
congress 41 completed
congress 42 completed
congress 43 completed
congress 44 completed
congress 45 completed
congress 46 completed
['house', 47, 180, 1882, 7530, 22589, 149, 2, 0, 140, 'unknown', 'unknown', 'unknown', 'TO PROCEED TO THE CONSIDERATION OF THE CONTESTED ELECTION CASE OF HORATIO BISBEE, JR. VS. JESSE J. FENLEY FROM SECOND DISTRICT OF FLORIDA.  (P. 4401-1).', 'TO PROCEED TO THE CONSIDERATION OF THE CONTESTED ELECTION CASE OF HORATIO BISBEE, JR. VS. JESSE J. FENLEY FROM SECOND DISTRICT OF FLORIDA.  (P. 4401-1).']
('house', 47, 180, 1882, 7530, 22589, '409515', '+', 'Yea', 'KS')
['senate', 47, 376, 1882, 7530, 22589, 39, 14, 0, 23, 'unknown', 'unknown', 'unknown', 'TO AGREE TO AN AMENDMENT TO S. 420 PROVIDING THAT NO CIRCUIT OR DISTRICT COURT JUDGE PRESIDING IN A MATTER OR CASE SHALL BE ELIGIBLE TO SIT IN THE APPEAL OF THE SAME TRIAL OR HEARING.  (P. 3596).', 'TO AGREE TO AN AMENDMENT TO S. 420 PROVIDING THAT NO CIRCUIT OR DISTRICT COURT JUDGE PRESIDING IN A MATTER OR CASE SHALL BE ELIGIBLE TO SIT IN THE APPEAL OF THE SAME TRIAL OR HEARING.  (P. 3596).']
('senate', 47, 376, 1882, 7530, 22589, '409588', '+', 'Yea', 'NE')
congress 47 completed
congress 48 completed
congress 49 completed
congress 50 completed
congress 51 completed
congress 52 completed
congress 53 completed
congress 54 completed
congress 55 completed
congress 56 completed
congress 57 completed
congress 58 completed
congress 59 completed
congress 60 completed
congress 61 completed
congress 62 completed
['house', 63, 94, 1914, 7657, 22971, 303, 3, 4, 125, 'unknown', 'unknown', 'unknown', 'On Passage', 'TO PASS H. R. 1933, A BILL TO LIMIT THE EFFECT OF THE REGU- LATION OF INTERSTATE COMMERCE BETWEEN THE STATES IN GOODS, WARES, AND MERCHANDISE WHOLLY OR IN PART MANUFACTURED, MINED OR PRODUCED BY CONVICT LABOR OR IN ANY PRISON REFORMATORY. (P.4302-2).']
('house', 63, 94, 1914, 7657, 22971, '409367', '+', 'Yea', 'MA')
congress 63 completed
congress 64 completed
congress 65 completed
congress 66 completed
congress 67 completed
congress 68 completed
congress 69 completed
congress 70 completed
congress 71 completed
congress 72 completed
congress 73 completed
congress 74 completed
congress 75 completed
congress 76 completed
congress 77 completed
congress 78 completed
congress 79 completed
congress 80 completed
['senate', 81, 430, 1950, 7803, 23409, 40, 33, 31, 5, 'unknown', 'unknown', 'unknown', 'S 4037.  FULBRIGHT AMEND. ELIMINATING PROVISION FOR THE ESTABLISHMENT IN THE STATE DEPT. OF A BUREAU OF PASSPORTS AND VISAS.', 'S 4037.  FULBRIGHT AMEND. ELIMINATING PROVISION FOR THE ESTABLISHMENT IN THE STATE DEPT. OF A BUREAU OF PASSPORTS AND VISAS.']
('senate', 81, 430, 1950, 7803, 23409, '401851', 'P', 'Present', 'NC')
congress 81 completed
congress 82 completed
congress 83 completed
['senate', 84, 206, 1956, 7827, 23479, 50, 39, 1, 6, 'unknown', 'Agreed To', 'unknown', 'HR. 12130.  FOREIGN AID APPROPRIATIONS.  COMMITTEE AMENDMENT TO INCREASE MILITARY ASSISTANCE FROM $1.7 TO $2.3 BILLION.', 'HR. 12130.  FOREIGN AID APPROPRIATIONS.  COMMITTEE AMENDMENT TO INCREASE MILITARY ASSISTANCE FROM $1.7 TO $2.3 BILLION.']
('senate', 84, 206, 1956, 7827, 23479, '407749', '+', 'Yea', 'CO')
congress 84 completed
congress 85 completed
congress 86 completed
congress 87 completed
congress 88 completed
congress 89 completed
congress 90 completed
congress 91 completed
['house', 92, 157, 1971, 7887, 23660, 370, 4, 0, 59, 'unknown', 'unknown', 'unknown', 'TO ADOPT H. CON. RES. 370, EXPRESSING THE SENSE OF CONGRESS RELATIVE TO CERTAIN ACTIVITIES OF PUBLIC HEALTH SERVICE HOSPITALS, CLINICS, AND RESEARCH CENTERS.', 'TO ADOPT H. CON. RES. 370, EXPRESSING THE SENSE OF CONGRESS RELATIVE TO CERTAIN ACTIVITIES OF PUBLIC HEALTH SERVICE HOSPITALS, CLINICS, AND RESEARCH CENTERS.']
('house', 92, 157, 1971, 7887, 23660, '410493', '+', 'Yea', 'KY')
congress 92 completed
['senate', 93, 403, 1973, 7895, 23685, 82, 4, 0, 14, 'unknown', 'unknown', 'unknown', 'On Passage', 'TO PASS H.R. 9639.']
('senate', 93, 403, 1973, 7895, 23685, '407451', '+', 'Yea', 'SD')
['house', 93, 414, 1973, 7896, 23687, 263, 147, 0, 29, 'unknown', 'unknown', 'unknown', 'On the Amendment', 'TO AMEND H.R. 11104 BY REDUCING THE INCREASE IN THE DEBT LIMIT BY $2.3 BILLION.']
('house', 93, 414, 1973, 7896, 23687, '400949', '+', 'Yea', 'OH')
['senate', 93, 663, 1974, 7897, 23691, 54, 33, 0, 14, 'unknown', 'unknown', 'unknown', 'On Passage', 'TO PASS S. 1401.']
('senate', 93, 663, 1974, 7897, 23691, '300056', '-', 'Nay', 'HI')
['senate', 93, 1020, 1974, 7900, 23698, 41, 39, 0, 21, 'unknown', 'unknown', 'unknown', 'TO RECOMMIT S.3394 TO THE COMMITTEE ON FOREIGN RELATIONS.', 'TO RECOMMIT S.3394 TO THE COMMITTEE ON FOREIGN RELATIONS.']
('senate', 93, 1020, 1974, 7900, 23698, '400783', '+', 'Yea', 'AL')
congress 93 completed
['senate', 94, 119, 1975, 7902, 23704, 24, 61, 0, 14, 'unknown', 'unknown', 'unknown', 'On the Amendment', 'TO AMEND S. 622 SO AS TO PROVIDE COMPENSATION FOR A PRODUCER OR ROYALTY OWNER WHOSE FIELD IS DAMAGED BY OVERPRODUCTION AS A RESULT OF GOVERNMENT ORDER.']
('senate', 94, 119, 1975, 7902, 23704, '408123', '-', 'Nay', 'WI')
['house', 94, 303, 1975, 7903, 23707, 380, 11, 0, 43, 'unknown', 'unknown', 'unknown', 'TO SUSPEND THE RULES AND PASS H.R. 6971, THE CONSUMER GOODS PRICING ACT OF 1975.', 'TO SUSPEND THE RULES AND PASS H.R. 6971, THE CONSUMER GOODS PRICING ACT OF 1975.']
('house', 94, 303, 1975, 7903, 23707, '402727', '+', 'Yea', 'ME')
congress 94 completed
congress 95 completed
['senate', 96, 321, 1979, 7919, 23757, 49, 29, 0, 22, 'unknown', 'unknown', 'unknown', 'On Passage', 'TO PASS H.R. 5369, PROVIDING FOR A COMBINED PERMANENT AND TEMPORARY LIMIT ON THE PUBLIC DEBT OF $879 BILLION THROUGH MAY 31, 1980.  (MOTION PASSED).']
('senate', 96, 321, 1979, 7919, 23757, '411979', '-', 'Nay', 'ND')
congress 96 completed
congress 97 completed
congress 98 completed
['senate', 99, 269, 1985, 7944, 23830, 53, 43, 0, 4, 'unknown', 'unknown', 'unknown', 'On the Amendment', 'TO AMEND S 1714 BY WHICH THE SECRETARY OF TRANSPORTATION ENSURES THAT THE GREAT LAKE PORTS RECEIVE THE SAME PERCENTAGE OR WEIGHT OF TONNAGE, WHICHEVER AMOUNT IS SMALLER, OF GOVERNMENT DONATED COMMODITIES THAT THEY HAVE AVERAGED OVER THE LAST FIVE YEARS.']
('senate', 99, 269, 1985, 7944, 23830, '407355', '-', 'Nay', 'ID')
congress 99 completed
['senate', 100, 656, 1988, 7955, 23863, 58, 33, 0, 9, 'unknown', 'unknown', 'unknown', 'On the Amendment', 'TO AMEND HR 4776, FISCAL 1989 DISTRICT OF COLUMBIA APPROPRIATIONS, TO EXEMPT LOCAL RELIGIOUS INSTITUTIONS FROM LOCAL LAWS PROHIBITING DISCRIMINATION BASED ON SEXUAL ORIENTATION.']
('senate', 100, 656, 1988, 7955, 23863, '408252', '+', 'Yea', 'GA')
congress 100 completed
['senate', 101, 216, 1990, 7963, 23888, 51, 48, 0, 1, '1/2', 'Motion to Table Agreed to', 'procedural', 'On the Motion to Table S.Amdt. 2522 to S. 2884 (National Defense Authorization Act for Fiscal Year 1991)', 'On the Motion to Table S.Amdt. 2522 to S. 2884 (National Defense Authorization Act for Fiscal Year 1991)']
('senate', 101, 216, 1990, 7963, 23888, '409923', '+', 'Yea', 'WY')
['senate', 101, 265, 1990, 7964, 23890, 91, 7, 0, 2, '1/2', 'Bill Passed', 'passage', 'On Passage of the Bill', 'On Passage of the Bill H.R. 5021']
('senate', 101, 265, 1990, 7964, 23890, '400557', '+', 'Yea', 'NV')
congress 101 completed
congress 102 completed
congress 103 completed
congress 104 completed
congress 105 completed
congress 106 completed
congress 107 completed
['house', 108, 128, 2003, 8014, 24040, 424, 0, 0, 10, '2/3', 'Passed', 'passage-suspension', 'On Motion to Suspend the Rules and Agree', 'On Motion to Suspend the Rules and Agree: H CON RES 141 Expressing the sense of the Congress that the Internal Revenue Code of 1986 should be fundamentally reformed to be fairer, simpler, and less costly and to encourage economic growth, individual liberty and investment in American jobs.']
('house', 108, 128, 2003, 8014, 24040, '400138', '+', 'Yea', 'TN')
congress 108 completed
congress 109 completed
congress 110 completed
congress 111 completed
['senate', 112, 166, 2011, 8048, 24142, 82, 3, 0, 15, '1/2', 'Nomination Confirmed', 'nomination', 'On the Nomination', 'On the Nomination PN11: Cathy Bissoon, of Pennsylvania, to be United States District Judge for the Western District of Pennsylvania']
('senate', 112, 166, 2011, 8048, 24142, '300093', '+', 'Yea', 'MI')
['house', 112, 781, 2011, 8048, 24142, 262, 167, 0, 4, '1/2', 'Passed', 'passage', 'On Passage of the Bill', 'On Passage: H R 3078 To implement the United States-Colombia Trade Promotion Agreement']
('house', 112, 781, 2011, 8048, 24142, '412270', '+', 'Aye', 'UT')
congress 112 completed
congress 113 completed
congress 114 completed

In [19]:
# check the schema
cur.execute("select * from sqlite_master").fetchall()


Out[19]:
[('table',
  'votes',
  'votes',
  2,
  'CREATE TABLE votes (location TEXT, session INTEGER, roll INTEGER, year INTEGER, quarter INTEGER, month INTEGER, id TEXT, vote TEXT, value TEXT, state TEXT, PRIMARY KEY (location,session,roll,year,id) )'),
 ('index', 'sqlite_autoindex_votes_1', 'votes', 3, None),
 ('table',
  'bills',
  'bills',
  4,
  'CREATE TABLE bills (location TEXT, session INTEGER, roll INTEGER, year INTEGER, quarter INTEGER, month INTEGER, aye INTEGER, nay INTEGER, present INTEGER, nv INTEGER, required TEXT, result TEXT, category TEXT, type TEXT, question TEXT, PRIMARY KEY (location,session,roll,year) )'),
 ('index', 'sqlite_autoindex_bills_1', 'bills', 5, None)]

In [20]:
print(vote_fields)
print(bill_fields)


['location', 'session', 'roll', 'year', 'quarter', 'month', 'id', 'vote', 'value', 'state']
['location', 'session', 'roll', 'year', 'quarter', 'month', 'aye', 'nay', 'present', 'nv', 'required', 'result', 'category', 'type', 'question']

In [21]:
# from the last bill
vote_data[0:10]


Out[21]:
[('house', 114, 163, 2016, 8066, 24196, '412630', '+', 'Yea', 'LA'),
 ('house', 114, 163, 2016, 8066, 24196, '400004', '+', 'Yea', 'AL'),
 ('house', 114, 163, 2016, 8066, 24196, '412615', '+', 'Yea', 'CA'),
 ('house', 114, 163, 2016, 8066, 24196, '412625', '+', 'Yea', 'GA'),
 ('house', 114, 163, 2016, 8066, 24196, '412438', '+', 'Yea', 'MI'),
 ('house', 114, 163, 2016, 8066, 24196, '412642', '+', 'Yea', 'NE'),
 ('house', 114, 163, 2016, 8066, 24196, '412655', '+', 'Yea', 'TX'),
 ('house', 114, 163, 2016, 8066, 24196, '412469', '+', 'Yea', 'PA'),
 ('house', 114, 163, 2016, 8066, 24196, '412541', '+', 'Yea', 'KY'),
 ('house', 114, 163, 2016, 8066, 24196, '400018', '+', 'Yea', 'TX')]

In [24]:
# from the db
cur.execute("select * from votes where session = 114 and location = 'house' and year = 2016 and roll = 163 limit 10").fetchall()


Out[24]:
[('house', 114, 163, 2016, 8066, 24196, '400004', '+', 'Yea', 'AL'),
 ('house', 114, 163, 2016, 8066, 24196, '400018', '+', 'Yea', 'TX'),
 ('house', 114, 163, 2016, 8066, 24196, '400021', '-', 'Nay', 'CA'),
 ('house', 114, 163, 2016, 8066, 24196, '400029', '+', 'Yea', 'UT'),
 ('house', 114, 163, 2016, 8066, 24196, '400030', '+', 'Yea', 'GA'),
 ('house', 114, 163, 2016, 8066, 24196, '400032', '+', 'Yea', 'TN'),
 ('house', 114, 163, 2016, 8066, 24196, '400033', '+', 'Yea', 'OR'),
 ('house', 114, 163, 2016, 8066, 24196, '400046', '+', 'Yea', 'TX'),
 ('house', 114, 163, 2016, 8066, 24196, '400047', '-', 'Nay', 'PA'),
 ('house', 114, 163, 2016, 8066, 24196, '400048', '-', 'Nay', 'FL')]

In [25]:
len(cur.execute("select id from votes").fetchall())
# 23,602,445 votes


Out[25]:
23602445

Now build the senators table

import the data


In [26]:
os.chdir('..')
os.chdir('congress-legislators')

historic_senators = pd.read_csv('legislators-historic.csv')
current_senators = pd.read_csv('legislators-current.csv')

print(historic_senators.shape)
print(current_senators.shape)

print(historic_senators.columns)
print(current_senators.columns)


(11807, 29)
(540, 29)
Index(['last_name', 'first_name', 'birthday', 'gender', 'type', 'state',
       'district', 'party', 'url', 'address', 'phone', 'contact_form',
       'rss_url', 'twitter', 'facebook', 'facebook_id', 'youtube',
       'youtube_id', 'bioguide_id', 'thomas_id', 'opensecrets_id', 'lis_id',
       'cspan_id', 'govtrack_id', 'votesmart_id', 'ballotpedia_id',
       'washington_post_id', 'icpsr_id', 'wikipedia_id'],
      dtype='object')
Index(['last_name', 'first_name', 'birthday', 'gender', 'type', 'state',
       'district', 'party', 'url', 'address', 'phone', 'contact_form',
       'rss_url', 'twitter', 'facebook', 'facebook_id', 'youtube',
       'youtube_id', 'bioguide_id', 'thomas_id', 'opensecrets_id', 'lis_id',
       'cspan_id', 'govtrack_id', 'votesmart_id', 'ballotpedia_id',
       'washington_post_id', 'icpsr_id', 'wikipedia_id'],
      dtype='object')

combine the historic and current data and add a few columns


In [27]:
current_senators.set_index('govtrack_id',verify_integrity=True,inplace=True)
historic_senators.set_index('govtrack_id',verify_integrity=True,inplace=True)

senators = pd.concat([historic_senators,current_senators],axis=0,join='outer',verify_integrity=True)
senators.shape


Out[27]:
(12347, 28)

In [28]:
def year_quarter_month(datetime):
    dt = pd.to_datetime(datetime)
    return pd.Series([dt.year,4*dt.year+dt.quarter,12*dt.year+dt.month])

In [29]:
df = senators.birthday.apply(year_quarter_month)
df.columns = pd.Index(['birth_year','birth_quarter','birth_month'])

df.shape


Out[29]:
(12347, 3)

In [30]:
senators = pd.concat([senators,df],axis=1,join='outer')
senators.head(5)


Out[30]:
last_name first_name birthday gender type state district party url address ... lis_id cspan_id votesmart_id ballotpedia_id washington_post_id icpsr_id wikipedia_id birth_year birth_quarter birth_month
govtrack_id
401222 Bassett Richard 1745-04-02 M sen DE NaN Anti-Administration NaN NaN ... NaN NaN NaN NaN NaN 507 NaN 1745 6982 20944
401521 Bland Theodorick 1742-03-21 M rep VA 9 NaN NaN NaN ... NaN NaN NaN NaN NaN 786 NaN 1742 6969 20907
402032 Burke Aedanus 1743-06-16 M rep SC 2 NaN NaN NaN ... NaN NaN NaN NaN NaN 1260 Aedanus Burke 1743 6974 20922
402334 Carroll Daniel 1730-07-22 M rep MD 6 NaN NaN NaN ... NaN NaN NaN NaN NaN 1538 Daniel Carroll 1730 6923 20767
402671 Clymer George 1739-03-16 M rep PA -1 NaN NaN NaN ... NaN NaN NaN NaN NaN 1859 George Clymer 1739 6957 20871

5 rows × 31 columns


In [31]:
senators.shape


Out[31]:
(12347, 31)

The database schema and create command


In [32]:
senator_schema = ['id','TEXT','first_name','TEXT','last_name','TEXT',
                  'state','TEXT','district','TEXT','party','TEXT',
                  'birthday','TEXT','birth_year','INTEGER','birth_quarter','INTEGER','birth_month','INTEGER',
                  'url','TEXT','twitter','TEXT','facebook','TEXT','facebook_id','TEXT',
                  'youtube','TEXT','youtube_id','TEXT',
                  'opensecrets_id','TEXT','wikipedia_id','TEXT','washington_post_id','TEXT']
senator_fields = senator_schema[0::2]
senator_keys = senator_schema[0:1]
senator_types = senator_schema[1::2]
print(senator_fields)
print(senator_keys)
print(senator_types)


['id', 'first_name', 'last_name', 'state', 'district', 'party', 'birthday', 'birth_year', 'birth_quarter', 'birth_month', 'url', 'twitter', 'facebook', 'facebook_id', 'youtube', 'youtube_id', 'opensecrets_id', 'wikipedia_id', 'washington_post_id']
['id']
['TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'INTEGER', 'INTEGER', 'INTEGER', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT', 'TEXT']

In [33]:
senators_command = (
    "create table senators ("+
    "{} {}, "*len(senator_fields)+
    "PRIMARY KEY ("+
    ','.join(senator_keys)+
    ") )").format(*senator_schema)
print(senators_command)


create table senators (id TEXT, first_name TEXT, last_name TEXT, state TEXT, district TEXT, party TEXT, birthday TEXT, birth_year INTEGER, birth_quarter INTEGER, birth_month INTEGER, url TEXT, twitter TEXT, facebook TEXT, facebook_id TEXT, youtube TEXT, youtube_id TEXT, opensecrets_id TEXT, wikipedia_id TEXT, washington_post_id TEXT, PRIMARY KEY (id) )

Subset on the columns we want in the db


In [34]:
senators['id'] = senators.index
print(senators.shape)
print(senators.columns)


(12347, 32)
Index(['last_name', 'first_name', 'birthday', 'gender', 'type', 'state',
       'district', 'party', 'url', 'address', 'phone', 'contact_form',
       'rss_url', 'twitter', 'facebook', 'facebook_id', 'youtube',
       'youtube_id', 'bioguide_id', 'thomas_id', 'opensecrets_id', 'lis_id',
       'cspan_id', 'votesmart_id', 'ballotpedia_id', 'washington_post_id',
       'icpsr_id', 'wikipedia_id', 'birth_year', 'birth_quarter',
       'birth_month', 'id'],
      dtype='object')

In [35]:
# The columns we want to keep
senators = senators[senator_fields]

Put it all in the db


In [36]:
def type_or_null(i,datatype):
    if pd.isnull(i):
        return None
    else:
        return datatype(i)

typedict = {'TEXT':str,'INTEGER':int}

In [37]:
data = []
for index in senators.index:
    row = senators.loc[index,]
    row_data = []
    for i in range(len(row)):
        row_data.append(type_or_null(row.iloc[i],typedict[senator_types[i]]))
    data.append(tuple(row_data))

In [38]:
print(len(data))
print(len(data[0]))
print(len(senator_fields))


12347
19
19

In [39]:
cur.execute(senators_command)
con.commit()

insert_rows(cur,'senators',data,senator_fields)
con.commit()

In [40]:
print(len(cur.execute('select youtube from senators').fetchall()))


12347

In [41]:
# every year is there- no gaps bigger than 1
np.diff(np.array(sorted(list(zip(*cur.execute("select distinct year from votes").fetchall()))[0])))


Out[41]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

Create indices for fast future queries


In [ ]:
# indices for fast queries
for time_col in ['year','quarter','month']:
    cur.execute("create index vote_{0} on votes({0},location)".format(time_col))
    con.commit()

That's all folks!


In [42]:
con.close()