In [58]:
# Loop through the directory; find data from each json; look in the text directories; open each subdirectory;
# read in each subdirectory json file; find the one with the latest date. Read and store text file associated
# with that directory

In [59]:
# Set up the database to save the results of the new york bill table
# There will be one table for the New York bills and one for U.S. bills
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

In [60]:
#In Python: Define a database name
dbname = 'bills_db'
username = 'Joel'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print engine.url

## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))


postgres://Joel@localhost/bills_db
True

In [61]:
# Actually create the table
Base.metadata.create_all(engine)

In [62]:
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
from sqlalchemy import Column, Integer, String
class US_Bill(Base):
    __tablename__ = 'us_bills'
    bill_num = Column(String, primary_key=True)
    bill_name = Column(String)
    bill_text = Column(String)
    top_subject = Column(String)

    def __repr__(self):
        return "<US_Bill(bill_num='%s', bill_name='%s', bill_text='%s,', top_subject='%s,')>" % (
            self.bill_num, self.bill_name, self.bill_text, self.top_subject)

In [63]:
class Bill_Subject(Base):
    __tablename__ = 'bill_subject'
    bill_num = Column(String, primary_key=True)
    subject = Column(String, primary_key=True)

    def __repr__(self):
        return "<Bill_Subject(bill_num='%s', subject='%s')>" % (
            self.bill_num, self.subject)

In [64]:
session.close()
from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=engine)
session = Session()

In [ ]:


In [65]:
import json
import os
import datetime
def get_latest_text_dir(path):
    latest_date = datetime.datetime(1900, 1, 1)
    dir_name = ""
    for status_dir in os.listdir(path):  
        status_file = path + '/' + status_dir + '/data.json'
        with open(status_file) as data_file:
            status_data = json.load(data_file)
            date = datetime.datetime.strptime(status_data['issued_on'], '%Y-%m-%d')
            if (date > latest_date):
                latest_date = date
                dir_name = status_dir
    # print(latest_date)
    return dir_name

In [66]:
# print(get_latest_text_dir('/Users/Joel/Desktop/Insight/data/bills/s/s2039/text-versions'))

In [67]:
Bill_Subject.__table__


Out[67]:
Table('bill_subject', MetaData(bind=None), Column('bill_num', String(), table=<bill_subject>, primary_key=True, nullable=False), Column('subject', String(), table=<bill_subject>, primary_key=True, nullable=False), schema=None)

In [68]:
import os
from pprint import pprint
bills_dir = '/Users/Joel/Desktop/Insight/data/bills'

for bill_type_dir in os.listdir(bills_dir):
    type_dir = bills_dir + '/' + bill_type_dir
    for bill_dir in os.listdir(type_dir):
        bill_path = type_dir + '/' + bill_dir

            #pprint(data)
        text_dir_base = bill_path + '/text-versions'
        if (os.path.isdir(text_dir_base)):
            dir_name = get_latest_text_dir(text_dir_base)
            text_name = text_dir_base + '/' + dir_name + '/document.txt'
            
            text = ""
            with open(text_name) as text_file:
                text = text_file.readlines()
                    
            outer_json = bill_path + '/' + 'data.json'
            with open(outer_json) as data_file:    
                bill_data = json.load(data_file)
                bill_num = bill_data['bill_id']
                bill_name = bill_data['official_title']
                top_subject = bill_data['subjects_top_term']
                one_bill = US_Bill(bill_num=bill_num, bill_name=bill_name, bill_text=text, top_subject=top_subject)
                session.add(one_bill)

                for term in bill_data['subjects']:
                    one_sub = Bill_Subject(bill_num=bill_num, subject=term)
                    session.add(one_sub)
                    

session.commit()
session.close()

In [ ]: