In [58]:
# Loop through the directory; find data from each json; look in the text directories; open each subdirectory;
# read in each subdirectory json file; find the one with the latest date. Read and store text file associated
# with that directory
In [59]:
# Set up the database to save the results of the new york bill table
# There will be one table for the New York bills and one for U.S. bills
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
In [60]:
#In Python: Define a database name
dbname = 'bills_db'
username = 'Joel'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print engine.url
## create a database (if it doesn't exist)
if not database_exists(engine.url):
create_database(engine.url)
print(database_exists(engine.url))
In [61]:
# Actually create the table
Base.metadata.create_all(engine)
In [62]:
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
from sqlalchemy import Column, Integer, String
class US_Bill(Base):
__tablename__ = 'us_bills'
bill_num = Column(String, primary_key=True)
bill_name = Column(String)
bill_text = Column(String)
top_subject = Column(String)
def __repr__(self):
return "<US_Bill(bill_num='%s', bill_name='%s', bill_text='%s,', top_subject='%s,')>" % (
self.bill_num, self.bill_name, self.bill_text, self.top_subject)
In [63]:
class Bill_Subject(Base):
__tablename__ = 'bill_subject'
bill_num = Column(String, primary_key=True)
subject = Column(String, primary_key=True)
def __repr__(self):
return "<Bill_Subject(bill_num='%s', subject='%s')>" % (
self.bill_num, self.subject)
In [64]:
session.close()
from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=engine)
session = Session()
In [ ]:
In [65]:
import json
import os
import datetime
def get_latest_text_dir(path):
latest_date = datetime.datetime(1900, 1, 1)
dir_name = ""
for status_dir in os.listdir(path):
status_file = path + '/' + status_dir + '/data.json'
with open(status_file) as data_file:
status_data = json.load(data_file)
date = datetime.datetime.strptime(status_data['issued_on'], '%Y-%m-%d')
if (date > latest_date):
latest_date = date
dir_name = status_dir
# print(latest_date)
return dir_name
In [66]:
# print(get_latest_text_dir('/Users/Joel/Desktop/Insight/data/bills/s/s2039/text-versions'))
In [67]:
Bill_Subject.__table__
Out[67]:
In [68]:
import os
from pprint import pprint
bills_dir = '/Users/Joel/Desktop/Insight/data/bills'
for bill_type_dir in os.listdir(bills_dir):
type_dir = bills_dir + '/' + bill_type_dir
for bill_dir in os.listdir(type_dir):
bill_path = type_dir + '/' + bill_dir
#pprint(data)
text_dir_base = bill_path + '/text-versions'
if (os.path.isdir(text_dir_base)):
dir_name = get_latest_text_dir(text_dir_base)
text_name = text_dir_base + '/' + dir_name + '/document.txt'
text = ""
with open(text_name) as text_file:
text = text_file.readlines()
outer_json = bill_path + '/' + 'data.json'
with open(outer_json) as data_file:
bill_data = json.load(data_file)
bill_num = bill_data['bill_id']
bill_name = bill_data['official_title']
top_subject = bill_data['subjects_top_term']
one_bill = US_Bill(bill_num=bill_num, bill_name=bill_name, bill_text=text, top_subject=top_subject)
session.add(one_bill)
for term in bill_data['subjects']:
one_sub = Bill_Subject(bill_num=bill_num, subject=term)
session.add(one_sub)
session.commit()
session.close()
In [ ]: