In [1]:
import json
In [2]:
def parse_date(file_name):
    # The date begins one character past the "_" in "_20xx-xx-xx"
    starting_point_of_date = "_20"
    date_pos_start = file_name.find(starting_point_of_date) + 1
    return file_name[date_pos_start:date_pos_start + 10]
'''
#--- Test it ---
parse_date("snapshots/HealthData.gov_2014-02-24_data.json")
'''
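'''
#--- Alternative sketch (hypothetical, not used below): regex-based extraction,
#--- assuming snapshot dates are always formatted YYYY-MM-DD ---
import re
def parse_date_re(file_name):
    match = re.search(r"\d{4}-\d{2}-\d{2}", file_name)
    return match.group(0) if match else None
'''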
Out[2]:
In [3]:
# Pull out the most important elements to tally on
def get_keys(dataset):
    keys = ["bureauCode", "programCode", "publisher",
            "landingPage", "modified",
            "Identifier", "downloadURL"]
    '''
    Characteristics of non-federal entries for DKAN
    → Publisher:Name is "State of" or "City of"
    → downloadURL has a non-hhs domain
    → Identifier has a non-hhs domain
    → Usually "bureauCode": ["009:00"] and "programCode": ["009:000"]
    '''
    key_values = []
    for key in keys:
        key_values.append(dataset.get(key))
    return dict(zip(keys, key_values))
'''
------------------------------------------------
--- Capturing agency counts ---
------------------------------------------------
Many dataset entries lack bureauCode,
so other identifiers may serve as a proxy
(see the sketch below).
'''
'''
#--- Test it ---
print get_keys(json_data_list[0][0])
'''
'''
# --- Experiment with keys ---
if "bureauCode" in json_data_list[0][0]: print json_data_list[0][0]["bureauCode"]
if "publisher" in json_data_list[0][0]: print json_data_list[0][0]["publisher"]
if "landingPage" in json_data_list[0][0]: print json_data_list[0][0]["landingPage"]
print json.dumps(json_data_list[0][0], sort_keys=False, indent=4)
'''
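'''
#--- Sketch (hypothetical helper): flag likely non-federal entries using
#--- the DKAN characteristics above; publisher serves as a proxy when
#--- bureauCode is missing or generic ---
def looks_non_federal(keys):
    publisher = str(keys.get("publisher") or "")
    return ("State of" in publisher or "City of" in publisher
            or keys.get("bureauCode") == ["009:00"])
'''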
Out[3]:
In [4]:
# Create a dictionary of key values for each dataset entry
def get_key_list(dataset_list):
    key_list = []
    for dataset in dataset_list:
        key_list.append(get_keys(dataset))
    return key_list
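# Equivalent one-liner: key_list = [get_keys(d) for d in dataset_list]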
In [196]:
def support_old_schema(dataset_list):
    # Newer snapshots wrap entries as {"dataset": [...]}; older ones are a bare list
    if isinstance(dataset_list, dict):
        return dataset_list["dataset"]
    elif isinstance(dataset_list, list):
        return dataset_list
    else:
        return None
'''
#--- Test it ---
print support_old_schema(dataset)
'''
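'''
#--- Example inputs (illustrative) ---
# New schema: {"dataset": [{...}, {...}]}  -> returns the inner list
# Old schema: [{...}, {...}]               -> returned unchanged
'''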
Out[196]:
In [9]:
import os
import glob # Wildcard search
import json
def load_file(file_name):
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    print("Loaded file: " + file_name)
    return json_data
#FIXME: Need to find agency decode
# Examples: {'009:25', '009:15', '009:92', '009:10', '009:75',
# '009:20', '009:30', '009:17', '009:70', '009:00',
# '009:38', '009:33'}
def main():
    file_pattern = "snapshots/"
    file_pattern += "HealthData.gov[_][0-9][0-9][0-9][0-9][-][0-9][0-9][-][0-9][0-9][_]data.json"
    # Matches e.g. "snapshots/HealthData.gov_2014-02-24_data.json"
    file_name_list = glob.glob(file_pattern)
    agency_lookup = load_agency_lookup()
    for index, file_name in enumerate(reversed(file_name_list)):
        snapshot_date = parse_date(file_name)
        dataset_list = load_file(file_name)
        dataset_list = support_old_schema(dataset_list)
        key_list = get_key_list(dataset_list)
        agency_counts = get_agency_counts(key_list, agency_lookup)
        print snapshot_date + ": " + str(agency_counts) + "\n"
        if index > 0: break  # Don't run all while debugging
In [10]:
main()
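# Prints one line per snapshot, e.g. (counts hypothetical):
# 2014-02-24: {'State': 120, 'Other': 45}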
In [259]:
#======================
#== This section for main()
#======================
# FIXME: Change order to save each row immediately, rather than all at the end
file_pattern = "snapshots/"
#file_pattern += "*2015-02-01*"
#file_pattern += "*2015-06-18*"
file_pattern += "HealthData.gov[_][0-9][0-9][0-9][0-9][-][0-9][0-9][-][0-9][0-9][_]data.json"
file_name_list = glob.glob(file_pattern)
agency_lookup = load_agency_lookup()
dict_counts_by_date = {}
for index, file_name in enumerate(reversed(file_name_list)):
    snapshot_date = parse_date(file_name)
    dataset_list = load_file(file_name)
    dataset_list = support_old_schema(dataset_list)
    key_list = get_key_list(dataset_list)
    agency_counts = get_agency_counts(key_list, agency_lookup)
    dict_counts_by_date[snapshot_date] = agency_counts
    #if index > 15: break  # Don't run all while debugging
convert_dict_to_csv(dict_counts_by_date, agency_lookup)
In [261]:
print snapshot_date
In [251]:
def get_agency_abbrev_list(agency_lookup):
    # Sort by bureau code so the CSV columns come out in a stable order
    agency_abbrev_list = []
    for bureau_code in sorted(agency_lookup.keys()):
        agency_abbrev_list.append(agency_lookup[bureau_code])
    return agency_abbrev_list
#agency_abbrev_list = get_agency_abbrev_list(agency_lookup)
Out[251]:
In [254]:
import csv
def convert_dict_to_csv(dict_counts_by_date, agency_lookup):
    # --- Make sure the list of abbreviations is sorted by bureau code ---
    agency_abbrev_list = get_agency_abbrev_list(agency_lookup)
    row_csv_list = []
    # --- Build the header row ---
    row_csv = ["Date"]
    for agency_abbrev in agency_abbrev_list:
        row_csv.append(agency_abbrev)
    row_csv_list.append(row_csv)
    # --- One row per snapshot date, columns sorted by bureau code ---
    for row_date, row_counts in dict_counts_by_date.iteritems():
        row_csv = [row_date]
        for agency_abbrev in agency_abbrev_list:
            row_csv.append(str(row_counts.get(agency_abbrev, 0)))
        row_csv_list.append(row_csv)
    with open("generated/totals_by_agency.csv", "wb") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(row_csv_list)
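'''
#--- Resulting CSV layout (values illustrative) ---
# Date,<abbrev_1>,<abbrev_2>,...
# 2014-02-24,<count>,<count>,...
# One row per snapshot date, one column per agency abbreviation sorted
# by bureau code. Note the file is rewritten from scratch ("wb") on
# every call, not appended to.
'''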
In [253]:
def get_agency_counts(key_list, agency_lookup):
    agency_counts = {}
    for key_item in key_list:
        agencies = key_item["bureauCode"]
        # Just in case it's not a list, make it one
        agencies = agencies if isinstance(agencies, list) else [agencies]
        for agency in agencies:
            agency_abbrev = agency_lookup.get(agency, "Other")
            # Occasionally a bureauCode of "009:00" is used for State/Local
            if agency == "009:00":
                publisher_name = key_item["publisher"]
                # Stringify when publisher is a dictionary, so the
                # substring checks below still work
                if isinstance(publisher_name, dict): publisher_name = str(publisher_name)
                if "State of" in publisher_name:
                    agency_abbrev = "State"
                elif "City of" in publisher_name:
                    agency_abbrev = "City"
            agency_counts[agency_abbrev] = agency_counts.get(agency_abbrev, 0) + 1
    return agency_counts
'''
#--- Test it ---
agency_lookup = load_agency_lookup()
agency_counts = get_agency_counts(key_list, agency_lookup)
print agency_counts
'''
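'''
#--- Illustrative shapes (values hypothetical) ---
# key_list = [{"bureauCode": ["009:38"], "publisher": {...}, ...}, ...]
# get_agency_counts(key_list, agency_lookup)
#   -> {"<abbrev>": 12, "State": 3, "Other": 1}
'''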
In [104]:
def load_agency_lookup():
    # Build a {bureau_code: agency_abbrev} dictionary from a column-oriented JSON file
    with open('agency_lookup_columns.json') as data_file:
        agency_lookup_columns = json.load(data_file)
    bureau_code_index = agency_lookup_columns['columns'].index('bureau_code')
    agency_abbrev_index = agency_lookup_columns['columns'].index('agency_abbrev')
    agency_lookup = {}
    for agency_record in agency_lookup_columns['data']:
        # Convert unicode abbreviations to plain ASCII strings
        agency_lookup[agency_record[bureau_code_index]] = agency_record[agency_abbrev_index].encode('ascii', 'ignore')
    return agency_lookup
#print load_agency_lookup()
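'''
#--- Expected shape of agency_lookup_columns.json (assumed from the
#--- index() calls above; values illustrative) ---
# {"columns": ["bureau_code", "agency_abbrev", ...],
#  "data": [["009:38", "<abbrev>", ...], ...]}
'''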
In [107]:
import os
'''
------------------------------------------------
--- Reload the file only if it changed ---
------------------------------------------------
'''
def get_csv_data(last_mtime=0, csv_data=[]):
    CSV_FILE_NAME = "generated/totals_by_agency.csv"
    try:
        mtime = os.path.getmtime(CSV_FILE_NAME)
    except OSError:
        mtime = 0
    # Re-read only if the file is newer than what the caller last saw
    if mtime > last_mtime:
        with open(CSV_FILE_NAME) as csv_file:
            csv_reader = csv.reader(csv_file)
            csv_data = []
            for row in csv_reader:
                csv_data.append(row)
    return (mtime, csv_data)
#mtime, csv_data = get_csv_data()
#print mtime
#print csv_data
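'''
#--- Usage pattern (sketch): feed the previous result back in, so the
#--- file is re-read only when its mtime advances ---
# last_mtime, csv_data = 0, []
# last_mtime, csv_data = get_csv_data(last_mtime, csv_data)
'''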
In [109]:
'''
------------------------------------------------
--- Check data from CSV
------------------------------------------------
'''
#: First time through, start from zero
last_mtime = 0
csv_data = []
'''
#: Alternative, if not setting first time
last_mtime = 0 if not ('last_mtime' in locals()) else last_mtime
csv_data = [] if not ('csv_data' in locals()) else csv_data
'''
#: Get data from the file and update the timestamp
last_mtime, csv_data = get_csv_data(last_mtime, csv_data)
#: Pull the date column out of the loaded rows
csv_dates = []
header = csv_data[0]
date_pos = header.index('Date')
for row in csv_data[1:]:
    csv_dates.append(row[date_pos])
#print csv_dates
#print csv_data
In [ ]:
def get_file_name_list():
    file_pattern = "snapshots/"
    file_pattern += "HealthData.gov[_][0-9][0-9][0-9][0-9][-][0-9][0-9][-][0-9][0-9][_]data.json"
    file_name_list = glob.glob(file_pattern)
    return file_name_list
In [111]:
def get_csv_date_list(csv_data):
    # Return the 'Date' column from the loaded CSV rows (header excluded)
    csv_date_list = []
    header = csv_data[0]
    date_pos = header.index('Date')
    for row in csv_data[1:]:
        csv_date_list.append(row[date_pos])
    return csv_date_list
#print get_csv_date_list(csv_data)
In [ ]:
'''
------------------------------------------------
--- Load only the dates missing from the CSV ---
------------------------------------------------
'''
file_name_list = get_file_name_list()
mtime, csv_data = get_csv_data()
csv_date_list = get_csv_date_list(csv_data)
agency_lookup = load_agency_lookup()
dict_counts_by_date = {}
#: Tally only the snapshots whose dates are not yet in the CSV
for index, file_name in enumerate(reversed(file_name_list)):
    snapshot_file_date = parse_date(file_name)
    if snapshot_file_date not in csv_date_list:
        print "Running date: " + snapshot_file_date
        dataset_list = load_file(file_name)
        dataset_list = support_old_schema(dataset_list)
        key_list = get_key_list(dataset_list)
        agency_counts = get_agency_counts(key_list, agency_lookup)
        dict_counts_by_date[snapshot_file_date] = agency_counts
    #if index > 15: break  # Don't run all while debugging
# FIXME: convert_dict_to_csv rewrites the whole file ("wb"), so this saves
# only the newly tallied dates; merge with the existing rows before writing
convert_dict_to_csv(dict_counts_by_date, agency_lookup)