In [5]:
import json
import requests
import json_delta
'''
------------------------------------------------
--- A note about comparison techniques used ---
------------------------------------------------
json_delta is best for serializing/deserializing structures and
minimizing communication overhead. It may not be ideal for specialized
comparison of existing JSON structures.
'''
Out[5]:
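In [ ]:
# Sketch (added for illustration): the diff/patch round trip behind the note
# above; diff two structures, then patch the original to confirm the delta
# reproduces the "after" state. Sample structures are made up, and
# verbose=False mirrors the load_and_diff call further down.
before = {"foo": "bar", "count": 1}
after = {"foo": "baz", "count": 1}
delta = json_delta.diff(before, after, verbose=False)
print delta
print json_delta.patch(before, delta) == after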
In [21]:
import os
import glob # Wildcard search
file_pattern = "snapshots/"
file_pattern += "HealthData.gov[_][0-9][0-9][0-9][0-9][-][0-9][0-9][-][0-9][0-9][_]data.json"
#print "glob.glob("+file_pattern+")"
#print glob.glob(file_pattern)
In [35]:
def load_file(json_file_name):
    with open(json_file_name) as json_file:
        json_data_struct = json.load(json_file)
    return json_data_struct
file_pattern = "snapshots/"
file_pattern += "HealthData.gov[_][0-9][0-9][0-9][0-9][-][0-9][0-9][-][0-9][0-9][_]data.json"
file_list = glob.glob(file_pattern)
json_data_list = [dict() for x in range(len(file_list))]
'''
#--- Test comparison ---
json_data_list[0] = load_file(file_list[0])
json_data_list[1] = load_file(file_list[1])
totals = compare_datasets(json_data_list[0],json_data_list[1])
print totals
'''
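In [ ]:
# Sketch (added for illustration): what the commented-out test builds toward;
# load every matched snapshot, sorted by the date stamp in the filename so
# the list is chronological. Assumes the snapshot files exist locally.
for index, file_name in enumerate(sorted(file_list)):
    json_data_list[index] = load_file(file_name)
print str(len(file_list)) + " snapshot files loaded"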
In [278]:
# json_delta.load_and_diff('{"foo":"bar"}', '{"foo":"baz"}', verbose=False)
# print "\n\n----------------------------\n\n"
test = json_delta.load_and_udiff('{"foo":"bar"}', '{"foo":"baz"}')
print test
print '\n'.join(test)
In [269]:
# Note: udiff expects deserialized structures; passing JSON strings here
# diffs them as plain strings (load_and_udiff above handles parsing).
test = json_delta.udiff(left='{"foo":"bar"}', right='{"foo":"baz"}')
print test
print '\n'.join(test)
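In [ ]:
# Sketch (added for illustration): parsing the JSON strings first, so that
# udiff compares structures rather than strings; load_and_udiff above does
# this parsing internally.
left = json.loads('{"foo":"bar"}')
right = json.loads('{"foo":"baz"}')
print '\n'.join(json_delta.udiff(left, right))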
In [29]:
# recursively sort any lists it finds (and convert dictionaries
# to lists of (key, value) pairs so that they're orderable):
def ordered_json(obj):
    if isinstance(obj, dict):
        return sorted((k, ordered_json(v)) for k, v in obj.items())
    if isinstance(obj, list):
        return sorted(ordered_json(x) for x in obj)
    else:
        return obj
# print json_data[1]['dataset'][0]
#print
# print ordered_json(json_data[1]['dataset'][0])
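In [ ]:
# Sketch (added for illustration): ordered_json makes key and element order
# irrelevant, so reordered structures compare equal. Sample data is made up.
print ordered_json({'b': 1, 'a': [2, 1]}) == ordered_json({'a': [1, 2], 'b': 1}) # True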
In [21]:
# recursively sort any lists it finds (and convert dictionaries
# to lists of (key, value) pairs so that they're orderable):
import collections
def ordered_json2(obj, max_depth):
    # Cast to str so non-string leaves don't break the concatenation
    if max_depth == 1: return str(obj) + " (max depth reached)"
    if isinstance(obj, dict):
        print "Instance: dictionary -- Type " + str(type(obj))
        return sorted((k, ordered_json2(v, max_depth-1)) for k, v in obj.items())
        #return collections.OrderedDict(sorted(obj.items()))
        ## return sorted(ordered_json2({k:v}, max_depth-1) for k, v in obj.items())
    if isinstance(obj, list):
        print "Instance: list -- Type " + str(type(obj))
        return sorted(ordered_json2(x, max_depth-1) for x in obj)
    else:
        return obj
test = {}
#print ordered_json2(,5)
#print ordered_json2({'def1':['abc2','abc1',{'lab1':"c",'lab0':"b"}],'def0':'xyz'},5)
before_test = {'def1':['abc2','abc1',{'lab1':"a",'lab0':"b"}],'def0':'xyz'}
after_test = {'def1':['abc2','abc1',{'lab1':"a",'lab0':"c"}],'def0':'xyz'}
print ordered_json2([before_test,after_test],5)
#print json_data[1]['dataset'][0]
print
#print ordered_json(json_data[1]['dataset'][0])
'''
Converting the (key, value) tuples back into a dictionary would be the
next step, but that conversion alone isn't sufficient.
'''
Out[21]:
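In [ ]:
# Sketch (added for illustration): why the tuple-to-dictionary conversion
# alone isn't sufficient; the top level converts back cleanly, but nested
# values remain sorted (key, value) pair lists. Uses before_test from above.
import collections
pairs = ordered_json2(before_test, 5)
print collections.OrderedDict(pairs)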
In [218]:
len(json_data[0]['dataset'])
Out[218]:
In [1]:
# Create a dictionary of values for comparison
import json_delta
def compare_datasets(dataset_list_before, dataset_list_after):
    json_compare_dict = {}
    dataset_list_diff = {"Counts": {"Added":0, "Deleted":0, "Changed":0, "No Change":0},
                         "Diff": ""}
    '''
    #=== Sort entire datasets recursively for easier comparison later ===
    print("\n\n===dataset_before===")
    print(dataset_before)
    print("\n\n===dataset_after===")
    print(dataset_after)
    #dataset_before = ordered_json(dataset_before)
    #dataset_after = ordered_json(dataset_after )
    print("\n\n===ordered_json(dataset_before===")
    print(dataset_before)
    print("\n\n===ordered_json(dataset_after===")
    print(dataset_after)
    '''
    #=== First load the "after" values ===
    for index, dataset_after in enumerate(dataset_list_after):
        check_key = dataset_after['identifier']
        json_compare_dict[check_key] = {'Status'    : "Added",
                                        'Before'    : None,
                                        'After'     : dataset_after,
                                        'Difference': None
                                        }
        dataset_list_diff["Counts"]["Added"] += 1
    #=== Second load the "before" values ===
    for index, dataset_before in enumerate(dataset_list_before):
        check_key = dataset_before['identifier']
        if check_key in json_compare_dict:
            # Present in both snapshots, so check for differences
            dataset_after = json_compare_dict[check_key]['After']
            # Must compare sorted versions of the JSON structures
            if ordered_json(dataset_after) == ordered_json(dataset_before):
                diff_status = "No Change"
            else:
                diff_status = "Changed"
                # Analyze the difference only if changed; note that "Diff"
                # holds only the most recent changed dataset's udiff
                udiff_list = list(json_delta.udiff(
                    dataset_before,
                    dataset_after
                ))  # materialize so it can be both joined and printed below
                udiff_output = '\n'.join(udiff_list)
                dataset_list_diff["Diff"] = udiff_output
                print "\n\n==dumps_before==========\n\n"
                print 'type=' + str(type(dataset_before))
                print json.dumps(dataset_before)
                print "\n\n==dumps_after==========\n\n"
                print 'type=' + str(type(dataset_after))
                print json.dumps(dataset_after)
                print "\n\n==load udiff list==========\n\n"
                print 'type=' + str(type(udiff_list))
                print udiff_list
                print "\n\n==load udiff output==========\n\n"
                print 'type=' + str(type(udiff_output))
                print udiff_output
                #return dataset_list_diff # (debug) uncomment to stop at the first change
            # This identifier existed before, so it was not newly added;
            # decrementing only in this branch (not for deletions) keeps the
            # count correct without the old max(0, ...) clamp
            dataset_list_diff["Counts"]["Added"] -= 1
        else:
            # Deleted: present before, absent after
            diff_status = "Deleted"
            json_compare_dict[check_key] = {'Status': diff_status,
                                            'Before': dataset_before
                                            }
        dataset_list_diff["Counts"][diff_status] += 1
    return dataset_list_diff
#--- Test it ---
#totals = compare_datasets(json_data[0]['dataset'][:2],json_data[1]['dataset'][:2])
#totals = compare_datasets(json_data[0]['dataset'],json_data[1]['dataset'])
#print totals
totals = compare_datasets(json_data_list[0],json_data_list[1])
print totals["Counts"]
print totals["Diff"]
In [57]:
if "landingPage" in json_data_list[0][0]: print json_data_list[0][0]["landingPage"]
if "publisher" in json_data_list[0][0]: print json_data_list[0][0]["publisher"]
if "bureauCode" in json_data_list[0][0]: print json_data_list[0][0]["bureauCode"]
print json.dumps(json_data_list[0][0], sort_keys=False, indent=4)
In [211]:
#=== Check status counts between consecutive days ===
totals = {}
for key, value in json_compare_dict.iteritems():
    #print dataset['identifier']
    check_status = value['Status']
    if check_status in totals:
        totals[check_status] += 1
    else:
        totals[check_status] = 1
    #break
totals
Out[211]:
In [244]:
#json_data[0]['dataset'][0,1,3]
for dataset in json_data[0]['dataset'][:2]:
    # print dataset
    check_key = dataset['identifier']
#json_data[0]['dataset'][:2]
In [3]:
import requests
# find list of files
directory_url = 'http://data-staging.civicagency.org/archive/datajson'
directory_response = requests.get(directory_url)
In [25]:
#directory_response.status_code
#directory_response.text
In [ ]:
# Get HTML page
from lxml import html
tree = html.fromstring(directory_response.content)
tree.xpath  # exploration only; the BeautifulSoup approach below is used instead
In [29]:
from bs4 import BeautifulSoup
directory_soup = BeautifulSoup(directory_response.text, 'html.parser')  # explicit parser avoids bs4's warning
In [124]:
suffix_url = '/49021.json'
datajson_url_list = []
for a_tag in directory_soup.find_all('a', href=True):
    a_text = a_tag.text.replace("/", "")
    if valid_date(a_text):
        print(directory_url+"/"+a_text+suffix_url)
        datajson_url_list.append(directory_url+"/"+a_text+suffix_url)
    #print("Found the URL:"+ a_tag['href'] + " Text: "+ a_tag.text, valid_date(a_tag.text))
    #print re.sub('[/]', '', a_tag.text)
# Sorts list to start with most recent
datajson_url_list.sort(reverse=True)
In [137]:
read_limit = 10
for index, url in enumerate(datajson_url_list):
    print url
    head = requests.head(url, headers={'Accept-Encoding': 'identity'})
    # print (head.headers['content-length'])
    if head.status_code == 200:
        break
    else:
        print head.status_code
    if read_limit:
        if index+1 >= read_limit:
            break
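In [ ]:
# Sketch (added for illustration): once the HEAD check above succeeds, url
# points at the first live snapshot; fetch and parse it. Assumes the payload
# has the same 'dataset' key used elsewhere in this notebook.
response = requests.get(url)
if response.status_code == 200:
    snapshot = response.json()
    if 'dataset' in snapshot:
        print str(len(snapshot['dataset'])) + " datasets in this snapshot"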
In [158]:
# Function to obtain size of page from headers
def get_page_size(url):
    head = requests.head(url, headers={'Accept-Encoding': 'identity'})
    # print (head.headers['content-length'])
    if head.status_code == 200:
        if 'Content-Length' in head.headers:
            size = head.headers['Content-Length']
            return size
    return 0 # When no size is available
#--- Test function ---
print datajson_url_list[100]+": "+str(get_page_size(datajson_url_list[100]))
print datajson_url_list[0] +": "+str(get_page_size(datajson_url_list[0 ]))
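In [ ]:
# Sketch (added for illustration): Content-Length comes back as a string,
# so cast before doing arithmetic. Assumes datajson_url_list is populated.
total_bytes = 0
for url in datajson_url_list[:5]:
    total_bytes += int(get_page_size(url))
print str(total_bytes) + " bytes across the first five snapshots"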
In [70]:
# Function to help find relevant links
import datetime
def valid_date(date_text):
    try:
        datetime.datetime.strptime(date_text, '%Y-%m-%d')
        return True
    except ValueError:
        # raise ValueError("Incorrect data format, should be YYYY-MM-DD")
        return False
print(valid_date('2003-12-223'))  # False: '223' is not a valid two-digit day
In [70]:
# Scratch: a HEAD request with 'Accept-Encoding: identity' reports the
# uncompressed Content-Length
r = requests.head('http://pymotw.com/2/urllib/index.html', headers={'Accept-Encoding': 'identity'})
print r.headers['content-length']
In [41]:
#json_data[0] #['Dataset']
json_data[0]['dataset'][0]
Out[41]: