In [1]:
from __future__ import print_function, unicode_literals  # must come before any other statement

import gzip
import json
import os
import re
In [2]:
catalog_dir = os.path.join('..','catalogs')
In [3]:
files = list(filter(lambda x: bool(re.search(r'json\.gz$', x)), os.listdir(catalog_dir)))
In [4]:
files
Out[4]:
In [5]:
my_files = [os.path.join(catalog_dir, files[k]) for k in (3, 6, 9, 8, 10)] # we love list comprehensions!
In [6]:
my_files
Out[6]:
Give the databases short names for reference
In [7]:
names = ['EI (u)', 'ELCD', 'GaBi-Pro', 'GaBi-Ext', 'US LCI']
In [8]:
def load_archive(filename):
    # read a gzipped JSON catalog and return it as a dict
    with gzip.open(filename, 'r') as fp:
        J = json.load(fp)
    return J
Load all the archives into a collection called 'C'
In [9]:
C = [load_archive(fname) for fname in my_files]
(should take about 2-5 seconds)
In [10]:
geog = []
for i, archive in enumerate(C):
    for p in archive['processes']:
        geog.append({'db': names[i], 'process': p['Name'], 'geog': p['SpatialScope']})
There should be one entry in geog for each process listed in a database, for a total of around 25,000 processes
In [11]:
len(geog)
Out[11]:
Use pandas to draw the pivot tables
In [12]:
import pandas as pd
In [13]:
P = pd.DataFrame(geog).pivot_table(index='geog', columns='db', aggfunc=len, fill_value='', margins=True)
In [14]:
P.sort_values(by=('process','All'), ascending=False)[:20] # only show the top 20 rows
Out[14]:
In [15]:
from collections import Counter
In [16]:
def create_flow_map(archive):
    """
    This function creates a hash map from the entityId to the flow entity itself - very fast
    """
    flow_map = dict()
    for f in archive['flows']:
        flow_map[f['entityId']] = f
    return flow_map
In [17]:
def count_ref_flows(archive):
    rfs = Counter()
    flow_map = create_flow_map(archive)
    for i in archive['processes']:
        x = [v for v in i['exchanges'] if 'isReference' in v and v['isReference'] is True]
        if len(x) == 0:
            count_key = (None, None)
            rfs[count_key] += 1
        else:
            for xc in x:
                direc = xc['direction']
                flowref = xc['flow']
                try:
                    flowname = flow_map[flowref]['Name']
                except KeyError:
                    # some archives store the flow reference as a string; retry with an integer key
                    try:
                        flowname = flow_map[int(flowref)]['Name']
                    except (KeyError, ValueError):
                        flowname = 'Flow Not Found!'
                count_key = (direc, flowname)
                rfs[count_key] += 1
    return rfs
In [18]:
rf_count = []
for i, archive in enumerate(C):
    print('Parsing archive %s' % names[i])
    rfs = count_ref_flows(archive)
    for rf, count in rfs.items():
        try:
            rf_count.append({'db': names[i], 'exchange': '%s: %s' % (rf[0], rf[1]), 'count': count})
        except TypeError:
            print('rf: %s (type %s) count: %d' % (rf, type(rf), count))
(should take << 1 second)
In [19]:
RF = pd.DataFrame(rf_count).pivot_table(index='exchange', columns='db', aggfunc=sum, fill_value='', margins=True)
In [20]:
RF.sort_values(('count','All'), ascending=False)[:20]
Out[20]:
In [21]:
from collections import defaultdict  # , Counter  # already imported

def tags(entity, look_in, delimiter=r';\s*|,\s*|\s*\(|\)\s*|/'):
    r"""
    tags(entity, look_in, delimiter=r';\s*|,\s*|\s*\(|\)\s*|/')
    Parse the specified fields to generate a list of tags, delimited as specified
    entity: a JSON serialized entity
    look_in: a set of fields to extract tags from
    delimiter: regexp for re.split()  Default specifies:
      semicolon with trailing space OR
      comma with trailing space OR
      open parens with leading space OR
      close parens with trailing space OR
      slash
    """
    tags = set()
    for k, v in entity.items():
        if v is None:
            continue
        if k in look_in:
            try:
                tags = tags.union('='.join([k, f]) for f in filter(bool, re.split(delimiter, v)))
            except TypeError:
                # field value is a list (e.g. Classifications): join it into one string first
                tags = tags.union('='.join([k, f]) for f in filter(bool, re.split(delimiter, ', '.join(v))))
    return tags
def count_tags(e_list, search=None, include=None, exclude=None):
    """
    count_tags(e_list, search=None, include=None, exclude=None)
    Extract tags from entity list.
    Optional search term: only counts entities where the search term is found
    Default fields: 'Name', 'Comment', 'SpatialScope', 'Classifications'
    add additional fields with include=; remove fields with exclude=
    Returns d, m
      d = a Counter object containing tags with their counts
      m = a dictionary: keys are tags, values are lists of entity IDs bearing the tag
    """
    look_in = {'Name', 'Comment', 'SpatialScope', 'Classifications'}
    if include is not None:
        look_in = look_in.union(set(include))
    if exclude is not None:
        look_in = look_in.difference(set(exclude))
    d = Counter()
    m = defaultdict(list)
    for e in e_list:
        t = tags(e, look_in)
        if search is not None:
            if not any([bool(re.search(search, k, flags=re.IGNORECASE)) for k in t]):
                continue
        for i in t:
            d[i] += 1
            m[i].append(e['entityId'])
    return d, dict(m)
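The tags come out as strings of the form 'Field=value'. For illustration, a hypothetical entity (made up for this note, not drawn from any of the catalogs) would tag like this:
tags({'Name': 'transport, passenger car (EURO 5)', 'SpatialScope': 'CH'}, {'Name', 'SpatialScope'})
# -> {'Name=transport', 'Name=passenger car', 'Name=EURO 5', 'SpatialScope=CH'}  (set order may vary)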
In [22]:
for i, archive in enumerate(C):
    # Search each catalog for processes containing the term 'EURO.?[0-9]' and print their most common tags
    print('\n%s:' % names[i])
    d0, m0 = count_tags(archive['processes'], search='EURO.?[0-9]', include=['TechnologyLevel', 'IsicClass'])
    print([k for k in d0.most_common() if k[1] > 10])
After that, it's just some formatting to get it into tabular form
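A minimal sketch of one way to do that formatting, reusing the pivot-table approach from above (the tag_rows name and the count > 10 threshold are illustrative choices, not taken from the original notebook):
tag_rows = []
for i, archive in enumerate(C):
    d0, m0 = count_tags(archive['processes'], search='EURO.?[0-9]', include=['TechnologyLevel', 'IsicClass'])
    for tag, count in d0.most_common():
        if count > 10:  # keep only the commonly occurring tags, as in the cell above
            tag_rows.append({'db': names[i], 'tag': tag, 'count': count})

T = pd.DataFrame(tag_rows).pivot_table(index='tag', columns='db', aggfunc=sum, fill_value='')
T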