In [1]:
from __future__ import print_function, unicode_literals  # must come before any other statement

import gzip
import json
import os
import re
In [2]:
catalog_dir = os.path.join('..','catalogs')
In [3]:
files = list(filter(lambda x: bool(re.search(r'json\.gz$', x)), os.listdir(catalog_dir)))
In [4]:
files
Out[4]:
In [5]:
my_files = [os.path.join(catalog_dir, files[k]) for k in (3, 6, 9, 8, 10)] # we love list comprehensions!
In [6]:
my_files
Out[6]:
Give the databases short names for reference
In [7]:
names = ['EI (u)', 'ELCD', 'GaBi-Pro', 'GaBi-Ext', 'US LCI']
In [8]:
def load_archive(filename):
    # read a gzipped JSON catalog and return it as a dict
    with gzip.open(filename, 'r') as fp:
        J = json.load(fp)
    return J
Load all the archives into a collection called 'C'
In [9]:
C = [load_archive(fname) for fname in my_files]
(should take about 2-5 seconds)
In [10]:
geog = []
for i, archive in enumerate(C):
    for p in archive['processes']:
        geog.append({'db': names[i], 'process': p['Name'], 'geog': p['SpatialScope']})
There should be one entry in geog for each process listed in a database, for a total of around 25,000 processes
In [11]:
len(geog)
Out[11]:
Use pandas to draw the pivot tables
In [12]:
import pandas as pd
In [13]:
P = pd.DataFrame(geog).pivot_table(index='geog', columns='db', aggfunc=len, fill_value='', margins=True)
In [14]:
P.sort_values(by=('process','All'), ascending=False)[:20] # only show the top 20 rows
Out[14]:
In [15]:
from collections import Counter
In [16]:
def create_flow_map(archive):
    """
    This function creates a hash map from the entityId to the flow entity itself - very fast
    """
    flow_map = dict()
    for f in archive['flows']:
        flow_map[f['entityId']] = f
    return flow_map
In [17]:
def count_ref_flows(archive):
    rfs = Counter()
    flow_map = create_flow_map(archive)
    for i in archive['processes']:
        x = [v for v in i['exchanges'] if 'isReference' in v and v['isReference'] is True]
        if len(x) == 0:
            count_key = (None, None)
            rfs[count_key] += 1
        else:
            for xc in x:
                direc = xc['direction']
                flowref = xc['flow']
                try:
                    flowname = flow_map[flowref]['Name']
                except KeyError:
                    # some archives store the flow reference as a string; retry with an integer key
                    try:
                        flowname = flow_map[int(flowref)]['Name']
                    except (KeyError, ValueError):
                        flowname = 'Flow Not Found!'
                count_key = (direc, flowname)
                rfs[count_key] += 1
    return rfs
In [18]:
rf_count = []
for i, archive in enumerate(C):
    print('Parsing archive %s' % names[i])
    rfs = count_ref_flows(archive)
    for rf, count in rfs.items():
        try:
            rf_count.append({'db': names[i], 'exchange': '%s: %s' % (rf[0], rf[1]), 'count': count})
        except TypeError:
            print('rf: %s (type %s) count: %d' % (rf, type(rf), count))
(should take << 1 second)
In [19]:
RF = pd.DataFrame(rf_count).pivot_table(index='exchange', columns='db', aggfunc=sum, fill_value='', margins=True)
In [20]:
RF.sort_values(('count','All'), ascending=False)[:20]
Out[20]:
In [21]:
from collections import defaultdict  # , Counter  # already imported

def tags(entity, look_in, delimiter=r';\s*|,\s*|\s*\(|\)\s*|/'):
    r"""
    tags(entity, look_in, delimiter=r';\s*|,\s*|\s*\(|\)\s*|/')
    Parse the specified fields to generate a list of tags, delimited as specified
    entity: a JSON serialized entity
    look_in: a set of fields to extract tags from
    delimiter: regexp for re.split()  Default specifies:
      semicolon with trailing space OR
      comma with trailing space OR
      open parens with leading space OR
      close parens with trailing space OR
      slash
    """
    tags = set()
    for k, v in entity.items():
        if v is None:
            continue
        if k in look_in:
            try:
                tags = tags.union('='.join([k, f]) for f in filter(bool, re.split(delimiter, v)))
            except TypeError:
                # field value is a list (e.g. Classifications): join it into one string first
                tags = tags.union('='.join([k, f]) for f in filter(bool, re.split(delimiter, ', '.join(v))))
    return tags
def count_tags(e_list, search=None, include=None, exclude=None):
    """
    count_tags(e_list, search=None, include=None, exclude=None)
    Extract tags from entity list.
    Optional search term: only counts entities where the search term is found
    Default fields: 'Name', 'Comment', 'SpatialScope', 'Classifications'
    add additional fields with include=; remove fields with exclude=
    Returns d, m
      d = a Counter object containing tags with their counts
      m = a dictionary: keys are tags, values are lists of entity IDs bearing the tag
    """
    look_in = {'Name', 'Comment', 'SpatialScope', 'Classifications'}
    if include is not None:
        look_in = look_in.union(set(include))
    if exclude is not None:
        look_in = look_in.difference(set(exclude))
    d = Counter()
    m = defaultdict(list)
    for e in e_list:
        t = tags(e, look_in)
        if search is not None:
            if not any([bool(re.search(search, k, flags=re.IGNORECASE)) for k in t]):
                continue
        for i in t:
            d[i] += 1
            m[i].append(e['entityId'])
    return d, dict(m)
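The tags come out as strings of the form 'Field=value'. For illustration, a hypothetical entity (made up for this note, not drawn from any of the catalogs) would tag like this:
tags({'Name': 'transport, passenger car (EURO 5)', 'SpatialScope': 'CH'}, {'Name', 'SpatialScope'})
# -> {'Name=transport', 'Name=passenger car', 'Name=EURO 5', 'SpatialScope=CH'}  (set order may vary)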
In [22]:
for i, archive in enumerate(C):
    # Search each catalog for processes containing the term 'EURO.?[0-9]' and print their most common tags
    print('\n%s:' % names[i])
    d0, m0 = count_tags(archive['processes'], search='EURO.?[0-9]', include=['TechnologyLevel', 'IsicClass'])
    print([k for k in d0.most_common() if k[1] > 10])
After that, it's just some formatting to get it into tabular form
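A minimal sketch of one way to do that formatting, reusing the pivot-table approach from above (the tag_rows name and the count > 10 threshold are illustrative choices, not taken from the original notebook):
tag_rows = []
for i, archive in enumerate(C):
    d0, m0 = count_tags(archive['processes'], search='EURO.?[0-9]', include=['TechnologyLevel', 'IsicClass'])
    for tag, count in d0.most_common():
        if count > 10:  # keep only the commonly occurring tags, as in the cell above
            tag_rows.append({'db': names[i], 'tag': tag, 'count': count})

T = pd.DataFrame(tag_rows).pivot_table(index='tag', columns='db', aggfunc=sum, fill_value='')
T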