In [36]:
#Script to reformat methane data Top-Down file for two-level Sankey
#In BU approach, Sources = methane sources, Targets = regions

#Output: json file formatted for Sankey diagram

#Created:       31.05.2016
#Last modified: 02.06.2016 New data files from Marielle Saunois; remove GLO category from final file
#               03.06.2016 Add "Not affected" category; Exclude 0 and -99 values
#               06.06.2016 Output json files (Sankey format) with min and max values, respectively
#               07.06.2016 Min/max values output only when mean != -99 or != 0
#               09.06.2016 three levels
#               10.06.2016 New data files from Marielle

In [19]:
import numpy as np
import pandas as pd
import collections
import os
import xlrd

In [20]:
#Subcategories
#Pour Wetlands, il n'y a pas de sous catégories.
#Pour Other Natural, tu pourrais essayer avec :
#  Fresh water (122)
#  Geological source (52)
#  Wild animals (10)
#  Termites( 9)
#  Permafrost and hydrates (3)
#  Wild fires (1)

122+52+10+9+3+3


Out[20]:
199

In [90]:
df_BU = pd.read_csv("../data/Sankey_BU_2003-2012_10june2016.txt", header=1, delim_whitespace=True)
df_BU


Out[90]:
proc Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35

In [91]:
df_BU.rename(columns = {'proc':'stats'}, inplace = True)
df_BU.index.name = 'proc'
df_BU


Out[91]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
proc
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35

In [92]:
#Add "Not affected" column and set values to 0
df_BU['Not affected'] = 0
df_BU


Out[92]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO Not affected
proc
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185 0
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153 0
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227 0
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199 0
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104 0
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297 0
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195 0
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178 0
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206 0
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121 0
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114 0
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133 0
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30 0
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27 0
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35 0

In [93]:
#Copy GLO values for OtherNat into "Not affected" col
df_BU['Not affected']['OtherNat'] = df_BU['GLO']['OtherNat']


/usr/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app

In [94]:
df_BU


Out[94]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO Not affected
proc
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185 0
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153 0
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227 0
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199 199
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104 104
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297 297
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195 0
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178 0
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206 0
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121 0
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114 0
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133 0
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30 0
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27 0
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35 0

In [95]:
#sub_df = pd.read_csv("../data/Sankey_BU_subcategories.csv", header=1, sep="\;")
sub_df = pd.read_csv("../data/Sankey_BU_subcategories_withFake.csv", header=1, sep="\;")
sub_df


Out[95]:
proc stats Fossil Agriwast BioBurBiof OtherNat Wetlands
0 Coal mean 41 0 0 0 0
1 Coal min 26 0 0 0 0
2 Coal max 50 0 0 0 0
3 GasAndOilIndustry mean 79 0 0 0 0
4 GasAndOilIndustry min 69 0 0 0 0
5 GasAndOilIndustry max 88 0 0 0 0
6 EntericFermentationAndManure mean 0 106 0 0 0
7 EntericFermentationAndManure min 0 97 0 0 0
8 EntericFermentationAndManure max 0 111 0 0 0
9 LandfillsAndWaste mean 0 59 0 0 0
10 LandfillsAndWaste min 0 52 0 0 0
11 LandfillsAndWaste max 0 63 0 0 0
12 Rice mean 0 30 0 0 0
13 Rice min 0 24 0 0 0
14 Rice max 0 36 0 0 0
15 BiomassBurning mean 0 0 18 0 0
16 BiomassBurning min 0 0 15 0 0
17 BiomassBurning max 0 0 21 0 0
18 BiofuelBurning mean 0 0 12 0 0
19 BiofuelBurning min 0 0 10 0 0
20 BiofuelBurning max 0 0 14 0 0
21 Freshwater mean 0 0 0 122 0
22 Freshwater min 0 0 0 60 0
23 Freshwater max 0 0 0 180 0
24 GeologicalSources mean 0 0 0 52 0
25 GeologicalSources min 0 0 0 32 0
26 GeologicalSources max 0 0 0 76 0
27 WildAnimals mean 0 0 0 10 0
28 WildAnimals min 0 0 0 5 0
29 WildAnimals max 0 0 0 15 0
30 Termites mean 0 0 0 9 0
31 Termites min 0 0 0 3 0
32 Termites max 0 0 0 15 0
33 PermafrostAndHydrates mean 0 0 0 3 0
34 PermafrostAndHydrates min 0 0 0 -888 0
35 PermafrostAndHydrates max 0 0 0 -889 0
36 Wildfires mean 0 0 0 3 0
37 Wildfires min 0 0 0 1 0
38 Wildfires max 0 0 0 5 0
39 fakeSource mean 0 0 0 0 100
40 fakeSource min 0 0 0 0 1
41 fakeSource max 0 0 0 0 10

In [96]:
sub_df.index = sub_df['proc']
sub_df = sub_df.drop(['proc'], axis=1)
sub_df.head(10)


Out[96]:
stats Fossil Agriwast BioBurBiof OtherNat Wetlands
proc
Coal mean 41 0 0 0 0
Coal min 26 0 0 0 0
Coal max 50 0 0 0 0
GasAndOilIndustry mean 79 0 0 0 0
GasAndOilIndustry min 69 0 0 0 0
GasAndOilIndustry max 88 0 0 0 0
EntericFermentationAndManure mean 0 106 0 0 0
EntericFermentationAndManure min 0 97 0 0 0
EntericFermentationAndManure max 0 111 0 0 0
LandfillsAndWaste mean 0 59 0 0 0

In [97]:
#Pivot table so that "stats" Mean, Min, Max become three columns under each region column
#and "proc" becomes index
df_BU_piv = df_BU.pivot(columns='stats', index=df_BU.index)
df_BU_piv


Out[97]:
Bor_NAme contUSA Cent_NAme Trop_SAme ... SE_Asia Temp_Eurasia_Japan GLO Not affected
stats max mean min max mean min max mean min max ... min max mean min max mean min max mean min
proc
Agriwast 3 3 2 23 17 15 6 5 2 23 ... 18 22 20 17 206 195 178 0 0 0
BioBurBiof 1 1 0 2 1 0 1 1 0 6 ... 4 2 1 1 35 30 27 0 0 0
Fossil 3 2 1 16 11 9 3 2 0 19 ... 5 26 22 20 133 121 114 0 0 0
OtherNat -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 ... -99 -99 -99 -99 297 199 104 297 199 104
Wetlands 61 32 15 23 13 6 4 2 1 59 ... 18 6 3 1 227 185 153 0 0 0

5 rows × 48 columns


In [98]:
#Pivot table so that "stats" Mean, Min, Max become three columns under each region column
#and "proc" becomes index
sub_df_piv = sub_df.pivot(columns='stats', index=sub_df.index)
sub_df_piv


Out[98]:
Fossil Agriwast BioBurBiof OtherNat Wetlands
stats max min mean max min mean max min mean max min mean max min mean
proc
BiofuelBurning 0 0 0 0 0 0 14 10 12 0 0 0 0 0 0
BiomassBurning 0 0 0 0 0 0 21 15 18 0 0 0 0 0 0
Coal 50 26 41 0 0 0 0 0 0 0 0 0 0 0 0
EntericFermentationAndManure 0 0 0 111 97 106 0 0 0 0 0 0 0 0 0
Freshwater 0 0 0 0 0 0 0 0 0 180 60 122 0 0 0
GasAndOilIndustry 88 69 79 0 0 0 0 0 0 0 0 0 0 0 0
GeologicalSources 0 0 0 0 0 0 0 0 0 76 32 52 0 0 0
LandfillsAndWaste 0 0 0 63 52 59 0 0 0 0 0 0 0 0 0
PermafrostAndHydrates 0 0 0 0 0 0 0 0 0 -889 -888 3 0 0 0
Rice 0 0 0 36 24 30 0 0 0 0 0 0 0 0 0
Termites 0 0 0 0 0 0 0 0 0 15 3 9 0 0 0
WildAnimals 0 0 0 0 0 0 0 0 0 15 5 10 0 0 0
Wildfires 0 0 0 0 0 0 0 0 0 5 1 3 0 0 0
fakeSource 0 0 0 0 0 0 0 0 0 0 0 0 10 1 100

In [99]:
df_BU_piv.columns[0][0] #Bor_NAme
df_BU_piv.columns[3][0] #contUSA
df_BU_piv.columns[6][0] #CentName
df_BU_piv.columns[9][0] #Trop_SAme


Out[99]:
'Trop_SAme'

In [100]:
df_BU_piv['Bor_NAme']['mean']


Out[100]:
proc
Agriwast       3
BioBurBiof     1
Fossil         2
OtherNat     -99
Wetlands      32
Name: mean, dtype: int64

In [101]:
df_BU_piv['Bor_NAme'].loc['Agriwast']


Out[101]:
stats
max     3
mean    3
min     2
Name: Agriwast, dtype: int64

In [102]:
df_BU_piv['Bor_NAme'].loc['BioBurBiof']['mean']


Out[102]:
1

In [103]:
#Create regions, categories and subcategories array

#regions
#Store region names in list
numRegions = df_BU_piv.shape[1] / 3

idx = 0
regions = []
for num in range(0,numRegions):    
    if df_BU_piv.columns[idx][0] != "GLO" : #exclude GLO column
      regions.append(df_BU_piv.columns[idx][0])
    idx = idx + 3

#categories    
categories = df_BU_piv.index.tolist()
 
#subcategories
subcategories = sub_df.index.tolist()
subcategories = list(set(subcategories)) #finds unique values
   
regions
categories
subcategories


Out[103]:
['Termites',
 'Wildfires',
 'Freshwater',
 'WildAnimals',
 'EntericFermentationAndManure',
 'fakeSource',
 'LandfillsAndWaste',
 'Coal',
 'BiomassBurning',
 'GeologicalSources',
 'PermafrostAndHydrates',
 'Rice',
 'BiofuelBurning',
 'GasAndOilIndustry']

In [104]:
nodes = categories + regions + subcategories
nodes


Out[104]:
['Agriwast',
 'BioBurBiof',
 'Fossil',
 'OtherNat',
 'Wetlands',
 'Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan',
 'Not affected',
 'Termites',
 'Wildfires',
 'Freshwater',
 'WildAnimals',
 'EntericFermentationAndManure',
 'fakeSource',
 'LandfillsAndWaste',
 'Coal',
 'BiomassBurning',
 'GeologicalSources',
 'PermafrostAndHydrates',
 'Rice',
 'BiofuelBurning',
 'GasAndOilIndustry']

In [105]:
#Replace -99 values with 0
#df_BU_piv = df_BU_piv.replace(-99, 1)
#df_BU_piv

#Replace -99 and 0 values with -1
#df_BU_piv = df_BU_piv.replace(-99, -1)
#df_BU_piv = df_BU_piv.replace(0, -1)
df_BU_piv


Out[105]:
Bor_NAme contUSA Cent_NAme Trop_SAme ... SE_Asia Temp_Eurasia_Japan GLO Not affected
stats max mean min max mean min max mean min max ... min max mean min max mean min max mean min
proc
Agriwast 3 3 2 23 17 15 6 5 2 23 ... 18 22 20 17 206 195 178 0 0 0
BioBurBiof 1 1 0 2 1 0 1 1 0 6 ... 4 2 1 1 35 30 27 0 0 0
Fossil 3 2 1 16 11 9 3 2 0 19 ... 5 26 22 20 133 121 114 0 0 0
OtherNat -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 ... -99 -99 -99 -99 297 199 104 297 199 104
Wetlands 61 32 15 23 13 6 4 2 1 59 ... 18 6 3 1 227 185 153 0 0 0

5 rows × 48 columns


In [106]:
sub_df_piv


Out[106]:
Fossil Agriwast BioBurBiof OtherNat Wetlands
stats max min mean max min mean max min mean max min mean max min mean
proc
BiofuelBurning 0 0 0 0 0 0 14 10 12 0 0 0 0 0 0
BiomassBurning 0 0 0 0 0 0 21 15 18 0 0 0 0 0 0
Coal 50 26 41 0 0 0 0 0 0 0 0 0 0 0 0
EntericFermentationAndManure 0 0 0 111 97 106 0 0 0 0 0 0 0 0 0
Freshwater 0 0 0 0 0 0 0 0 0 180 60 122 0 0 0
GasAndOilIndustry 88 69 79 0 0 0 0 0 0 0 0 0 0 0 0
GeologicalSources 0 0 0 0 0 0 0 0 0 76 32 52 0 0 0
LandfillsAndWaste 0 0 0 63 52 59 0 0 0 0 0 0 0 0 0
PermafrostAndHydrates 0 0 0 0 0 0 0 0 0 -889 -888 3 0 0 0
Rice 0 0 0 36 24 30 0 0 0 0 0 0 0 0 0
Termites 0 0 0 0 0 0 0 0 0 15 3 9 0 0 0
WildAnimals 0 0 0 0 0 0 0 0 0 15 5 10 0 0 0
Wildfires 0 0 0 0 0 0 0 0 0 5 1 3 0 0 0
fakeSource 0 0 0 0 0 0 0 0 0 0 0 0 10 1 100

In [107]:
#df_BU_piv['Bor_NAme'].loc['Agriwast']['mean']
sub_df_piv['OtherNat'].loc['Freshwater'][2] #[0] = max, [1] = min, [2] = mean

sub_df_piv['Agriwast'].loc['Termites'][2] #[0] = max, [1] = min, [2] = mean


Out[107]:
0

In [108]:
#Write json file for 3-level Sankey, MEAN VALUES ONLY
#GLO omitted

file = open('../data/Sankey_BU_2003-2012_10june2016_mean_3levels.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

#source-target pairs for categories -> regions
for category in categories:
    #print category
    for region in regions:
        #print region
        value = df_BU_piv[region].loc[category]['mean']     
        if value != -99:
            if value != 0:
                #print value
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(category, region, float(value)))                
    
#source-target pairs for subcategories -> categories
for subcategory in subcategories:    
    for category in categories:    
        value2 = sub_df_piv[category].loc[subcategory][2] #[0] = max, [1] = min, [2] = mean
        if value2 != 0:
            file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(subcategory, category, float(value2)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [109]:
#Write json file for 3-level Sankey, MIN VALUES ONLY
#GLO omitted

file = open('../data/Sankey_BU_2003-2012_10june2016_min_3levels.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

#source-target pairs for categories -> regions
for category in categories:
    for region in regions:
        meanValue = df_BU_piv[region].loc[category]['mean']
        value = df_BU_piv[region].loc[category]['min']
        if meanValue != -99:
            if meanValue != 0:
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(category, region, float(value)))
                
#source-target pairs for subcategories -> categories
for subcategory in subcategories:
    for category in categories:    
        value2 = sub_df_piv[category].loc[subcategory][1] #[0] = max, [1] = min, [2] = mean
        if value2 != 0:
            file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(subcategory, category, float(value2)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [110]:
#Write json file for Sankey, MAX VALUES ONLY
#GLO omitted

file = open('../data/Sankey_BU_2003-2012_10june2016_max_3levels.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

#source-target pairs for categories -> regions
for category in categories:
    for region in regions:
        meanValue = df_BU_piv[region].loc[category]['mean']
        value = df_BU_piv[region].loc[category]['max']
        if meanValue != -99:
            if meanValue != 0:
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(category, region, float(value)))
                
#source-target pairs for subcategories -> categories
for subcategory in subcategories:
    for category in categories:    
        value2 = sub_df_piv[category].loc[subcategory][0] #[0] = max, [1] = min, [2] = mean
        if value2 != 0:
            file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(subcategory, category, float(value2)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [ ]:


In [ ]:


In [40]:


In [ ]: