In [1]:
#Script to reformat the methane Top-Down (TD) data file for a two-level Sankey diagram
#In the TD approach, Sankey sources = regions, Sankey targets = methane source categories

#Output: JSON files formatted for a Sankey diagram (mean, min, and max variants)

#Created:       30.05.2016
#Last modified: 31.05.2016 Read in orig file (no manual reformatting necessary)
#               02.06.2016 New data files from Marielle Saunois; remove GLO category from final file
#               06.06.2016 Output json files (Sankey format) with min and max values, respectively
#               07.06.2016 Min/max values output only when mean != -99 and mean != 0
#               10.06.2016 New data files from Marielle
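#
#For reference, the write cells below emit JSON of this shape (example link uses
#the Bor_NAme -> AgriWaste mean of 3, formatted with "%.2f"):
#{
#  "nodes": [ {"name": "Bor_NAme"}, ..., {"name": "Wetlands"} ],
#  "links": [ {"source": "Bor_NAme", "target": "AgriWaste", "value": "3.00"}, ... ]
#}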

In [2]:
import numpy as np
import pandas as pd
import collections
import os

In [3]:
df_TD = pd.read_csv("../data/Sankey_TD_2003-2012_sept2016.txt", header=1, delim_whitespace=True)
#df_TD = pd.read_csv("../data/Sankey_TD_2003-2012_10june2016.txt", header=1, delim_whitespace=True)
#df_TD = pd.read_csv("../data/Sankey_TD_2003-2012_25MAy2016.txt", header=1, delim_whitespace=True)
#df_TD = pd.read_csv("../data/current/Sankey_TD_2003-2012_june2016.txt", header=1, delim_whitespace=True)
#df_TD = pd.read_csv("../data/current/myeditsSankey_TD_2003-2012_june2016.txt", header=1, delim_whitespace=True)
#df_TD = pd.read_csv("../data/current/mySankey_TD_2003-2012_june2016.txt", header=1, sep="\;")
df_TD


Out[3]:
proc Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
Wetlands mean 13 9 2 44 8 12 20 13 3 2 5 6 27 3 167
Wetlands min 7 6 1 30 4 8 11 10 1 1 3 0 8 1 127
Wetlands max 21 13 3 61 10 14 29 16 2 4 7 10 41 5 202
OtherNatural mean 2 3 1 9 2 6 6 2 3 3 6 4 9 4 64
OtherNatural min 0 1 0 5 1 3 4 1 1 0 1 1 2 1 21
OtherNatural max 4 9 2 22 2 11 11 4 3 2 16 12 26 13 132
AgriWaste mean 3 18 5 21 6 12 7 5 4 15 27 25 24 19 188
AgriWaste min 2 16 2 19 5 11 7 3 3 9 16 14 12 9 115
AgriWaste max 4 23 8 27 8 15 9 7 5 19 37 43 32 24 243
Fossil mean 2 12 2 4 1 7 4 17 1 7 18 3 8 20 105
Fossil min 1 7 0 1 0 3 2 11 0 5 9 1 5 14 77
Fossil max 4 18 3 8 1 8 5 24 2 9 30 4 14 27 133
BioBurBiof mean 1 0 1 7 0 5 6 1 1 1 3 2 6 1 34
BioBurBiof min 0 0 0 3 0 2 4 1 0 0 0 0 3 0 15
BioBurBiof max 1 1 1 15 1 6 9 2 1 1 4 3 8 1 53
SumSources mean 20 41 11 84 17 42 44 38 11 28 58 39 74 46 558
SumSources min 13 34 5 65 12 36 38 31 7 21 51 28 55 38 540
SumSources max 27 49 15 101 27 55 53 45 19 34 72 46 84 54 568
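
Note: header=1 takes the file's second line as the column names (the first line is skipped), and delim_whitespace=True splits on any run of whitespace. Recent pandas versions deprecate delim_whitespace in favour of a regex separator; an equivalent call, should this be rerun on a current install, would be:

df_TD = pd.read_csv("../data/Sankey_TD_2003-2012_sept2016.txt", header=1, sep=r"\s+")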

In [4]:
df_TD.rename(columns = {'proc':'stats'}, inplace = True)
df_TD.index.name = 'proc'
df_TD


Out[4]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
proc
Wetlands mean 13 9 2 44 8 12 20 13 3 2 5 6 27 3 167
Wetlands min 7 6 1 30 4 8 11 10 1 1 3 0 8 1 127
Wetlands max 21 13 3 61 10 14 29 16 2 4 7 10 41 5 202
OtherNatural mean 2 3 1 9 2 6 6 2 3 3 6 4 9 4 64
OtherNatural min 0 1 0 5 1 3 4 1 1 0 1 1 2 1 21
OtherNatural max 4 9 2 22 2 11 11 4 3 2 16 12 26 13 132
AgriWaste mean 3 18 5 21 6 12 7 5 4 15 27 25 24 19 188
AgriWaste min 2 16 2 19 5 11 7 3 3 9 16 14 12 9 115
AgriWaste max 4 23 8 27 8 15 9 7 5 19 37 43 32 24 243
Fossil mean 2 12 2 4 1 7 4 17 1 7 18 3 8 20 105
Fossil min 1 7 0 1 0 3 2 11 0 5 9 1 5 14 77
Fossil max 4 18 3 8 1 8 5 24 2 9 30 4 14 27 133
BioBurBiof mean 1 0 1 7 0 5 6 1 1 1 3 2 6 1 34
BioBurBiof min 0 0 0 3 0 2 4 1 0 0 0 0 3 0 15
BioBurBiof max 1 1 1 15 1 6 9 2 1 1 4 3 8 1 53
SumSources mean 20 41 11 84 17 42 44 38 11 28 58 39 74 46 558
SumSources min 13 34 5 65 12 36 38 31 7 21 51 28 55 38 540
SumSources max 27 49 15 101 27 55 53 45 19 34 72 46 84 54 568

In [5]:
#Pivot so that the "stats" values (mean, min, max) become three sub-columns under each region
#and "proc" becomes the index
df_TD_piv = df_TD.pivot(columns='stats', index=df_TD.index)
df_TD_piv


Out[5]:
Bor_NAme contUSA Cent_NAme Trop_SAme ... India SE_Asia Temp_Eurasia_Japan GLO
stats max mean min max mean min max mean min max ... min max mean min max mean min max mean min
proc
AgriWaste 4 3 2 23 18 16 8 5 2 27 ... 14 32 24 12 24 19 9 243 188 115
BioBurBiof 1 1 0 1 0 0 1 1 0 15 ... 0 8 6 3 1 1 0 53 34 15
Fossil 4 2 1 18 12 7 3 2 0 8 ... 1 14 8 5 27 20 14 133 105 77
OtherNatural 4 2 0 9 3 1 2 1 0 22 ... 1 26 9 2 13 4 1 132 64 21
SumSources 27 20 13 49 41 34 15 11 5 101 ... 28 84 74 55 54 46 38 568 558 540
Wetlands 21 13 7 13 9 6 3 2 1 61 ... 0 41 27 8 5 3 1 202 167 127

6 rows × 45 columns
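
The pivot leaves a two-level column MultiIndex of (region, stat). A whole statistic can be pulled across all regions with a cross-section; a minimal sketch, not executed in this notebook:

#All mean values at once: rows = processes, columns = regions
df_TD_means = df_TD_piv.xs('mean', axis=1, level='stats')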


In [6]:
df_TD_piv.columns[0][0]


Out[6]:
'Bor_NAme'

In [7]:
df_TD_piv.columns[0][0] #Bor_NAme
df_TD_piv.columns[3][0] #contUSA
df_TD_piv.columns[6][0] #Cent_NAme
df_TD_piv.columns[9][0] #Trop_SAme


Out[7]:
'Trop_SAme'

In [8]:
df_TD_piv['Bor_NAme']['mean']


Out[8]:
proc
AgriWaste        3
BioBurBiof       1
Fossil           2
OtherNatural     2
SumSources      20
Wetlands        13
Name: mean, dtype: int64

In [11]:
df_TD_piv['Bor_NAme'].loc['AgriWaste']['mean']


Out[11]:
3

In [12]:
df_TD_piv['Bor_NAme'].loc['BioBurBiof']['mean']


Out[12]:
1
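
The chained lookups above work, but a single .loc call with a (region, stat) column tuple does the same in one step; an equivalent sketch:

df_TD_piv.loc['AgriWaste', ('Bor_NAme', 'mean')]   #3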

In [13]:
#Store region names (level-0 column labels) in a list
numRegions = df_TD_piv.shape[1] // 3   #three stats (mean/min/max) per region

idx = 0
sources = []
for num in range(numRegions):
    sources.append(df_TD_piv.columns[idx][0])   #level-0 label of every third column
    idx += 3
   
sources


Out[13]:
['Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan',
 'GLO']
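
The same list can be read straight off the first level of the column index, avoiding the stride-of-3 loop; an equivalent sketch (GLO still included at this point):

#Level-0 column labels, de-duplicated in order of appearance
sources = df_TD_piv.columns.get_level_values(0).unique().tolist()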

In [14]:
#EXCLUDE the GLO (global total) COLUMN
sources = sources[:-1]   #GLO is the last entry

In [15]:
sources


Out[15]:
['Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan']

In [22]:
#Get target list
targets_orig = df_TD_piv.index.tolist()
targets_orig


Out[22]:
['AgriWaste', 'BioBurBiof', 'Fossil', 'OtherNatural', 'SumSources', 'Wetlands']

In [34]:
#EXCLUDE the SumSources ROW (per-region totals)
targets = [t for t in targets_orig if t != 'SumSources']

targets


Out[34]:
['AgriWaste', 'BioBurBiof', 'Fossil', 'OtherNatural', 'Wetlands']

In [35]:
nodes = sources + targets
nodes


Out[35]:
['Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan',
 'AgriWaste',
 'BioBurBiof',
 'Fossil',
 'OtherNatural',
 'Wetlands']
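
A quick sanity check on the combined list; 14 regions plus 5 source categories should give 19 nodes:

len(nodes)   #19 = 14 regions + 5 source categories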

In [36]:
#Write json file for Sankey, MEAN VALUES ONLY (skip links where mean is -99 or 0)

file = open('../data/Sankey_TD_2003-2012_16sept2016_mean_noZeros.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    print source
    for target in targets:
        print target
        #print df_TD_piv[source].loc[target]['mean']
        value = df_TD_piv[source].loc[target]['mean']
        if value != -99 and value != 0:   #-99 flags missing data
            print value
            file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()


Bor_NAme
AgriWaste
3
BioBurBiof
1
Fossil
2
OtherNatural
2
Wetlands
13
contUSA
AgriWaste
18
BioBurBiof
Fossil
12
OtherNatural
3
Wetlands
9
Cent_NAme
AgriWaste
5
BioBurBiof
1
Fossil
2
OtherNatural
1
Wetlands
2
Trop_SAme
AgriWaste
21
BioBurBiof
7
Fossil
4
OtherNatural
9
Wetlands
44
Temp_SAme
AgriWaste
6
BioBurBiof
Fossil
1
OtherNatural
2
Wetlands
8
NAfr
AgriWaste
12
BioBurBiof
5
Fossil
7
OtherNatural
6
Wetlands
12
SAfr
AgriWaste
7
BioBurBiof
6
Fossil
4
OtherNatural
6
Wetlands
20
Russia
AgriWaste
5
BioBurBiof
1
Fossil
17
OtherNatural
2
Wetlands
13
Oceania
AgriWaste
4
BioBurBiof
1
Fossil
1
OtherNatural
3
Wetlands
3
Europe
AgriWaste
15
BioBurBiof
1
Fossil
7
OtherNatural
3
Wetlands
2
China
AgriWaste
27
BioBurBiof
3
Fossil
18
OtherNatural
6
Wetlands
5
India
AgriWaste
25
BioBurBiof
2
Fossil
3
OtherNatural
4
Wetlands
6
SE_Asia
AgriWaste
24
BioBurBiof
6
Fossil
8
OtherNatural
9
Wetlands
27
Temp_Eurasia_Japan
AgriWaste
19
BioBurBiof
1
Fossil
20
OtherNatural
4
Wetlands
3
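
Hand-assembling the JSON and seeking back two bytes to strip the trailing comma relies on '\n' staying a single byte in the written file. A sturdier equivalent for this cell, sketched with the standard json module (same filter, same output path):

import json

links = []
for source in sources:
    for target in targets:
        value = df_TD_piv[source].loc[target]['mean']
        if value != -99 and value != 0:   #-99 flags missing data
            links.append({"source": source, "target": target,
                          "value": "%.2f" % float(value)})

with open('../data/Sankey_TD_2003-2012_16sept2016_mean_noZeros.json', 'w') as f:
    json.dump({"nodes": [{"name": n} for n in nodes], "links": links}, f, indent=1)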

In [37]:
#Write json file for Sankey, MIN VALUES ONLY

file = open('../data/Sankey_TD_2003-2012_16sept2016_min.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    #print source
    for target in targets:
        #print target
        #print df_TD_piv[source].loc[target]['min']
        meanValue = df_TD_piv[source].loc[target]['mean']
        value = df_TD_piv[source].loc[target]['min']
        if meanValue != -99 and meanValue != 0:
            file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [38]:
#Write json file for Sankey, MAX VALUES ONLY

file = open('../data/Sankey_TD_2003-2012_16sept2016_max.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    #print source
    for target in targets:
        #print target
        #print df_TD_piv[source].loc[target]['max']
        meanValue = df_TD_piv[source].loc[target]['mean']
        value = df_TD_piv[source].loc[target]['max']
        if meanValue != -99 and meanValue != 0:
            file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()
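
The three write cells differ only in the statistic taken from the pivot table and the output filename, so with the json-based sketch above they collapse into one helper (same filters and filenames assumed):

def write_sankey(stat, path):
    #Keep a link only when its mean is valid (not -99) and non-zero,
    #then write the chosen statistic (mean/min/max) as the link value
    links = [{"source": s, "target": t,
              "value": "%.2f" % float(df_TD_piv[s].loc[t][stat])}
             for s in sources for t in targets
             if df_TD_piv[s].loc[t]['mean'] not in (-99, 0)]
    with open(path, 'w') as f:
        json.dump({"nodes": [{"name": n} for n in nodes], "links": links}, f, indent=1)

for stat, suffix in [('mean', 'mean_noZeros'), ('min', 'min'), ('max', 'max')]:
    write_sankey(stat, '../data/Sankey_TD_2003-2012_16sept2016_%s.json' % suffix)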

In [ ]: