In [36]:
#Script to reformat the methane Top-Down (TD) data file for a two-level Sankey diagram
#In the TD approach, Sankey sources = regions and Sankey targets = methane source categories

#Output: json file formatted for Sankey diagram

#Created:       30.05.2016
#Last modified: 31.05.2016 Read in orig file (no manual reformatting necessary)
#               02.06.2016 New data files from Marielle Saunois; remove GLO category from final file
#               06.06.2016 Output json files (Sankey format) with min and max values, respectively
#               07.06.2016 Min/max values output only when mean != -99 and mean != 0
#               10.06.2016 New data files from Marielle

In [1]:
import numpy as np
import pandas as pd
import collections
import os

In [2]:
df_TD = pd.read_csv("../data/Sankey_TD_2003-2012_10june2016.txt", header=1, delim_whitespace=True)
#df_TD = pd.read_csv("../data/Sankey_TD_2003-2012_25MAy2016.txt", header=1, delim_whitespace=True)
#df_TD = pd.read_csv("../data/current/Sankey_TD_2003-2012_june2016.txt", header=1, delim_whitespace=True)
#df_TD = pd.read_csv("../data/current/myeditsSankey_TD_2003-2012_june2016.txt", header=1, delim_whitespace=True)
#df_TD = pd.read_csv("../data/current/mySankey_TD_2003-2012_june2016.txt", header=1, sep="\;")
df_TD


Out[2]:
proc Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
Wetlands mean 13 9 2 44 8 12 20 13 3 2 5 6 27 3 167
Wetlands min 7 6 1 30 4 8 11 10 1 1 3 0 8 1 127
Wetlands max 21 13 3 61 10 14 29 16 2 4 7 10 41 5 202
OtherNat mean 2 3 1 9 2 6 6 2 3 3 6 4 9 4 64
OtherNat min 0 1 0 5 1 3 4 1 1 0 1 1 2 1 21
OtherNat max 4 9 2 22 2 11 11 4 3 2 16 12 26 13 132
Agriwast mean 3 18 5 21 6 12 7 5 4 15 27 25 24 19 188
Agriwast min 2 16 2 19 5 11 7 3 3 9 16 14 12 9 115
Agriwast max 4 23 8 27 8 15 9 7 5 19 37 43 32 24 243
Fossil mean 2 12 2 4 1 7 4 17 1 7 18 3 8 20 105
Fossil min 1 7 0 1 0 3 2 11 0 5 9 1 5 14 77
Fossil max 4 18 3 8 1 8 5 24 2 9 30 4 14 27 133
BioBurBiof mean 1 0 1 7 0 5 6 1 1 1 3 2 6 1 34
BioBurBiof min 0 0 0 3 0 2 4 1 0 0 0 0 3 0 15
BioBurBiof max 1 1 1 15 1 6 9 2 1 1 4 3 8 1 53

In [3]:
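#The header row lists one name fewer than the data rows, so pandas used the process names
#as the (unnamed) index and assigned the 'proc' label to the mean/min/max column;
#rename that column to 'stats' and label the index 'proc'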
df_TD.rename(columns = {'proc':'stats'}, inplace = True)
df_TD.index.name = 'proc'
df_TD


Out[3]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
proc
Wetlands mean 13 9 2 44 8 12 20 13 3 2 5 6 27 3 167
Wetlands min 7 6 1 30 4 8 11 10 1 1 3 0 8 1 127
Wetlands max 21 13 3 61 10 14 29 16 2 4 7 10 41 5 202
OtherNat mean 2 3 1 9 2 6 6 2 3 3 6 4 9 4 64
OtherNat min 0 1 0 5 1 3 4 1 1 0 1 1 2 1 21
OtherNat max 4 9 2 22 2 11 11 4 3 2 16 12 26 13 132
Agriwast mean 3 18 5 21 6 12 7 5 4 15 27 25 24 19 188
Agriwast min 2 16 2 19 5 11 7 3 3 9 16 14 12 9 115
Agriwast max 4 23 8 27 8 15 9 7 5 19 37 43 32 24 243
Fossil mean 2 12 2 4 1 7 4 17 1 7 18 3 8 20 105
Fossil min 1 7 0 1 0 3 2 11 0 5 9 1 5 14 77
Fossil max 4 18 3 8 1 8 5 24 2 9 30 4 14 27 133
BioBurBiof mean 1 0 1 7 0 5 6 1 1 1 3 2 6 1 34
BioBurBiof min 0 0 0 3 0 2 4 1 0 0 0 0 3 0 15
BioBurBiof max 1 1 1 15 1 6 9 2 1 1 4 3 8 1 53

In [4]:
#Pivot the table so that the "stats" values (max, mean, min) become three sub-columns
#under each region column and "proc" becomes the index
df_TD_piv = df_TD.pivot(columns='stats', index=df_TD.index)
df_TD_piv


Out[4]:
Bor_NAme contUSA Cent_NAme Trop_SAme ... India SE_Asia Temp_Eurasia_Japan GLO
stats max mean min max mean min max mean min max ... min max mean min max mean min max mean min
proc
Agriwast 4 3 2 23 18 16 8 5 2 27 ... 14 32 24 12 24 19 9 243 188 115
BioBurBiof 1 1 0 1 0 0 1 1 0 15 ... 0 8 6 3 1 1 0 53 34 15
Fossil 4 2 1 18 12 7 3 2 0 8 ... 1 14 8 5 27 20 14 133 105 77
OtherNat 4 2 0 9 3 1 2 1 0 22 ... 1 26 9 2 13 4 1 132 64 21
Wetlands 21 13 7 13 9 6 3 2 1 61 ... 0 41 27 8 5 3 1 202 167 127

5 rows × 45 columns


In [5]:
df_TD_piv.columns[0][0]


Out[5]:
'Bor_NAme'

In [6]:
df_TD_piv.columns[0][0] #Bor_NAme
df_TD_piv.columns[3][0] #contUSA
df_TD_piv.columns[6][0] #Cent_NAme
df_TD_piv.columns[9][0] #Trop_SAme


Out[6]:
'Trop_SAme'

In [7]:
df_TD_piv['Bor_NAme']['mean']


Out[7]:
proc
Agriwast       3
BioBurBiof     1
Fossil         2
OtherNat       2
Wetlands      13
Name: mean, dtype: int64

In [8]:
df_TD_piv['Bor_NAme'].loc['Agriwast']['mean']


Out[8]:
3

In [9]:
df_TD_piv['Bor_NAme'].loc['BioBurBiof']['mean']


Out[9]:
1

In [10]:
#Store region names in list
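#(after the pivot, each region contributes three columns: max, mean, min)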
numRegions = df_TD_piv.shape[1] / 3

idx = 0
sources = []
for num in range(0,numRegions):
    sources.append(df_TD_piv.columns[idx][0])
    idx = idx + 3
   
sources


Out[10]:
['Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan',
 'GLO']
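
A more direct way to build the same list is to read the first level of the pivoted
MultiIndex columns; a sketch, assuming df_TD_piv from the pivot above (order of
appearance is preserved, so GLO still has to be dropped afterwards):

#Region names = unique values of the first column level, in order of appearance
sources = df_TD_piv.columns.get_level_values(0).unique().tolist()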

In [11]:
#EXCLUDE GLO (GLOBAL TOTAL) COLUMN
sources = sources[:-1]

In [12]:
sources


Out[12]:
['Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan']

In [13]:
#Get target list
targets = df_TD_piv.index.tolist()
targets


Out[13]:
['Agriwast', 'BioBurBiof', 'Fossil', 'OtherNat', 'Wetlands']

In [14]:
nodes = sources + targets
nodes


Out[14]:
['Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan',
 'Agriwast',
 'BioBurBiof',
 'Fossil',
 'OtherNat',
 'Wetlands']

In [15]:
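#Write json file for Sankey, MEAN VALUES ONLY (links skipped where mean is -99 or 0)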
file = open('../data/Sankey_TD_2003-2012_10june2016_mean_noZeros.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    print source
    for target in targets:
        print target
        #print df_TD_piv[source].loc[target]['mean']
        value = df_TD_piv[source].loc[target]['mean']
        if value != -99:
            if value != 0:
                print value
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()


Bor_NAme
Agriwast
3
BioBurBiof
1
Fossil
2
OtherNat
2
Wetlands
13
contUSA
Agriwast
18
BioBurBiof
Fossil
12
OtherNat
3
Wetlands
9
Cent_NAme
Agriwast
5
BioBurBiof
1
Fossil
2
OtherNat
1
Wetlands
2
Trop_SAme
Agriwast
21
BioBurBiof
7
Fossil
4
OtherNat
9
Wetlands
44
Temp_SAme
Agriwast
6
BioBurBiof
Fossil
1
OtherNat
2
Wetlands
8
NAfr
Agriwast
12
BioBurBiof
5
Fossil
7
OtherNat
6
Wetlands
12
SAfr
Agriwast
7
BioBurBiof
6
Fossil
4
OtherNat
6
Wetlands
20
Russia
Agriwast
5
BioBurBiof
1
Fossil
17
OtherNat
2
Wetlands
13
Oceania
Agriwast
4
BioBurBiof
1
Fossil
1
OtherNat
3
Wetlands
3
Europe
Agriwast
15
BioBurBiof
1
Fossil
7
OtherNat
3
Wetlands
2
China
Agriwast
27
BioBurBiof
3
Fossil
18
OtherNat
6
Wetlands
5
India
Agriwast
25
BioBurBiof
2
Fossil
3
OtherNat
4
Wetlands
6
SE_Asia
Agriwast
24
BioBurBiof
6
Fossil
8
OtherNat
9
Wetlands
27
Temp_Eurasia_Japan
Agriwast
19
BioBurBiof
1
Fossil
20
OtherNat
4
Wetlands
3
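
The writer above strips the trailing comma with a negative end-relative seek plus
truncate, which works for a Python 2 text-mode file but would fail under Python 3. A
sketch of the same mean-value file built with the standard json module instead (same
filename and the same mean != -99 / mean != 0 filter as the cell above; only the
whitespace of the output differs):

import json

nodes_list = [{"name": node} for node in nodes]
links_list = []
for source in sources:
    for target in targets:
        value = df_TD_piv[source].loc[target]['mean']
        if value != -99 and value != 0:
            #OrderedDict (collections is already imported) keeps the key order in the output
            links_list.append(collections.OrderedDict(
                [("source", source), ("target", target), ("value", "%.2f" % float(value))]))

with open('../data/Sankey_TD_2003-2012_10june2016_mean_noZeros.json', 'w') as f:
    json.dump(collections.OrderedDict([("nodes", nodes_list), ("links", links_list)]),
              f, indent=1)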

In [16]:
#Write json file for Sankey, MIN VALUES ONLY

file = open('../data/Sankey_TD_2003-2012_10june2016_min.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    #print source
    for target in targets:
        #print target
        #print df_TD_piv[source].loc[target]['min']
        meanValue = df_TD_piv[source].loc[target]['mean']
        value = df_TD_piv[source].loc[target]['min']
        if meanValue != -99:
            if meanValue != 0:
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [17]:
#Write json file for Sankey, MAX VALUES ONLY

file = open('../data/Sankey_TD_2003-2012_10june2016_max.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    #print source
    for target in targets:
        #print target
        #print df_TD_piv[source].loc[target]['max']
        meanValue = df_TD_piv[source].loc[target]['mean']
        value = df_TD_piv[source].loc[target]['max']
        if meanValue != -99:
            if meanValue != 0:
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()
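
The mean, min and max cells differ only in which statistic is written and in the output
filename. A sketch of one possible helper (write_sankey_json is a new name, not from the
original) that covers all three files and avoids the trailing-comma seek/truncate by
joining the entries instead; filters and filenames follow the cells above:

def write_sankey_json(stat, outfile):
    #Write one Sankey json file for the given statistic ('mean', 'min' or 'max');
    #links are skipped wherever the corresponding mean is -99 or 0
    entries = []
    for source in sources:
        for target in targets:
            meanValue = df_TD_piv[source].loc[target]['mean']
            if meanValue != -99 and meanValue != 0:
                value = df_TD_piv[source].loc[target][stat]
                entries.append('{"source": "%s", "target": "%s", "value": "%.2f"}'
                               % (source, target, float(value)))
    out = open(outfile, 'w')
    out.write('{\n"nodes": [\n')
    out.write(',\n'.join('{"name": "%s"}' % node for node in nodes))
    out.write('\n],\n"links": [\n')
    out.write(',\n'.join(entries))
    out.write('\n]\n}\n')
    out.close()

write_sankey_json('mean', '../data/Sankey_TD_2003-2012_10june2016_mean_noZeros.json')
write_sankey_json('min', '../data/Sankey_TD_2003-2012_10june2016_min.json')
write_sankey_json('max', '../data/Sankey_TD_2003-2012_10june2016_max.json')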

In [ ]: