In [36]:
#Script to reformat methane data Top-Down file for two-level Sankey
#In BU approach, Sources = methane sources, Targets = regions

#Output: json file formatted for Sankey diagram

#Created:       31.05.2016
#Last modified: 02.06.2016 New data files from Marielle Saunois; remove GLO category from final file
#               03.06.2016 Add "Not affected" category; Exclude 0 and -99 values
#               06.06.2016 Output json files (Sankey format) with min and max values, respectively
#               07.06.2016 Min/max values output only when mean != -99 or != 0

In [1]:
import numpy as np
import pandas as pd
import collections
import os
import xlrd

In [21]:
df_BU = pd.read_csv("../data/Sankey_BU_2003-2012_25MAy2016.txt", header=1, delim_whitespace=True)
df_BU


Out[21]:
proc Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35

In [22]:
df_BU.rename(columns = {'proc':'stats'}, inplace = True)
df_BU.index.name = 'proc'
df_BU


Out[22]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
proc
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35

In [23]:
#Add "Not affected" column and set values to 0
df_BU['Not affected'] = 0
df_BU


Out[23]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO Not affected
proc
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185 0
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153 0
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227 0
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199 0
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104 0
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297 0
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195 0
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178 0
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206 0
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121 0
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114 0
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133 0
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30 0
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27 0
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35 0

In [24]:
#Copy GLO values for OtherNat into "Not affected" col
df_BU['Not affected']['OtherNat'] = df_BU['GLO']['OtherNat']


/usr/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app

In [25]:
df_BU


Out[25]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO Not affected
proc
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185 0
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153 0
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227 0
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199 199
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104 104
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297 297
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195 0
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178 0
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206 0
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121 0
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114 0
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133 0
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30 0
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27 0
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35 0

In [26]:
#Pivot table so that "stats" Mean, Min, Max become three columns under each region column
#and "proc" becomes index
df_BU_piv = df_BU.pivot(columns='stats', index=df_BU.index)
df_BU_piv


Out[26]:
Bor_NAme contUSA Cent_NAme Trop_SAme ... SE_Asia Temp_Eurasia_Japan GLO Not affected
stats max mean min max mean min max mean min max ... min max mean min max mean min max mean min
proc
Agriwast 3 3 2 23 17 15 6 5 2 23 ... 18 22 20 17 206 195 178 0 0 0
BioBurBiof 1 1 0 2 1 0 1 1 0 6 ... 4 2 1 1 35 30 27 0 0 0
Fossil 3 2 1 16 11 9 3 2 0 19 ... 5 26 22 20 133 121 114 0 0 0
OtherNat -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 ... -99 -99 -99 -99 297 199 104 297 199 104
Wetlands 61 32 15 23 13 6 4 2 1 59 ... 18 6 3 1 227 185 153 0 0 0

5 rows × 48 columns


In [27]:
df_BU_piv.columns[0][0] #Bor_NAme
df_BU_piv.columns[3][0] #contUSA
df_BU_piv.columns[6][0] #CentName
df_BU_piv.columns[9][0] #Trop_SAme


Out[27]:
'Trop_SAme'

In [28]:
df_BU_piv['Bor_NAme']['mean']


Out[28]:
proc
Agriwast       3
BioBurBiof     1
Fossil         2
OtherNat     -99
Wetlands      32
Name: mean, dtype: int64

In [29]:
df_BU_piv['Bor_NAme'].loc['Agriwast']['mean']


Out[29]:
3

In [30]:
df_BU_piv['Bor_NAme'].loc['BioBurBiof']['mean']


Out[30]:
1

In [31]:
#Store region names in list
numRegions = df_BU_piv.shape[1] / 3

idx = 0
targets = []
for num in range(0,numRegions):    
    if df_BU_piv.columns[idx][0] != "GLO" : #exclude GLO column
      targets.append(df_BU_piv.columns[idx][0])
    idx = idx + 3
   
targets


Out[31]:
['Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan',
 'Not affected']

In [32]:
#Get target list
sources = df_BU_piv.index.tolist()
sources


Out[32]:
['Agriwast', 'BioBurBiof', 'Fossil', 'OtherNat', 'Wetlands']

In [33]:
nodes = sources + targets
nodes


Out[33]:
['Agriwast',
 'BioBurBiof',
 'Fossil',
 'OtherNat',
 'Wetlands',
 'Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan',
 'Not affected']

In [34]:
df_BU_piv


Out[34]:
Bor_NAme contUSA Cent_NAme Trop_SAme ... SE_Asia Temp_Eurasia_Japan GLO Not affected
stats max mean min max mean min max mean min max ... min max mean min max mean min max mean min
proc
Agriwast 3 3 2 23 17 15 6 5 2 23 ... 18 22 20 17 206 195 178 0 0 0
BioBurBiof 1 1 0 2 1 0 1 1 0 6 ... 4 2 1 1 35 30 27 0 0 0
Fossil 3 2 1 16 11 9 3 2 0 19 ... 5 26 22 20 133 121 114 0 0 0
OtherNat -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 ... -99 -99 -99 -99 297 199 104 297 199 104
Wetlands 61 32 15 23 13 6 4 2 1 59 ... 18 6 3 1 227 185 153 0 0 0

5 rows × 48 columns


In [35]:
#Replace -99 values with 0
#df_BU_piv = df_BU_piv.replace(-99, 1)
#df_BU_piv

#Replace -99 and 0 values with -1
#df_BU_piv = df_BU_piv.replace(-99, -1)
#df_BU_piv = df_BU_piv.replace(0, -1)
df_BU_piv


Out[35]:
Bor_NAme contUSA Cent_NAme Trop_SAme ... SE_Asia Temp_Eurasia_Japan GLO Not affected
stats max mean min max mean min max mean min max ... min max mean min max mean min max mean min
proc
Agriwast 3 3 2 23 17 15 6 5 2 23 ... 18 22 20 17 206 195 178 0 0 0
BioBurBiof 1 1 0 2 1 0 1 1 0 6 ... 4 2 1 1 35 30 27 0 0 0
Fossil 3 2 1 16 11 9 3 2 0 19 ... 5 26 22 20 133 121 114 0 0 0
OtherNat -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 ... -99 -99 -99 -99 297 199 104 297 199 104
Wetlands 61 32 15 23 13 6 4 2 1 59 ... 18 6 3 1 227 185 153 0 0 0

5 rows × 48 columns


In [47]:
#Write json file for Sankey, MEAN VALUES ONLY
#GLO omitted

file = open('../data/Sankey_BU_2003-2012_25MAy2016_mean_noZeros.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    print source
    for target in targets:
        print target
        #print df_BU_piv[target].loc[source]['mean']
        value = df_BU_piv[target].loc[source]['mean']
        #print "value:"
        #print value
        if value != -99:
            if value != 0:
                print value
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()


Agriwast
Bor_NAme
3
contUSA
17
Cent_NAme
5
Trop_SAme
21
Temp_SAme
6
NAfr
14
SAfr
6
Russia
5
Oceania
5
Europe
17
China
30
India
21
SE_Asia
22
Temp_Eurasia_Japan
20
Not affected
BioBurBiof
Bor_NAme
1
contUSA
1
Cent_NAme
1
Trop_SAme
3
Temp_SAme
NAfr
4
SAfr
6
Russia
2
Oceania
1
Europe
1
China
3
India
2
SE_Asia
5
Temp_Eurasia_Japan
1
Not affected
Fossil
Bor_NAme
2
contUSA
11
Cent_NAme
2
Trop_SAme
8
Temp_SAme
1
NAfr
9
SAfr
4
Russia
20
Oceania
2
Europe
6
China
24
India
3
SE_Asia
6
Temp_Eurasia_Japan
22
Not affected
OtherNat
Bor_NAme
contUSA
Cent_NAme
Trop_SAme
Temp_SAme
NAfr
SAfr
Russia
Oceania
Europe
China
India
SE_Asia
Temp_Eurasia_Japan
Not affected
199
Wetlands
Bor_NAme
32
contUSA
13
Cent_NAme
2
Trop_SAme
42
Temp_SAme
4
NAfr
8
SAfr
19
Russia
14
Oceania
3
Europe
4
China
5
India
6
SE_Asia
29
Temp_Eurasia_Japan
3
Not affected

In [50]:
#Write json file for Sankey, MIN VALUES ONLY
#GLO omitted

file = open('../data/Sankey_BU_2003-2012_25MAy2016_min.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    #print source
    for target in targets:
        #print target
        #print df_BU_piv[target].loc[source]['min']
        meanValue = df_BU_piv[target].loc[source]['mean']
        value = df_BU_piv[target].loc[source]['min']
        if meanValue != -99:
            if meanValue != 0:
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [51]:
#Write json file for Sankey, MAX VALUES ONLY
#GLO omitted

file = open('../data/Sankey_BU_2003-2012_25MAy2016_max.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    #print source
    for target in targets:
        #print target
        #print df_BU_piv[target].loc[source]['max']
        meanValue = df_BU_piv[target].loc[source]['mean']
        value = df_BU_piv[target].loc[source]['max']
        if meanValue != -99:
            if meanValue != 0:
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [20]:
#Write tsv file with min/max values
#file_name = "../data/statsBU.tsv"

#df_BU_piv.to_csv(file_name, sep='\t', encoding='utf-8')
#df_BU_piv

In [39]:
#junk = pd.read_csv("../data/Sankey_BU_2003-2012_25MAy2016.txt", header=1, delim_whitespace=True)
#junk.rename(columns = {'proc':'stats'}, inplace = True)

In [40]:
#Write tsv file with min/max values
#file_name = "../data/junk.tsv"

#junk.to_csv(file_name, sep='\t', encoding='utf-8')

In [ ]: