In [36]:
#Script to reformat methane data files for two-level Sankey
#Output: tsv file

#Created:           30.05.2016
#Last modified:

In [1]:
import numpy as np
import pandas as pd
import collections
import os
import xlrd

In [2]:
regions_dict = {
              "Bor_NAme": "1", "contUSA": "2", "contUSA": "3",
              "Trop_SAme": "4", "Temp_SAme": "5", "NAfr": "6",
              "SAfr": "7", "Russia": "8", "Oceania": "9",
              "Europe": "10", "China": "11", "India": "12",
              "SE_Asia": "13", "Temp_Eurasia_Japan": "14"
            };

In [3]:
df = pd.read_csv('test.csv', sep="\t", skiprows=[0], header = None)
#dataTD.columns = ["a", "b", "c", "d", "etc."]
df


Out[3]:
0 1 2 3 4 5
0 proc stats Bor_NAme contUSA Cent_NAme Trop_SAme
1 Wetlands mean 13 8 2 47
2 Wetlands min 7 6 2 30
3 Wetlands max 21 9 2 61
4 OtherNat mean 2 2 1 7
5 OtherNat min 1 1 1 5
6 OtherNat max 2 4 1 8
7 Agriwast mean 3 17 5 21
8 Agriwast min 2 15 6 19
9 Agriwast max 3 23 7 24
10 Fossil mean 2 11 2 5
11 Fossil min 1 9 1 1
12 Fossil max 2 15 3 8
13 BioBurBiof mean 1 0 1 6
14 BioBurBiof min 1 0 0 3
15 BioBurBiof max 1 1 1 7

In [4]:
df_TD = df.ix[1:]
df_TD.columns = ["proc", "stats", "Bor_NAme", "contUSA", "CentName", "Trop_SAme"]
df_TD


Out[4]:
proc stats Bor_NAme contUSA CentName Trop_SAme
1 Wetlands mean 13 8 2 47
2 Wetlands min 7 6 2 30
3 Wetlands max 21 9 2 61
4 OtherNat mean 2 2 1 7
5 OtherNat min 1 1 1 5
6 OtherNat max 2 4 1 8
7 Agriwast mean 3 17 5 21
8 Agriwast min 2 15 6 19
9 Agriwast max 3 23 7 24
10 Fossil mean 2 11 2 5
11 Fossil min 1 9 1 1
12 Fossil max 2 15 3 8
13 BioBurBiof mean 1 0 1 6
14 BioBurBiof min 1 0 0 3
15 BioBurBiof max 1 1 1 7

In [5]:
# Create dataframe from flux file

#xl_TD = pd.ExcelFile("Sankey_TD_2003-2012_25MAy2016.xls")
#xl_TD.sheet_names
#df_TD = xl.parse("Sankey_TD_2003-2012_25MAy2016",  skiprows=[0])
#df_TD.head(10)

In [6]:
#Pivot table so that Region names are listed in one Region column, and stats Mean, Min, Max become three columns
#http://stackoverflow.com/questions/29942167/transposing-one-column-in-python-pandas-with-the-simplest-index-possible
#dc.pivot(columns='disease_header', index='medicine', values='disease').reset_index()

df_TD_piv = df_TD.pivot(columns='stats', index='proc').reset_index()
#df05_piv = df05_piv.drop(['CLASS'], 1)
df_TD_piv


Out[6]:
proc Bor_NAme contUSA CentName Trop_SAme
stats max mean min max mean min max mean min max mean min
0 Agriwast 3 3 2 23 17 15 7 5 6 24 21 19
1 BioBurBiof 1 1 1 1 0 0 1 1 0 7 6 3
2 Fossil 2 2 1 15 11 9 3 2 1 8 5 1
3 OtherNat 2 2 1 4 2 1 1 1 1 8 7 5
4 Wetlands 21 13 7 9 8 6 2 2 2 61 47 30

In [7]:
test = df_TD.pivot('proc', 'stats')
test


Out[7]:
Bor_NAme contUSA CentName Trop_SAme
stats max mean min max mean min max mean min max mean min
proc
Agriwast 3 3 2 23 17 15 7 5 6 24 21 19
BioBurBiof 1 1 1 1 0 0 1 1 0 7 6 3
Fossil 2 2 1 15 11 9 3 2 1 8 5 1
OtherNat 2 2 1 4 2 1 1 1 1 8 7 5
Wetlands 21 13 7 9 8 6 2 2 2 61 47 30

In [8]:
#test.iloc[:,3].values #col values
test.columns[0][0] #Bor_NAme
test.columns[3][0] #contUSA
test.columns[6][0] #CentName
test.columns[9][0] #Trop_SAme


Out[8]:
'Trop_SAme'

In [49]:
test['Bor_NAme']['mean']


Out[49]:
proc
Agriwast       3
BioBurBiof     1
Fossil         2
OtherNat       2
Wetlands      13
Name: mean, dtype: object

In [58]:
test['Bor_NAme'].loc['Agriwast']['mean']


Out[58]:
'3'

In [59]:
test['Bor_NAme'].loc['BioBurBiof']['mean']


Out[59]:
'1'

In [28]:
#Store region names in list
numRegions = test.shape[1] / 3

idx = 0
sources = []
for num in range(0,numRegions):
    sources.append(test.columns[idx][0])
    idx = idx + 3
   
sources


Out[28]:
['Bor_NAme', 'contUSA', 'CentName', 'Trop_SAme']

In [41]:
#Get target list
targets = test.index.tolist()
targets


Out[41]:
['Agriwast', 'BioBurBiof', 'Fossil', 'OtherNat', 'Wetlands']

In [42]:
nodes = sources + targets
nodes


Out[42]:
['Bor_NAme',
 'contUSA',
 'CentName',
 'Trop_SAme',
 'Agriwast',
 'BioBurBiof',
 'Fossil',
 'OtherNat',
 'Wetlands']

In [60]:
idx = 0
for source in sources:
    print source
    for target in targets:
        #print idx        
        print target
        print test[source].loc[target]['mean']


Bor_NAme
Agriwast
3
BioBurBiof
1
Fossil
2
OtherNat
2
Wetlands
13
contUSA
Agriwast
17
BioBurBiof
0
Fossil
11
OtherNat
2
Wetlands
8
CentName
Agriwast
5
BioBurBiof
1
Fossil
2
OtherNat
1
Wetlands
2
Trop_SAme
Agriwast
21
BioBurBiof
6
Fossil
5
OtherNat
7
Wetlands
47

In [66]:
file = open('/homel/cnangini/PROJECTS/MethaneEmissions/sankey_TB_means.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    print source
    for target in targets:
        print target
        print test[source].loc[target]['mean']
        value = test[source].loc[target]['mean']
        file.write('{"source": "%s", "target": "%s", "mean": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()


Bor_NAme
Agriwast
3
BioBurBiof
1
Fossil
2
OtherNat
2
Wetlands
13
contUSA
Agriwast
17
BioBurBiof
0
Fossil
11
OtherNat
2
Wetlands
8
CentName
Agriwast
5
BioBurBiof
1
Fossil
2
OtherNat
1
Wetlands
2
Trop_SAme
Agriwast
21
BioBurBiof
6
Fossil
5
OtherNat
7
Wetlands
47

In [ ]: