In [36]:
#Script to reformat methane Bottom-Up (BU) data file for a three-level Sankey
#In BU approach, Sources = methane sources (categories/subcategories), Targets = regions

#Output: json file formatted for Sankey diagram

#Created:       31.05.2016
#Last modified: 02.06.2016 New data files from Marielle Saunois; remove GLO category from final file
#               03.06.2016 Add "Not affected" category; Exclude 0 and -99 values
#               06.06.2016 Output json files (Sankey format) with min and max values, respectively
#               07.06.2016 Min/max values output only when mean != -99 or != 0
#               09.06.2016 three levels

In [1]:
import collections
import json
import os

import numpy as np
import pandas as pd
import xlrd

In [3]:
#Subcategories
#Pour Wetlands, il n'y a pas de sous catégories.
#Pour Other Natural, tu pourrais essayer avec :
#  Fresh water (122)
#  Geological source (52)
#  Wild animals (10)
#  Termites( 9)
#  Permafrost and hydrates (3)
#  Wild fires (1)

122+52+10+9+3+3


Out[3]:
199

In [4]:
df_BU = pd.read_csv("../data/Sankey_BU_2003-2012_25MAy2016.txt", header=1, delim_whitespace=True)
df_BU


Out[4]:
proc Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35

In [5]:
df_BU.rename(columns = {'proc':'stats'}, inplace = True)
df_BU.index.name = 'proc'
df_BU


Out[5]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO
proc
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35

In [6]:
#Add "Not affected" column and set values to 0
df_BU['Not affected'] = 0
df_BU


Out[6]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO Not affected
proc
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185 0
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153 0
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227 0
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199 0
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104 0
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297 0
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195 0
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178 0
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206 0
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121 0
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114 0
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133 0
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30 0
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27 0
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35 0

In [7]:
#Copy GLO values for OtherNat into "Not affected" col
df_BU['Not affected']['OtherNat'] = df_BU['GLO']['OtherNat']


/usr/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app

In [8]:
df_BU


Out[8]:
stats Bor_NAme contUSA Cent_NAme Trop_SAme Temp_SAme NAfr SAfr Russia Oceania Europe China India SE_Asia Temp_Eurasia_Japan GLO Not affected
proc
Wetlands mean 32 13 2 42 4 8 19 14 3 4 5 6 29 3 185 0
Wetlands min 15 6 1 19 1 3 15 5 1 1 1 1 18 1 153 0
Wetlands max 61 23 4 59 7 16 22 26 6 7 9 13 35 6 227 0
OtherNat mean -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 199 199
OtherNat min -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 104 104
OtherNat max -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 297 297
Agriwast mean 3 17 5 21 6 14 6 5 5 17 30 21 22 20 195 0
Agriwast min 2 15 2 18 5 12 5 5 5 15 23 16 18 17 178 0
Agriwast max 3 23 6 23 6 18 7 5 5 18 36 24 25 22 206 0
Fossil mean 2 11 2 8 1 9 4 20 2 6 24 3 6 22 121 0
Fossil min 1 9 0 3 1 7 3 18 1 3 15 2 5 20 114 0
Fossil max 3 16 3 19 1 14 5 26 2 8 31 4 7 26 133 0
BioBurBiof mean 1 1 1 3 0 4 6 2 1 1 3 2 5 1 30 0
BioBurBiof min 0 0 0 1 0 3 4 0 0 0 2 2 4 1 27 0
BioBurBiof max 1 2 1 6 1 5 10 4 1 1 4 3 5 2 35 0

In [159]:
#sub_df = pd.read_csv("../data/Sankey_BU_subcategories.csv", header=1, sep="\;")
sub_df = pd.read_csv("../data/Sankey_BU_subcategories_withFake.csv", header=1, sep="\;")
sub_df


Out[159]:
proc stats Fossil Agriwast BioBurBiof OtherNat Wetlands
0 Coal mean 41 0 0 0 0
1 Coal min 26 0 0 0 0
2 Coal max 50 0 0 0 0
3 GasAndOilIndustry mean 79 0 0 0 0
4 GasAndOilIndustry min 69 0 0 0 0
5 GasAndOilIndustry max 88 0 0 0 0
6 EntericFermentationAndManure mean 0 106 0 0 0
7 EntericFermentationAndManure min 0 97 0 0 0
8 EntericFermentationAndManure max 0 111 0 0 0
9 LandfillsAndWaste mean 0 59 0 0 0
10 LandfillsAndWaste min 0 52 0 0 0
11 LandfillsAndWaste max 0 63 0 0 0
12 Rice mean 0 30 0 0 0
13 Rice min 0 24 0 0 0
14 Rice max 0 36 0 0 0
15 BiomassBurning mean 0 0 18 0 0
16 BiomassBurning min 0 0 15 0 0
17 BiomassBurning max 0 0 21 0 0
18 BiofuelBurning mean 0 0 12 0 0
19 BiofuelBurning min 0 0 10 0 0
20 BiofuelBurning max 0 0 14 0 0
21 Freshwater mean 0 0 0 122 0
22 Freshwater min 0 0 0 60 0
23 Freshwater max 0 0 0 180 0
24 GeologicalSources mean 0 0 0 52 0
25 GeologicalSources min 0 0 0 -999 0
26 GeologicalSources max 0 0 0 -1000 0
27 WildAnimals mean 0 0 0 10 0
28 WildAnimals min 0 0 0 5 0
29 WildAnimals max 0 0 0 15 0
30 Termites mean 0 0 0 9 0
31 Termites min 0 0 0 3 0
32 Termites max 0 0 0 15 0
33 PermafrostAndHydrates mean 0 0 0 3 0
34 PermafrostAndHydrates min 0 0 0 -888 0
35 PermafrostAndHydrates max 0 0 0 -889 0
36 Wildfires mean 0 0 0 3 0
37 Wildfires min 0 0 0 1 0
38 Wildfires max 0 0 0 5 0
39 fakeSource mean 0 0 0 0 100
40 fakeSource min 0 0 0 0 1
41 fakeSource max 0 0 0 0 10

In [160]:
sub_df['proc']


Out[160]:
0                             Coal
1                             Coal
2                             Coal
3                GasAndOilIndustry
4                GasAndOilIndustry
5                GasAndOilIndustry
6     EntericFermentationAndManure
7     EntericFermentationAndManure
8     EntericFermentationAndManure
9                LandfillsAndWaste
10               LandfillsAndWaste
11               LandfillsAndWaste
12                            Rice
13                            Rice
14                            Rice
15                  BiomassBurning
16                  BiomassBurning
17                  BiomassBurning
18                  BiofuelBurning
19                  BiofuelBurning
20                  BiofuelBurning
21                      Freshwater
22                      Freshwater
23                      Freshwater
24               GeologicalSources
25               GeologicalSources
26               GeologicalSources
27                     WildAnimals
28                     WildAnimals
29                     WildAnimals
30                        Termites
31                        Termites
32                        Termites
33           PermafrostAndHydrates
34           PermafrostAndHydrates
35           PermafrostAndHydrates
36                       Wildfires
37                       Wildfires
38                       Wildfires
39                      fakeSource
40                      fakeSource
41                      fakeSource
Name: proc, dtype: object

In [161]:
sub_df.index = sub_df['proc']
sub_df = sub_df.drop(['proc'], axis=1)
sub_df.head(10)


Out[161]:
stats Fossil Agriwast BioBurBiof OtherNat Wetlands
proc
Coal mean 41 0 0 0 0
Coal min 26 0 0 0 0
Coal max 50 0 0 0 0
GasAndOilIndustry mean 79 0 0 0 0
GasAndOilIndustry min 69 0 0 0 0
GasAndOilIndustry max 88 0 0 0 0
EntericFermentationAndManure mean 0 106 0 0 0
EntericFermentationAndManure min 0 97 0 0 0
EntericFermentationAndManure max 0 111 0 0 0
LandfillsAndWaste mean 0 59 0 0 0

In [162]:
#Pivot table so that "stats" Mean, Min, Max become three columns under each region column
#and "proc" becomes index
df_BU_piv = df_BU.pivot(columns='stats', index=df_BU.index)
df_BU_piv


Out[162]:
Bor_NAme contUSA Cent_NAme Trop_SAme ... SE_Asia Temp_Eurasia_Japan GLO Not affected
stats max mean min max mean min max mean min max ... min max mean min max mean min max mean min
proc
Agriwast 3 3 2 23 17 15 6 5 2 23 ... 18 22 20 17 206 195 178 0 0 0
BioBurBiof 1 1 0 2 1 0 1 1 0 6 ... 4 2 1 1 35 30 27 0 0 0
Fossil 3 2 1 16 11 9 3 2 0 19 ... 5 26 22 20 133 121 114 0 0 0
OtherNat -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 ... -99 -99 -99 -99 297 199 104 297 199 104
Wetlands 61 32 15 23 13 6 4 2 1 59 ... 18 6 3 1 227 185 153 0 0 0

5 rows × 48 columns


In [163]:
#Pivot table so that "stats" Mean, Min, Max become three columns under each region column
#and "proc" becomes index
sub_df_piv = sub_df.pivot(columns='stats', index=sub_df.index)
sub_df_piv


Out[163]:
Fossil Agriwast BioBurBiof OtherNat Wetlands
stats max min mean max min mean max min mean max min mean max min mean
proc
BiofuelBurning 0 0 0 0 0 0 14 10 12 0 0 0 0 0 0
BiomassBurning 0 0 0 0 0 0 21 15 18 0 0 0 0 0 0
Coal 50 26 41 0 0 0 0 0 0 0 0 0 0 0 0
EntericFermentationAndManure 0 0 0 111 97 106 0 0 0 0 0 0 0 0 0
Freshwater 0 0 0 0 0 0 0 0 0 180 60 122 0 0 0
GasAndOilIndustry 88 69 79 0 0 0 0 0 0 0 0 0 0 0 0
GeologicalSources 0 0 0 0 0 0 0 0 0 -1000 -999 52 0 0 0
LandfillsAndWaste 0 0 0 63 52 59 0 0 0 0 0 0 0 0 0
PermafrostAndHydrates 0 0 0 0 0 0 0 0 0 -889 -888 3 0 0 0
Rice 0 0 0 36 24 30 0 0 0 0 0 0 0 0 0
Termites 0 0 0 0 0 0 0 0 0 15 3 9 0 0 0
WildAnimals 0 0 0 0 0 0 0 0 0 15 5 10 0 0 0
Wildfires 0 0 0 0 0 0 0 0 0 5 1 3 0 0 0
fakeSource 0 0 0 0 0 0 0 0 0 0 0 0 10 1 100

In [10]:
df_BU_piv.columns[0][0] #Bor_NAme
df_BU_piv.columns[3][0] #contUSA
df_BU_piv.columns[6][0] #CentName
df_BU_piv.columns[9][0] #Trop_SAme


Out[10]:
'Trop_SAme'

In [11]:
df_BU_piv['Bor_NAme']['mean']


Out[11]:
proc
Agriwast       3
BioBurBiof     1
Fossil         2
OtherNat     -99
Wetlands      32
Name: mean, dtype: int64

In [12]:
df_BU_piv['Bor_NAme'].loc['Agriwast']['mean']


Out[12]:
3

In [13]:
df_BU_piv['Bor_NAme'].loc['BioBurBiof']['mean']


Out[13]:
1

In [165]:
#Create regions, categories and subcategories array

#regions
#Store region names in list
numRegions = df_BU_piv.shape[1] / 3

idx = 0
regions = []
for num in range(0,numRegions):    
    if df_BU_piv.columns[idx][0] != "GLO" : #exclude GLO column
      regions.append(df_BU_piv.columns[idx][0])
    idx = idx + 3

#categories    
categories = df_BU_piv.index.tolist()
 
#subcategories
subcategories = sub_df.index.tolist()
subcategories = list(set(subcategories)) #finds unique values
   
regions
categories
subcategories


Out[165]:
['Termites',
 'Wildfires',
 'Freshwater',
 'WildAnimals',
 'EntericFermentationAndManure',
 'fakeSource',
 'LandfillsAndWaste',
 'Coal',
 'BiomassBurning',
 'GeologicalSources',
 'PermafrostAndHydrates',
 'Rice',
 'BiofuelBurning',
 'GasAndOilIndustry']

In [110]:
#Create sources array
#sources = df_BU_piv.index.tolist()
#sources


Out[110]:
['Agriwast', 'BioBurBiof', 'Fossil', 'OtherNat', 'Wetlands']

In [166]:
sub_df_piv.index.tolist()


Out[166]:
['BiofuelBurning',
 'BiomassBurning',
 'Coal',
 'EntericFermentationAndManure',
 'Freshwater',
 'GasAndOilIndustry',
 'GeologicalSources',
 'LandfillsAndWaste',
 'PermafrostAndHydrates',
 'Rice',
 'Termites',
 'WildAnimals',
 'Wildfires',
 'fakeSource']

In [167]:
nodes = categories + regions + subcategories
nodes


Out[167]:
['Agriwast',
 'BioBurBiof',
 'Fossil',
 'OtherNat',
 'Wetlands',
 'Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan',
 'Not affected',
 'Termites',
 'Wildfires',
 'Freshwater',
 'WildAnimals',
 'EntericFermentationAndManure',
 'fakeSource',
 'LandfillsAndWaste',
 'Coal',
 'BiomassBurning',
 'GeologicalSources',
 'PermafrostAndHydrates',
 'Rice',
 'BiofuelBurning',
 'GasAndOilIndustry']

In [168]:
sources = categories + subcategories
sources


Out[168]:
['Agriwast',
 'BioBurBiof',
 'Fossil',
 'OtherNat',
 'Wetlands',
 'Termites',
 'Wildfires',
 'Freshwater',
 'WildAnimals',
 'EntericFermentationAndManure',
 'fakeSource',
 'LandfillsAndWaste',
 'Coal',
 'BiomassBurning',
 'GeologicalSources',
 'PermafrostAndHydrates',
 'Rice',
 'BiofuelBurning',
 'GasAndOilIndustry']

In [169]:
targets = regions + categories
targets


Out[169]:
['Bor_NAme',
 'contUSA',
 'Cent_NAme',
 'Trop_SAme',
 'Temp_SAme',
 'NAfr',
 'SAfr',
 'Russia',
 'Oceania',
 'Europe',
 'China',
 'India',
 'SE_Asia',
 'Temp_Eurasia_Japan',
 'Not affected',
 'Agriwast',
 'BioBurBiof',
 'Fossil',
 'OtherNat',
 'Wetlands']

In [170]:
#Replace -99 values with 0
#df_BU_piv = df_BU_piv.replace(-99, 1)
#df_BU_piv

#Replace -99 and 0 values with -1
#df_BU_piv = df_BU_piv.replace(-99, -1)
#df_BU_piv = df_BU_piv.replace(0, -1)
df_BU_piv


Out[170]:
Bor_NAme contUSA Cent_NAme Trop_SAme ... SE_Asia Temp_Eurasia_Japan GLO Not affected
stats max mean min max mean min max mean min max ... min max mean min max mean min max mean min
proc
Agriwast 3 3 2 23 17 15 6 5 2 23 ... 18 22 20 17 206 195 178 0 0 0
BioBurBiof 1 1 0 2 1 0 1 1 0 6 ... 4 2 1 1 35 30 27 0 0 0
Fossil 3 2 1 16 11 9 3 2 0 19 ... 5 26 22 20 133 121 114 0 0 0
OtherNat -99 -99 -99 -99 -99 -99 -99 -99 -99 -99 ... -99 -99 -99 -99 297 199 104 297 199 104
Wetlands 61 32 15 23 13 6 4 2 1 59 ... 18 6 3 1 227 185 153 0 0 0

5 rows × 48 columns


In [190]:
sub_df_piv


Out[190]:
Fossil Agriwast BioBurBiof OtherNat Wetlands
stats max min mean max min mean max min mean max min mean max min mean
proc
BiofuelBurning 0 0 0 0 0 0 14 10 12 0 0 0 0 0 0
BiomassBurning 0 0 0 0 0 0 21 15 18 0 0 0 0 0 0
Coal 50 26 41 0 0 0 0 0 0 0 0 0 0 0 0
EntericFermentationAndManure 0 0 0 111 97 106 0 0 0 0 0 0 0 0 0
Freshwater 0 0 0 0 0 0 0 0 0 180 60 122 0 0 0
GasAndOilIndustry 88 69 79 0 0 0 0 0 0 0 0 0 0 0 0
GeologicalSources 0 0 0 0 0 0 0 0 0 -1000 -999 52 0 0 0
LandfillsAndWaste 0 0 0 63 52 59 0 0 0 0 0 0 0 0 0
PermafrostAndHydrates 0 0 0 0 0 0 0 0 0 -889 -888 3 0 0 0
Rice 0 0 0 36 24 30 0 0 0 0 0 0 0 0 0
Termites 0 0 0 0 0 0 0 0 0 15 3 9 0 0 0
WildAnimals 0 0 0 0 0 0 0 0 0 15 5 10 0 0 0
Wildfires 0 0 0 0 0 0 0 0 0 5 1 3 0 0 0
fakeSource 0 0 0 0 0 0 0 0 0 0 0 0 10 1 100

In [206]:
targets #regions + categories

#df_BU_piv['Bor_NAme'].loc['Agriwast']['mean']
sub_df_piv['OtherNat'].loc['Freshwater'][2] #[0] = max, [1] = min, [2] = mean

sub_df_piv['Agriwast'].loc['Termites'][2] #[0] = max, [1] = min, [2] = mean


Out[206]:
0

In [216]:
#Write json file for 3-level Sankey, MEAN VALUES ONLY
#GLO omitted

file = open('../data/Sankey_BU_2003-2012_25MAy2016_mean_3levels.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

#source-target pairs for categories -> regions
for category in categories:
    #print category
    for region in regions:
        #print region
        value = df_BU_piv[region].loc[category]['mean']     
        if value != -99:
            if value != 0:
                #print value
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(category, region, float(value)))                
    
#source-target pairs for subcategories -> categories
for subcategory in subcategories:    
    for category in categories:    
        value2 = sub_df_piv[category].loc[subcategory][2] #[0] = max, [1] = min, [2] = mean
        if value2 != 0:
            file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(subcategory, category, float(value2)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [50]:
#Write json file for 3-level Sankey, MIN VALUES ONLY
#GLO omitted

file = open('../data/Sankey_BU_2003-2012_25MAy2016_min_3levels.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

#source-target pairs for categories -> regions
for category in categories:
    for region in regions:
        meanValue = df_BU_piv[region].loc[category]['mean']
        value = df_BU_piv[region].loc[category]['min']
        if meanValue != -99:
            if meanValue != 0:
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(category, region, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [51]:
#Write json file for Sankey, MAX VALUES ONLY
#GLO omitted

file = open('../data/Sankey_BU_2003-2012_25MAy2016_max_3levels.json', 'w')

file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
    file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')

file.write('"links": [\n')

for source in sources:
    #print source
    for target in targets:
        #print target
        #print df_BU_piv[target].loc[source]['max']
        meanValue = df_BU_piv[target].loc[source]['mean']
        value = df_BU_piv[target].loc[source]['max']
        if meanValue != -99:
            if meanValue != 0:
                file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))

# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')

file.close()

In [20]:
#Write tsv file with min/max values
#file_name = "../data/statsBU.tsv"

#df_BU_piv.to_csv(file_name, sep='\t', encoding='utf-8')
#df_BU_piv

In [39]:
#junk = pd.read_csv("../data/Sankey_BU_2003-2012_25MAy2016.txt", header=1, delim_whitespace=True)
#junk.rename(columns = {'proc':'stats'}, inplace = True)

In [40]:
#Write tsv file with min/max values
#file_name = "../data/junk.tsv"

#junk.to_csv(file_name, sep='\t', encoding='utf-8')

In [ ]: