In [36]:
#Script to reformat methane Bottom-Up (BU) data file for two-level Sankey
#In BU approach, Sources = methane sources, Targets = regions
#Output: json file formatted for Sankey diagram
#Created: 31.05.2016
#Last modified: 02.06.2016 New data files from Marielle Saunois; remove GLO category from final file
# 03.06.2016 Add "Not affected" category; Exclude 0 and -99 values
# 06.06.2016 Output json files (Sankey format) with min and max values, respectively
# 07.06.2016 Min/max values output only when mean != -99 or != 0
# 09.06.2016 three levels
In [1]:
import numpy as np
import pandas as pd
import collections
import os
import xlrd
In [3]:
#Subcategories
#Pour Wetlands, il n'y a pas de sous catégories.
#Pour Other Natural, tu pourrais essayer avec :
# Fresh water (122)
# Geological source (52)
# Wild animals (10)
# Termites( 9)
# Permafrost and hydrates (3)
# Wild fires (1)
122+52+10+9+3+3
Out[3]:
In [4]:
df_BU = pd.read_csv("../data/Sankey_BU_2003-2012_25MAy2016.txt", header=1, delim_whitespace=True)
df_BU
Out[4]:
In [5]:
df_BU.rename(columns = {'proc':'stats'}, inplace = True)
df_BU.index.name = 'proc'
df_BU
Out[5]:
In [6]:
#Add "Not affected" column and set values to 0
df_BU['Not affected'] = 0
df_BU
Out[6]:
In [7]:
#Copy GLO values for OtherNat into "Not affected" col
df_BU['Not affected']['OtherNat'] = df_BU['GLO']['OtherNat']
In [8]:
df_BU
Out[8]:
In [159]:
#sub_df = pd.read_csv("../data/Sankey_BU_subcategories.csv", header=1, sep="\;")
sub_df = pd.read_csv("../data/Sankey_BU_subcategories_withFake.csv", header=1, sep="\;")
sub_df
Out[159]:
In [160]:
sub_df['proc']
Out[160]:
In [161]:
sub_df.index = sub_df['proc']
sub_df = sub_df.drop(['proc'], axis=1)
sub_df.head(10)
Out[161]:
In [162]:
#Pivot table so that "stats" Mean, Min, Max become three columns under each region column
#and "proc" becomes index
df_BU_piv = df_BU.pivot(columns='stats', index=df_BU.index)
df_BU_piv
Out[162]:
In [163]:
#Pivot table so that "stats" Mean, Min, Max become three columns under each region column
#and "proc" becomes index
sub_df_piv = sub_df.pivot(columns='stats', index=sub_df.index)
sub_df_piv
Out[163]:
In [10]:
df_BU_piv.columns[0][0] #Bor_NAme
df_BU_piv.columns[3][0] #contUSA
df_BU_piv.columns[6][0] #CentName
df_BU_piv.columns[9][0] #Trop_SAme
Out[10]:
In [11]:
df_BU_piv['Bor_NAme']['mean']
Out[11]:
In [12]:
df_BU_piv['Bor_NAme'].loc['Agriwast']['mean']
Out[12]:
In [13]:
df_BU_piv['Bor_NAme'].loc['BioBurBiof']['mean']
Out[13]:
In [165]:
#Create regions, categories and subcategories array
#regions
#Store region names in list
numRegions = df_BU_piv.shape[1] / 3
idx = 0
regions = []
for num in range(0,numRegions):
if df_BU_piv.columns[idx][0] != "GLO" : #exclude GLO column
regions.append(df_BU_piv.columns[idx][0])
idx = idx + 3
#categories
categories = df_BU_piv.index.tolist()
#subcategories
subcategories = sub_df.index.tolist()
subcategories = list(set(subcategories)) #finds unique values
regions
categories
subcategories
Out[165]:
In [110]:
#Create sources array
#sources = df_BU_piv.index.tolist()
#sources
Out[110]:
In [166]:
sub_df_piv.index.tolist()
Out[166]:
In [167]:
nodes = categories + regions + subcategories
nodes
Out[167]:
In [168]:
sources = categories + subcategories
sources
Out[168]:
In [169]:
targets = regions + categories
targets
Out[169]:
In [170]:
#Replace -99 values with 0
#df_BU_piv = df_BU_piv.replace(-99, 1)
#df_BU_piv
#Replace -99 and 0 values with -1
#df_BU_piv = df_BU_piv.replace(-99, -1)
#df_BU_piv = df_BU_piv.replace(0, -1)
df_BU_piv
Out[170]:
In [190]:
sub_df_piv
Out[190]:
In [206]:
targets #regions + categories
#df_BU_piv['Bor_NAme'].loc['Agriwast']['mean']
sub_df_piv['OtherNat'].loc['Freshwater'][2] #[0] = max, [1] = min, [2] = mean
sub_df_piv['Agriwast'].loc['Termites'][2] #[0] = max, [1] = min, [2] = mean
Out[206]:
In [216]:
#Write json file for 3-level Sankey, MEAN VALUES ONLY
#GLO omitted
file = open('../data/Sankey_BU_2003-2012_25MAy2016_mean_3levels.json', 'w')
file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')
file.write('"links": [\n')
#source-target pairs for categories -> regions
for category in categories:
#print category
for region in regions:
#print region
value = df_BU_piv[region].loc[category]['mean']
if value != -99:
if value != 0:
#print value
file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(category, region, float(value)))
#source-target pairs for subcategories -> categories
for subcategory in subcategories:
for category in categories:
value2 = sub_df_piv[category].loc[subcategory][2] #[0] = max, [1] = min, [2] = mean
if value2 != 0:
file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(subcategory, category, float(value2)))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')
file.close()
In [50]:
#Write json file for 3-level Sankey, MIN VALUES ONLY
#GLO omitted
file = open('../data/Sankey_BU_2003-2012_25MAy2016_min_3levels.json', 'w')
file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')
file.write('"links": [\n')
#source-target pairs for categories -> regions
for category in categories:
for region in regions:
meanValue = df_BU_piv[region].loc[category]['mean']
value = df_BU_piv[region].loc[category]['min']
if meanValue != -99:
if meanValue != 0:
file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(category, region, float(value)))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')
file.close()
In [51]:
#Write json file for Sankey, MAX VALUES ONLY
#GLO omitted
file = open('../data/Sankey_BU_2003-2012_25MAy2016_max_3levels.json', 'w')
file.write('{\n')
file.write('"nodes": [\n')
for node in nodes:
file.write('{"name": "%s"},\n' %(node))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n],\n')
file.write('"links": [\n')
for source in sources:
#print source
for target in targets:
#print target
#print df_BU_piv[target].loc[source]['max']
meanValue = df_BU_piv[target].loc[source]['mean']
value = df_BU_piv[target].loc[source]['max']
if meanValue != -99:
if meanValue != 0:
file.write('{"source": "%s", "target": "%s", "value": "%.2f"},\n' %(source, target, float(value)))
# remove last comma
file.seek(-2, os.SEEK_END)
file.truncate()
file.write('\n]\n')
file.write('}\n')
file.close()
In [20]:
#Write tsv file with min/max values
#file_name = "../data/statsBU.tsv"
#df_BU_piv.to_csv(file_name, sep='\t', encoding='utf-8')
#df_BU_piv
In [39]:
#junk = pd.read_csv("../data/Sankey_BU_2003-2012_25MAy2016.txt", header=1, delim_whitespace=True)
#junk.rename(columns = {'proc':'stats'}, inplace = True)
In [40]:
#Write tsv file with min/max values
#file_name = "../data/junk.tsv"
#junk.to_csv(file_name, sep='\t', encoding='utf-8')
In [ ]: