https://github.com/Pierre-Sassoulas/pySankey https://github.com/anazalea/pySankey
(base) D:\dataanalysis-herocoli-redmetrics>pip install pySankey
Collecting pySankey
Downloading https://files.pythonhosted.org/packages/ad/fc/c2823db63e0efe3652759a4de2859ed02fe5479bb1ba720605cd789121e5/pySankey-0.0.1-py3-none-any.whl
distributed 1.21.8 requires msgpack, which is not installed.
Installing collected packages: pySankey
Successfully installed pySankey-0.0.1
You are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.
In [ ]:
import matplotlib
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pySankey import sankey
With the current implementation of pySankey, it is impossible to guarantee that left labels and right labels will be sorted. Left and right labels are instantiated on the fly and are not rolled back if a lower label is drawn afterwards.
If 0 -> 1 is drawn before 1 -> 0, right 1 will be below right 0.
If 0 -> 1 is drawn after 1 -> 0, left 1 will be below left 0.
Working with a modified version here.
In [ ]:
# Hand-built flows: entry i of each list describes one strip going from
# left[i] to right[i]. The number in parentheses in a label equals the summed
# weight of the strips attached to that label.
left = [
    '0 (7)', '1 (55)', '0 (7)', '0 (7)', '1 (55)',
    '1 (55)', '2 (1)', '3 (26)', '3 (26)', '3 (26)',
]
right = [
    '0 (10)', '1 (3)', '2 (3)', '3 (73)', '0 (10)',
    '3 (73)', '3 (73)', '0 (10)', '2 (3)', '3 (73)',
]
# Every strip has the same width on both sides.
leftWeight = [2.0, 3.0, 1.0, 4.0, 7.0, 45.0, 1.0, 1.0, 2.0, 23.0]
rightWeight = list(leftWeight)

# With real data, the sorted 'pretest' / 'posttest' columns of classesDF
# could be passed as left / right instead of the lists above.
sankey.sankey(
    left=left,
    right=right,
    leftWeight=leftWeight,
    rightWeight=rightWeight,
    aspect=20,
    fontsize=20,
    figureName="testSankey"
)
In [ ]:
from pySankey import sankey
In [ ]:
# Same kind of flows as the previous example, but the first strip now starts
# from label '1' on the left, so a higher label is encountered before a lower one.
left = [
    '1 (55)', '0 (7)', '1 (55)', '0 (7)', '0 (7)',
    '1 (55)', '2 (1)', '3 (26)', '3 (26)', '3 (26)',
]
right = [
    '0 (8)', '1 (5)', '1 (5)', '2 (3)', '3 (73)',
    '3 (73)', '3 (73)', '0 (8)', '2 (3)', '3 (73)',
]
# Every strip has the same width on both sides.
leftWeight = [7.0, 2.0, 3.0, 1.0, 4.0, 45.0, 1.0, 1.0, 2.0, 23.0]
rightWeight = list(leftWeight)

# closePlot is passed explicitly as False here.
# NOTE(review): closePlot presumably comes from the modified pySankey
# mentioned above - confirm against that version's signature.
sankey.sankey(
    left=left,
    right=right,
    leftWeight=leftWeight,
    rightWeight=rightWeight,
    aspect=20,
    fontsize=20,
    figureName="testSankey",
    closePlot=False,
)
In [ ]:
# Left side is listed in ascending label order; on the right side label '1'
# appears before label '0'.
left = [
    '0 (7)', '1 (55)', '0 (7)', '0 (7)', '1 (55)',
    '1 (55)', '2 (1)', '3 (26)', '3 (26)', '3 (26)',
]
right = [
    '1 (5)', '1 (5)', '2 (3)', '3 (73)', '0 (8)',
    '3 (73)', '3 (73)', '0 (8)', '2 (3)', '3 (73)',
]
# Every strip has the same width on both sides.
leftWeight = [2.0, 3.0, 1.0, 4.0, 7.0, 45.0, 1.0, 1.0, 2.0, 23.0]
rightWeight = list(leftWeight)

# Same call as the previous example except that closePlot is True.
sankey.sankey(
    left=left,
    right=right,
    leftWeight=leftWeight,
    rightWeight=rightWeight,
    aspect=20,
    fontsize=20,
    figureName="testSankey",
    closePlot=True,
)
In [ ]:
# Load the customers/goods table with explicit column names.
df = pd.read_csv(
    'customers-goods.csv',
    sep=',',
    names=['id', 'customer', 'good', 'revenue'],
)

# The first row is skipped everywhere below ([1:]).
# NOTE(review): presumably row 0 is the file's own header line, read as data
# because names= was given - confirm against the CSV.
rightWeight = df['revenue'].values[1:].astype(float)
# Left strips are drawn twice as wide as the right ones.
leftWeight = 2 * rightWeight

sankey.sankey(
    left=df['customer'].values[1:],
    right=df['good'].values[1:],
    leftWeight=leftWeight,
    rightWeight=rightWeight,
    aspect=20,
    fontsize=20,
    figureName=None,
    closePlot=True,
)
In [ ]:
# Toy pre/post classification table: one row per good/bad combination.
classesDF = pd.DataFrame(
    {
        'pretest': ['good', 'good', 'bad', 'bad'],
        'posttest': ['good', 'bad', 'good', 'bad'],
    },
    index=['1', '2', '3', '4'],
    columns=['pretest', 'posttest'],
)

# One weight of 10.0 per row, as a float array.
weight = pd.Series(10, index=classesDF.index).values.astype(float)

sankey.sankey(
    left=classesDF['pretest'].values,
    right=classesDF['posttest'].values,
    leftWeight=weight,
    rightWeight=weight,
    aspect=20,
    fontsize=20,
    figureName=None,
    closePlot=False,
)
In [ ]:
# Show the length of the current rightWeight next to the number of rows in df.
# Which rightWeight / df this refers to depends on the cells executed before.
len(rightWeight), len(df.index)
In [ ]:
# Display the current contents of the weight variable.
weight
In [ ]:
# Print the backend names exposed by matplotlib.rcsetup.
from matplotlib import rcsetup

print(rcsetup.all_backends)
In [ ]:
# Check which pySankey package is actually imported: print its public names,
# then the location(s) it was loaded from.
import pySankey

for package_info in (dir(pySankey), pySankey.__path__):
    print(package_info)
In [ ]:
import pandas as pd
from pySankey import sankey
# Limit how many rows pandas displays.
pd.options.display.max_rows = 8

# Space-separated file with two columns, named 'true' and 'predicted' here.
df = pd.read_csv('fruits.txt', sep=' ', names=['true', 'predicted'])

# Hex colour per label.
colorDict = {
    'apple': '#f71b1b',
    'blueberry': '#1b7ef7',
    'banana': '#f3f71b',
    'lime': '#12e23f',
    'orange': '#f78c1b',
}

sankey.sankey(
    df['true'],
    df['predicted'],
    aspect=20,
    colorDict=colorDict,
    fontsize=12,
    figureName=None,
)
# NOTE(review): figureName is None here, so presumably no "fruit.png" is
# written; upstream pySankey only saves a file when figureName is set
# (e.g. figureName="fruit") - confirm for the modified version in use.
In [ ]:
# Count how many rows carry each value of the 'true' column.
df['true'].value_counts()
In [ ]:
# Count how many rows carry each value of the 'predicted' column.
df['predicted'].value_counts()
In [ ]:
# Load the customers/goods table with explicit column names.
df = pd.read_csv(
    'customers-goods.csv',
    sep=',',
    names=['id', 'customer', 'good', 'revenue'],
)

# Drop the row labelled 0, then use the 'id' column as the index.
# NOTE(review): presumably row 0 is the file's own header line - confirm.
df = df.drop(0).set_index('id')

# Convert revenue to floats so it can be used as strip weights.
df['revenue'] = df['revenue'].apply(float)

sankey.sankey(
    left=df['customer'],
    right=df['good'],
    rightWeight=df['revenue'],
    aspect=20,
    fontsize=20,
    figureName=None,
)
# NOTE(review): figureName is None here, so presumably no "customer-good.png"
# is written; upstream pySankey only saves a file when figureName is set
# (e.g. figureName="customer-good") - confirm for the modified version in use.