Notebook accompanying article on PB Python
In [1]:
import pandas as pd
import jenkspy
Create an example dataframe
In [2]:
# Sample data: one row per account together with its total sales.
sales = dict(
    account=[
        'Jones Inc', 'Alpha Co', 'Blue Inc', 'Super Star Inc', 'Wamo',
        'Next Gen', 'Giga Co', 'IniTech', 'Beta LLC',
    ],
    Total=[1500, 2100, 50, 20, 75, 1100, 950, 1300, 1400],
)
df = pd.DataFrame(data=sales)
In [3]:
# Display the accounts ordered by ascending total.
df.sort_values(by=['Total'])
Out[3]:
Try cutting the data using qcut
In [4]:
# qcut splits on quantiles; q=2 divides the accounts at the median total.
df['quantile'] = pd.qcut(
    df['Total'],
    2,
    labels=['bucket_1', 'bucket_2'],
)
In [5]:
# Display the frame ordered by total, now including the 'quantile' column.
df.sort_values(by=['Total'])
Out[5]:
Compare with the equal-width bins produced by cut
In [6]:
# cut with an integer bin count splits the range of totals into
# equal-width intervals (here: two).
df['cut_bins'] = pd.cut(
    df['Total'], bins=2, labels=['bucket_1', 'bucket_2'])
In [7]:
# Display the frame ordered by total, now including the 'cut_bins' column.
df.sort_values(by=['Total'])
Out[7]:
Show how jenkspy works
In [8]:
# Compute the Jenks natural-breaks boundaries that split the totals into
# two classes.
# The class count is passed positionally: its keyword was renamed from
# `nb_class` to `n_classes` in newer jenkspy releases, and the positional
# form works under either name.
breaks = jenkspy.jenks_breaks(df['Total'], 2)
print(breaks)
In [9]:
# Use the Jenks breaks as explicit bin edges.
# With pd.cut's defaults the first interval is open on the left, so a value
# equal to the lowest edge is not assigned to any bucket and shows up as NaN.
jenks_buckets = pd.cut(
    df['Total'], bins=breaks, labels=['bucket_1', 'bucket_2'])
df['cut_jenks'] = jenks_buckets
df.sort_values(by=['Total'])
Out[9]:
Fix the NaN by using include_lowest=True, so the minimum value (which equals the first break) falls inside the first bin
In [10]:
# Same Jenks-based binning, but include_lowest=True closes the first
# interval on the left so the lowest edge value lands in the first bucket.
jenks_buckets_v2 = pd.cut(
    df['Total'],
    bins=breaks,
    labels=['bucket_1', 'bucket_2'],
    include_lowest=True,
)
df['cut_jenksv2'] = jenks_buckets_v2
df.sort_values(by=['Total'])
Out[10]:
Try some other examples
In [11]:
# Repeat the comparison with four buckets: quartiles (qcut) versus Jenks
# natural breaks (cut with computed edges).
four_bucket_labels = ['bucket_1', 'bucket_2', 'bucket_3', 'bucket_4']

df['quantilev2'] = pd.qcut(df['Total'], q=4, labels=four_bucket_labels)

# The class count is passed positionally: its keyword was renamed from
# `nb_class` to `n_classes` in newer jenkspy releases, and the positional
# form works under either name.
jenks_edges = jenkspy.jenks_breaks(df['Total'], 4)
df['cut_jenksv3'] = pd.cut(
    df['Total'],
    bins=jenks_edges,
    labels=four_bucket_labels,
    include_lowest=True)  # keep the lowest edge value inside the first bucket
df.sort_values(by='Total')
Out[11]:
In [ ]: