In [354]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
import json
plt.rcParams["figure.figsize"] = (10,4)
colors = ['r','g','b']
def inverse(f, domain=(0.0, 1.0-1e-6), extrapolate=(float("NaN"), float("NaN"))):
def inner(x):
if f(domain[0]) >= x:
return extrapolate[0]
if f(domain[1]) <= x:
return extrapolate[1]
else:
try:
return scipy.optimize.brentq(lambda y: f(y)-x, a=domain[0], b=domain[1])
except ValueError:
return float("NaN")
return np.vectorize(inner)
import scipy.misc
def derivative(f, dx=1e-6):
return np.vectorize(lambda x: scipy.misc.derivative(f, x, dx))
def integral(f, lower=0.0):
return np.vectorize(lambda x: scipy.integrate.quad(f, lower, x)[0])
Piecewise linear functions (aka linear splines) had the advantage of being easy to fit and simple to work with. There disadvantage was that having a discontinuous first derivative (ie. they are $C^0$ functions), so that the the CDF derived from the Lorenz curve would be discontinuous, implying a discrete distribution (not a good match for income).
Ideally, we would like to fit the Lorenz curve with a $C^2$ function, a function that is twice continuously differentiable, so that the CDF (effectively a first derivative) would be smooth, and the PDF continuous (or perhaps even $C^3$, resulting in a smooth PDF).
One very common $C^2$ interpolant is cubic splines.
In [342]:
# Choose our lorenz curve. India is:
# with open("../jsoncache/IND_5_2011.5_0.json", "r") as f:
# Good ones
# IND_2_1977.5.json
# MYS_3_1997.json
# CHN_1_1999.json
# JAM_3_1988.json
with open("../jsoncache/CHL_3_2003.json", "r") as f:
d = json.loads(f.read())
L = [0.0] + d['lorenz']['L']
p = [0.0] + d['lorenz']['p']
if 'sample' in d:
ymean = d['sample']['mean_month_ppp'] # NOT annual income is more relatable
ymin = d['sample']['month_min']
ymax = d['sample']['month_max']
if not ymean or ymean == float("NaN"):
ymean = ymin + 0.25*(ymax - ymin) # guess with some skew
else:
ymean = d['inputs']['mean_month_ppp']
ymin = 0
ymax = ymean * 4
print("Mean", ymean, "(",ymin,ymax,")")
# Down sample the Lorenz curve, for smoothness
#pc = p
#Lc = L
if False and len(p) > 40:
print ("Down sampling Lorenz curve...")
pc = p[0:100:4]
Lc = L[0:100:4]
else:
pc = p
Lc = L
if Lc[len(Lc)-1] < 1:
pc = pc + [1.0]
Lc = Lc + [1.0]
#pc = p[0:100:5]
#Lc = L[0:100:5]
#pc = [0.0] + [1.0/N] + pc + [] + [1.0]
#Lc = [0.0] + [(ymin-0.0) * (1.0/N - 0.0) / ymean] + Lc + [] + [1.0]
# Check that this is a valid convex function - if we're working from bad data we'll go astray
#Lc = Lc[1:]
#pc = pc[1:]
if not np.all(np.diff(Lc) > 0):
print ("Lorenz curve is not increasing")
print (np.diff(Lc))
if not np.all(np.diff(np.diff(Lc)/np.diff(pc)) > 0):
print ("Lorenz curve is not convex")
print (np.diff(np.diff(Lc)/np.diff(pc)))
plt.plot(pc, Lc, ".")
Out[342]:
First we attempt a simple cubic spline interpolation of the Lorenz curve. The sharp turning points result because the cubic spline is not $C^3$. Another issue is that the space of polynmonial splines is not closed under inversion, so that the CDF is not a polynomial spline, and nor is the PDF. He we use numerical methods, but the result is that for fast visualization we will either rely on precomputation (resulting in a memory intensive grid of data points) or on-the-fly numerical methods (computationally intensive). One way to avoid this is to interpolate not points on the Lorenz curve, but points on the CDF itself. We'll come back to this.
In [343]:
##########################################
plt.rcParams["figure.figsize"] = (12,2.5)
fig, ax = plt.subplots(1, 4)
##########################################
import scipy.interpolate
cubic_lorenz = scipy.interpolate.CubicSpline(pc, Lc, extrapolate=0)
x = np.linspace(0.0, 1.0, 1000)
y = np.linspace(0.0, ymax, 1000)
ax[0].plot(pc, Lc, ".")
ax[0].plot(x, cubic_lorenz(x))
cubic_quantile = lambda p: ymean * cubic_lorenz.derivative()(p)
ax[1].plot(x, cubic_quantile(x))
cubic_cdf = inverse(cubic_quantile)
ax[2].plot(y, cubic_cdf(y))
cubic_pdf = derivative(cubic_cdf)
ax[3].plot(y, cubic_pdf(y))
Out[343]:
In [344]:
plt.plot(y, cubic_pdf(y))
Out[344]:
In [333]:
##########################################
plt.rcParams["figure.figsize"] = (12,2.5)
fig, ax = plt.subplots(1, 4)
##########################################
import scipy.interpolate
kspline_lorenz = scipy.interpolate.UnivariateSpline(pc, Lc, k=5, s=len(pc)/8.5e9)
x = np.linspace(0.0, 1.0, 1000)
y = np.linspace(0.0, ymax, 1000)
ax[0].plot(pc, Lc, ".")
ax[0].plot(x, kspline_lorenz(x))
kspline_quantile = lambda p: ymean * kspline_lorenz.derivative()(p)
ax[1].plot(x, kspline_quantile(x))
kspline_cdf = inverse(kspline_quantile)
ax[2].plot(y, kspline_cdf(y))
kspline_pdf = derivative(kspline_cdf)
ax[3].plot(y, kspline_pdf(y))
Out[333]:
In [334]:
y = np.linspace(0, ymax, 1000)
pdf = kspline_pdf(y)
plt.plot(y, pdf)
plt.vlines(x=d['inputs']['line_month_ppp'],ymin=0,ymax=np.nanmax(pdf))
Out[334]:
In [335]:
import scipy.integrate
a = d['quadratic']['reg']['params']['A']['coef']
b = d['quadratic']['reg']['params']['B']['coef']
c = d['quadratic']['reg']['params']['C']['coef']
mu = ymean
nu = -b * mu / 2
tau = mu * (4 * a - b**2) ** (1/2) / 2
eta1 = 2 * (c / (a + b + c + 1) + b/2) * (4 *a - b**2)**(-1/2)
eta2 = 2 * ((2*a + b + c)/(a + c - 1) + b / a)*(4*a - b**2)**(-1/2)
lower = tau*eta1+nu
upper = tau*eta2+nu
# Hacky way to normalise
gq_pdf_integral = 1
gq_pdf = lambda y: (1 + ((y - nu)/tau)**2)**(-3/2) / gq_pdf_integral
gq_pdf_integral = scipy.integrate.quad(gq_pdf,lower,upper)[0]
plt.plot(y, gq_pdf(y),color='r')
plt.plot(y, kspline_pdf(y))
plt.vlines(x=lower,ymin=0,ymax=np.nanmax(pdf),color="green")
#plt.vlines(x=upper,ymin=0,ymax=np.nanmax(pdf),color="green")
plt.vlines(x=d['inputs']['line_month_ppp'],ymin=0,ymax=np.nanmax(pdf))
In [336]:
print("GQ bounds",lower,upper)
povline = d['inputs']['line_month_ppp']
print("Poverty line", povline)
print("HC", d['dist']['HC'])
btl_gq = scipy.integrate.quad(gq_pdf,lower,povline)[0]
print("HC est GQ", btl_gq)
btl_kspline = kspline_cdf(povline)
print("HC est Kspline", btl_kspline)
In [337]:
##########################################
plt.rcParams["figure.figsize"] = (12,2.5)
fig, ax = plt.subplots(1, 4)
##########################################
pstar = pc
Lstar = Lc
# Left tail, minimum
addon = [-0.002,-0.001]
pstar = addon+pstar
Lstar = [a*ymin/ymean for a in addon]+Lstar
# Right tail, maximum
addon = [1.001,1.002]
pstar = pstar+addon
Lstar = Lstar+[1+(a-1)*ymax/ymean for a in addon]
spline = scipy.interpolate.UnivariateSpline(pstar,Lstar,k=5,s=1e-8)
print("Derivative at 0:", derivative(spline)(0))
print("Derivative at 1:", derivative(spline)(1))
x = np.linspace(0, 1.0, 1000)
y = np.linspace(0, 200, 1000)
ax[0].scatter(p, L)
ax[0].plot(x, spline(x))
spline_quantile = lambda p: ymean * spline.derivative()(p)
ax[1].plot(x, spline_quantile(x))
spline_cdf = inverse(spline_quantile)
ax[2].plot(y, spline_cdf(y))
spline_pdf = derivative(spline_cdf)
ax[3].plot(y, spline_pdf(y))
Out[337]:
In [338]:
print(ymin/ymean)
print(ymax/ymean)
In [364]:
##########################################
plt.rcParams["figure.figsize"] = (12,2.5)
fig, ax = plt.subplots(1, 4)
##########################################
import math
G = 0.33
mean = 500
sigma = scipy.stats.norm.ppf((G + 1)/2, 0, 1) * math.sqrt(2);
mu = math.log(mean) - 0.5 * sigma ** 2;
lnpdf = lambda y: scipy.stats.lognorm.pdf(y, sigma, scale=math.exp(mu))
lncdf = lambda y: scipy.stats.lognorm.cdf(y, sigma, scale=math.exp(mu))
lnquantile = lambda p: scipy.stats.lognorm.ppf(p, sigma, scale=math.exp(mu))
lnlorenz = lambda p: integral(lnquantile)(p) / mean
p = np.linspace(0, 1, 200)
L = lnlorenz(p)
spline = scipy.interpolate.UnivariateSpline(p,L,k=5,s=1e-8)
x = np.linspace(0, 1.0, 1000)
y = np.linspace(0, 1000, 1000)
ax[0].scatter(p, L)
ax[0].plot(x, spline(x))
spline_quantile = lambda p: ymean * spline.derivative()(p)
ax[1].plot(x, spline_quantile(x))
spline_cdf = inverse(spline_quantile)
ax[2].plot(y, spline_cdf(y))
spline_pdf = derivative(spline_cdf)
ax[3].plot(y, spline_pdf(y))
ax[3].plot(y, lnpdf(y), color="r")
Out[364]:
In [359]:
Out[359]:
Since our main objects in visualization are the CDF and PDF, we might try instead intepolating CDF points directly. Then, since the PDF is a simple first derivative, it will be easy to get from a $k$th degree polynomial spline representation of the CDF to a $(k-1)$th degree polynomial for the PDF.
As noted earlier, many Lorenz cuves (and hence CDFs) are consistent with a given sample of Lorenz curve points. Moreover, a given Lorenz curve point does not map uniquely to a CDF point. A Lorenz curve effectively defines a mean income for a sequence of quantile bands in the distribution. The mean value theorem suggests that some quantile within that band must have the mean income (assuming continuity), but we have no information of which quantile that will be.
Thus, we have to make an assumption. We choose to assign the mean income for each quantile band to the mid-point of that band. For a sequence of $N$ $(p, L)$ pairs sampled from the Lorenz curve, our procedure will give $N-1$ $(p, y)$ pairs sampled from a hypothetical CDF.
We also assume that the CDF starts at (0, 0) - that is no person has an income $<= 0$. (This may not be reasonable for a given data set.)
In [160]:
##########################################
plt.rcParams["figure.figsize"] = (12,5)
fig, ax = plt.subplots(1, 2)
##########################################
# The y values are simply the mean-scaled derivatives of the Lorenz curve
dL = np.diff(Lc)
dp = np.diff(pc)
y = ymean * dL/dp
y = np.hstack((0.0, y))
# And we arbitrarily assign these y values to the mid-points of the p values
pmid = np.add(pc[1:],pc[:-1])/2
pmid = np.hstack((0.0, pmid))
# Then calculate the direct interpolation of the CDF points
ccubic_cdf = scipy.interpolate.CubicSpline(y, pmid, bc_type=((2, 0.0), (2, 0.0)), extrapolate=0)
#ccubic_min = inverse(ccubic_cdf, (0, 500))(0.0)
ygrid = np.linspace(0, 500.0, 1000)
ax[0].plot(y, pmid, ".")
ax[0].plot(ygrid, ccubic_cdf(ygrid))
ccubic_pdf = derivative(ccubic_cdf)
ax[1].plot(ygrid, ccubic_pdf(ygrid))
Out[160]:
An obvious issue with cubic splines - based on cubics - is that they are not constrained to be monotonic. We can see that above in the "CDF", which turns downward at both tails and is thus ill-formed. It's even more obvious in the "PDF", which is negative in the tails.
The PCHIP interpolator is a variant on cubic splines that does guarantee monotonicity. However, to do so, it sacrifices a continuous second derivative. This means that the PDF will not be smooth. It works, and is at least well-formed, but it's not very attractive.
In [161]:
##########################################
plt.rcParams["figure.figsize"] = (12,5)
fig, ax = plt.subplots(1, 2)
##########################################
# Then calculate the direct interpolation of the CDF points
cpchip_cdf = scipy.interpolate.PchipInterpolator(y, pmid, extrapolate=0)
ax[0].plot(y, pmid, ".")
ax[0].plot(ygrid, cpchip_cdf(ygrid))
cpchip_pdf = derivative(cpchip_cdf)
ax[1].plot(ygrid, cpchip_pdf(ygrid))
Out[161]:
Since the relatively flat tails seem to be the cause of most non-monotonicity with cubic spline interpolation, we can try instead to replace the tails with a Pareto tails, which naturally supports this asymptotic flattening. The advantage of retaining the cubic spline in the left and central parts of the distribution is that we can easily preserve some of the detail of distribution, rather than forcing it to fit a smooth parametric form.
The pareto distribution has two parameters, and we will impose two conditions, namely
In addition we will vertically compress the pareto CDF so that it continues from the cubic spline.
In [162]:
##########################################
plt.rcParams["figure.figsize"] = (12,4)
fig, ax = plt.subplots(1, 3)
##########################################
### F(x) = 1 - (x/xm)^a => solve for xm and a
#pmid = pmid[1:]
#y = y[1:]
p1 = pmid[-2]
y1 = y[-2]
p2 = pmid[-1]
y2 = y[-1]
#alpha = (np.log(1-p2) - np.log(1-p1)) / (np.log(y1) - np.log(y2))
#ym = np.exp(np.log(y2) + (1/alpha)*np.log(1-p2))
#yshift = 0
def pareto_cdf(y, ym, alpha, yshift):
return 1 - (ym / (y - yshift))**alpha
def pareto_pdf(y, ym, alpha, yshift):
return alpha * ym**alpha / (y - yshift)**(alpha+1)
def pareto_pdfderiv(y, ym, alpha, yshift):
return -alpha*(alpha+1) * ym**alpha / (y - yshift)**(alpha+2)
def target(parameters):
dd1, ym, alpha, yshift = parameters
dd1 = dd1/100
yshift = yshift*100
ccubpar_cdf = scipy.interpolate.CubicSpline(y[:-1], pmid[:-1], bc_type=((1,0),(2,dd1)),extrapolate=0)
ccubpar_pdf = ccubpar_cdf.derivative()
err1 = pareto_cdf(y1, ym, alpha, yshift) - p1
err1d = pareto_pdf(y1, ym, alpha, yshift) - ccubpar_pdf(y1)
err1dd = pareto_pdfderiv(y1, ym, alpha, yshift) - ccubpar_pdf.derivative()(y1)
#err1dd = 0
err2 = pareto_cdf(y2, ym, alpha, yshift) - p2
return err1**2 + (err1d*20)**2 + (err1dd*100)**2 + err2**2
from scipy.optimize import basinhopping
#guess = (-0.000015, 2.85446391657, 0.783499652638, 135.703869627)
guess = (-1, 1, 1, 1)
dd1, ym, alpha, yshift = basinhopping(target, guess).x
dd1 = dd1/100
yshift = yshift*100
print(dd1, ym, alpha, yshift)
ccubpar_cdf = scipy.interpolate.CubicSpline(y[:-1], pmid[:-1], bc_type=((1,0),(2,dd1)),extrapolate=0)
ccubpar_pdf = ccubpar_cdf.derivative()
d1 = ccubpar_pdf(y1)
dd1 = ccubpar_pdf.derivative()(y1)
xgrid = np.linspace(0.0, 1.0, 1000)
ygrid = np.linspace(0.0, 1000.0, 1000)
ax[0].plot(y, pmid, ".")
ax[0].plot(ygrid, ccubpar_cdf(ygrid), 'b-')
ax[1].plot(ygrid, ccubpar_pdf(ygrid), 'b-')
ax[0].plot(ygrid, pareto_cdf(ygrid, ym, alpha, yshift), 'r-')
ax[0].set_xlim((0,1000))
ax[0].set_ylim((0, 1.0))
ygrid2 = np.linspace(y1, 1000, 1000)
ax[1].plot(ygrid2, pareto_pdf(ygrid2, ym, alpha, yshift), 'r-')
final_pdf = np.vectorize(lambda z: 0.0 if z < y[0] else (ccubpar_pdf(z) if z < y1 else pareto_pdf(z, ym, alpha, yshift)))
ax[2].plot(ygrid, final_pdf(ygrid))
import scipy.integrate
print(scipy.integrate.quad(final_pdf, 0, 1e3))[0]
In [163]:
ygrid1 = np.linspace(y1-10, y1, 500)
ygrid2 = np.linspace(y1, y1+10, 500)
plt.plot(ygrid1, ccubpar_pdf(ygrid1), 'b-')
plt.plot(ygrid2, pareto_pdf(ygrid2, ym, alpha, yshift), 'r-')
Out[163]:
In [164]:
##########################################
plt.rcParams["figure.figsize"] = (12,6)
##########################################
ygrid = np.linspace(0, 400, 2000)
plt.plot(ygrid, final_pdf(ygrid), 'b-')
Out[164]:
In [165]:
from scipy.integrate import quad
final_cdf = lambda y: quad(final_pdf, 0, y)[0]
final_quantile = inverse(final_cdf, domain=(0, 1e6))
final_mean = quad(lambda y: y * final_pdf(y), 0, 1e6)[0]
final_lorenz = np.vectorize(lambda p: quad(lambda y: y * final_pdf(y), 0, final_quantile(p))[0] / ymean)
final_lorenz(0.5) # should be 0.27
Out[165]:
In [166]:
final_L = final_lorenz(np.asarray(pc[1:-1]))
final_L = np.hstack((0.0, final_L, 1.0))
In [167]:
plt.plot(pc, final_L,'r-')
plt.plot(pc, Lc,'g.')
Out[167]:
In [ ]: