This notebook describes a simple algorithm to place realistic but synthetic cell tower sites in a country and to creat realistic flows between them based on the Gravity Model. I'm super interested in the utility of synthetic data and this idea came out of that. The algorithm was developed by myself in partner with Manuel Garcia-Herranz at UNICEF Innovation Unit
Shapefile data comes from GADM and population data comes from World Pop
In [98]:
import sys,csv
import matplotlib.pyplot as plt
import shapefile,pickle,random,re,collections
import numpy as np
import pandas as pd
import folium
from IPython.display import Image
import matplotlib
import matplotlib.patches
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
import matplotlib.path as mplPath
import seaborn as sns
%matplotlib inline
In [2]:
sns.set_context('notebook')
sns.set_style('white',rc={'grid':'black'})
%config InlineBackend.figure_format='svg'
In [3]:
def setFigSize(dim=(16.5,5.5)):
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(dim[0],dim[1])
In [4]:
sf = shapefile.Reader("BRA_adm0.shp")
shapes=sf.shapes()
In [5]:
paths=collections.defaultdict(list)
for nshp in xrange(len(shapes)):
pts = np.array(shapes[nshp].points)
prt = shapes[nshp].parts
par = list(prt) + [pts.shape[0]]
for pij in xrange(len(prt)):
paths[nshp].append(mplPath.Path(pts[par[pij]:par[pij+1]]))
The data from WorldPop is extremely high resolution, so high in fact that it makes sense to downsample it. The original pixels are roughly 100m^2, by aggregating 4x4 squares the file is 1/16 of the original. First go to WorldPop and grab the data (take care to get _ppp_ (persons per pixel) and not _pph_ (persons per hectare)).
Downsample the data using GDAL. This subsamples by taking the mean, meaning each pixel must later be multiplied by 16
gdal_translate -outsize 25% 25% BRA_ppp_v2b_2015.tif BRA_ppp_v2b_2015_25.tif
Then convert into plain text
gdal_translate -of AAIGrid BRA_pph_v2b_2015.tif out.asc
Then set no data values (outside of the country) to zero
sed '\s\-3.4028234663853e+38\0\g'
In [6]:
!head -n 7 BRA_ppp_25_nd.asc
In [7]:
headerFile=csv.reader(open('BRA_ppp_25_nd.asc','r'))
lines=[headerFile.next()[0] for i in range(7)]
numbers=re.findall(r'[0-9\.\-]{2,}',' '.join(lines))
words=re.findall(r'[a-zA-Z]{2,}',' '.join(lines))
In [8]:
nCols,nRows,x0,y0,xSize,ySize=map(float,numbers)
In [9]:
%time a=np.loadtxt('BRA_ppp_25_nd.asc',skiprows=7)
a*=16.
Set up some convenience functions to translate from matrix indices to latitude and longitude. The origin is defined as the top left corner in this case.
In [54]:
def getLatLong(xy):
x=xy[1]
y=xy[0]
return (x0+(x*xSize),y0+((nRows-y)*ySize))
In [55]:
def calculatePop(a,p1,p2):
res=np.sum(a[p1[0]:p2[0],p1[1]:p2[1]])
return res
In [56]:
def inCountry(tower):
return True
point,guess=testPoint(tower,None)
if not point==None:
return True
else:
return False
The placement of towers balances two objectives: all towers should cover as close as possible to a common value of people. If too many people use the tower it is over-capacity and put under a lot of strain, if too few then the tower wastes resources as it must be maintained and powered. This is an NP hard problem. It could be solved as an optimisation problem using an objective function that increases with the number of people covered by the tower configuration and decreases with the number of towers.
The high level logic of the algorithm is to take an array of populations, to recursively split the array into N parts until either (i) the population in those parts are small enough to be covered by a single tower or the parts are of a low population that a tower cannot be justified. The key parameters of the model are the range of each tower, the target population to be covered by each tower and into how many slices the array should be split.
In [57]:
def newTower(p1,p2):
newTower = (p1[0]+random.random()*(p2[0]-p1[0]),p1[1]+random.random()*(p2[1]-p1[1]))
while not inCountry(newTower):
newTower = (p1[0]+random.random()*(p2[0]-p1[0]),p1[1]+random.random()*(p2[1]-p1[1]))
return newTower
In [58]:
towerRange = 50
# How many pixels can a tower cover e.g. 400m=1 cell, towerRange=50 => 20km
pthr=20000
# How many people can be covered by tower (larger => fewer towers)
N=2
# How to split cells
minD=1
# This should max single pixel value in pop array
# Min distance between towers (lowest value can be 1)
In [59]:
def assignTowers(a,p1,p2,towers,populations,it,verbose=False):
if verbose:
print it
assert p1[0]<p2[0]
assert p1[1]<p2[1]
dP = (p2[0]-p1[0])
dPn = dP/N
pop = calculatePop(a,p1,p2)
if ((pop > pthr) and (dPn > minD)):
for i in range(N):
for j in range(1,N+1):
towers = assignTowers(a,(p1[0]+(i*dPn),p1[1]+((j-1)*dPn)),(p1[0]+((i+1)*dPn),p1[1]+(j*dPn)),\
towers,populations,it+1,verbose=False)
else:
numT = pop/pthr
for i in range(int(numT)):
towers.append(newTower(p1,p2))
populations.append(pop/float(numT))
numT=numT-1
if numT**10 > random.random():
# We want to aggresively reduce the probability that low populations get a tower
towers.append(newTower(p1,p2))
populations.append(pop*numT)
return towers
In [60]:
def getTowers(a):
towers = []
populations=[]
p1=(0,0)
while p1[0]<a.shape[0]:
p1=(p1[0],0)
while p1[1]<a.shape[1]:
towers=assignTowers(a,p1,(p1[0]+towerRange,p1[1]+towerRange),towers,populations,0,verbose=False)
p1=(p1[0],p1[1]+towerRange)
p1=(p1[0]+towerRange,p1[1])
return towers,populations
In [61]:
def testPoint(point,guess):
'''
Takes (x,y) tuple and returns index of voronoi, if hit is detected
'''
if guess:
for p in paths[guess]:
if p.contains_point(point):
return guess,guess
for nshp in xrange(len(shapes)):
for p in paths[nshp]:
if p.contains_point(point):
return nshp,nshp
return None,None
In [62]:
%time towersOut,populationOut=getTowers(a)
print 'Placed %d towers' % len(towersOut)
In [63]:
_=plt.hist(populationOut,log=False,bins=20)
_=plt.xlabel('Population Per Pixel')
_=plt.ylabel('Tower Count')
The population around the towers peaks around pthr
In [67]:
latsLongs=map(getLatLong,towersOut)
In [68]:
towerLatLongs=np.array(latsLongs)
A quick plot in a small region of towers along with their ranges on top of the raw population
In [69]:
plt.imshow(np.log(0.001+a))
plt.plot([t[1] for t in towersOut],[t[0] for t in towersOut],'.',alpha=0.25,markersize=towerRange*2,color='red')
cbar=plt.colorbar(ticks=map(np.log,[0.01,0.1,1,10,100,1000]))
_=cbar.ax.set_yticklabels([0.01,0.1,1,10,100,1000])
cbar.set_label(r'Population per Pixel ($\approx \bf{400m}^{2}$)')
plt.xlim(8000,8500)
plt.ylim(8000,8500)
plt.savefig('map_inset.png',dpi=300)
In [71]:
fig=plt.figure()
ax = fig.add_subplot(1,1,1)
plt.imshow(np.log(0.001+a))
plt.title('%d Towers, Max Dist %d pixels' %(len(towersOut),towerRange))
plt.plot([t[1] for t in towersOut],[t[0] for t in towersOut],'.',alpha=0.75,markersize=1.5,color='red')
cbar=plt.colorbar(ticks=map(np.log,[0.01,0.1,1,10,100]))
_=cbar.ax.set_yticklabels([0.01,0.1,1,10,100])
cbar.set_label(r'Population per Pixel ($\approx \bf{400m}^{2}$)')
plt.savefig('map.png',dpi=300)
In [72]:
realTowers=pd.read_csv('BTC_Sites_Details_2016-01-31.csv',sep=',')
In [73]:
realTowers.head()
Out[73]:
In [74]:
def plotOutline():
for nshp in xrange(len(shapes)):
pts = np.array(shapes[nshp].points)
prt = shapes[nshp].parts
par = list(prt) + [pts.shape[0]]
for pij in xrange(len(prt)):
paths[nshp].append(mplPath.Path(pts[par[pij]:par[pij+1]]))
plt.plot([p[0] for p in pts[par[pij]:par[pij+1]]],[p[1] for p in pts[par[pij]:par[pij+1]]],'-',color='grey')
In [76]:
plt.subplot(1, 2, 1)
plotOutline()
plt.plot(realTowers.LONGITUDE,realTowers.LATITUDE, '.',alpha=0.3,markersize=6,color='red')
plt.title('Real Towers (N=%d)' % realTowers.shape[0])
plt.xlim(sf.bbox[0],sf.bbox[1])
plt.ylim(sf.bbox[2],sf.bbox[3])
plt.subplot(1, 2, 2)
plotOutline()
plt.plot(towerLatLongs[:,0],towerLatLongs[:,1], '.',alpha=0.3,markersize=6,color='blue')
plt.title('Synthetic Towers (N=%d)' % towerLatLongs.shape[0])
plt.xlim(sf.bbox[0],sf.bbox[1])
plt.ylim(sf.bbox[2],sf.bbox[3])
Out[76]:
Mostly we get things right: most of the population is centered along the coast so it makes sense that most of the towers are there. We also capture Manaus in the Amazonas region in an otherwise unpulated and cell tower swathe. One area that is notably missing from the synthetic data is the road from Rio to Brasilia (the long red line in the ower half of the left hand figure). This area has low population but high mobility. An improved model would benefit from adding towers when situated between two important highly populated areas, even if they are in unpopulated areas themselves.
In [77]:
# From http://www.johndcook.com/blog/python_longitude_latitude/
earthRadius=6371.0
# KMs
import math
def distance(lat1, long1, lat2, long2):
'''
latitude is north-south
'''
# Convert latitude and longitude to
# spherical coordinates in radians.
degrees_to_radians = math.pi/180.0
# phi = 90 - latitude
phi1 = (90.0 - lat1)*degrees_to_radians
phi2 = (90.0 - lat2)*degrees_to_radians
# theta = longitude
theta1 = long1*degrees_to_radians
theta2 = long2*degrees_to_radians
cos = (math.sin(phi1)*math.sin(phi2)*math.cos(theta1 - theta2) +
math.cos(phi1)*math.cos(phi2))
arc = math.acos( cos )
return arc*earthRadius
In [78]:
d=np.zeros(shape=(len(towersOut),len(towersOut)))
longVals=towerLatLongs[:,0]
latVals=towerLatLongs[:,1]
for ni in range(len(longVals)):
for nj in range(len(latVals)):
if not ni==nj:
d[ni,nj]=distance(latVals[ni],longVals[ni],latVals[nj],longVals[nj])
In [79]:
def getGravityFlow(d,i,j,A=34.7,alpha1=0.269,alpha2=0.269,beta=1.09,pop=populationOut):
# Parameters taken from http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0128692
top=pop[i]**alpha1*pop[j]**alpha2
bottom=d**beta
return A*top/bottom
In [80]:
flow=np.zeros_like(a)
for ni in range(len(longVals)):
for nj in range(len(latVals)):
if not ni==nj:
flow[ni,nj]=getGravityFlow(d[ni,nj],ni,nj)
else:
flow[ni,nj]=populationOut[ni]
# We fill in the diagonal with the towers population
In [81]:
_=plt.hist(flow.flatten(),log=True,bins=20)
plt.xlabel('Flow (Number of People)')
plt.ylabel('Number of Edges')
Out[81]:
In [85]:
midPoint=[np.mean(sf.bbox[2:]),np.mean(sf.bbox[0:2])]
In [86]:
flowNormed=(flow-np.min(flow))/np.max(flow)
In [93]:
mapOut=folium.Map(location=midPoint,zoom_start=4)
for ni in range(len(latsLongs)-1):
for nj in range(ni+1,len(latsLongs)):
if flowNormed[ni,nj]>0.01 and d[ni,nj]>0:
mapOut.line([(latsLongs[ni][1],latsLongs[ni][0]),(latsLongs[nj][1],latsLongs[nj][0])],\
line_weight=3,line_opacity=flowNormed[ni,nj],line_color='black')
#print [(latsLongs[ni][1],latsLongs[ni][0]),(latsLongs[nj][1],latsLongs[nj][0])]
for coords,pop in zip(latsLongs,populationOut):
mapOut.circle_marker(location=[coords[1],coords[0]],popup='%d'% (pop),radius=3000,\
fill_color='red',fill_opacity=0.1,line_color=None)
In [94]:
mapOut
Out[94]:
In [97]:
Image('screenshot.png')
Out[97]:
In [ ]: