In this part, we aim to determine the shortest route that will allow the maintenance crew to fix all the various wells that are either in need of repair or not functional. Some factors which we may wish to consider:
i) Can we assign a higher priority to wells which are not functional as opposed to those that are merely not working?
ii) Can we take advantage of route information to concentrate on higher quality roads?
Initially we will ignore differences in location, height etc. and assume that all points on the map are equally accessible. To calculate the pairwise distance between points we need to take account of the fact that the Earth is a sphere so we need to use the Haversine formula:
$$\rm{haversin} \Big(\frac{d}{r}\Big) = \rm{haversin}(\phi_2 - \phi_1) + \rm{cos}(\phi_1)\,\,\rm{cos}(\phi_2)\,\,\rm{haversin}(\lambda_1 - \lambda_2)$$$$\rm{haversin}(\theta) = \rm{sin}^2\Big(\frac{\theta}{2}\Big) = \frac{1 - \rm{cos}(\theta)}{2} $$where $d$ is the distance between the two points, $r$ is the radius of the sphere (Earth), $\phi$ is the latitude and $\theta$ is the longitude. This can be rearranged to give the following formula as described in (R. W. Sinnott, "Virtues of the Haversine," Sky and Telescope, vol. 68, no. 2, 1984, p. 159):
$$\rm{dlon} = \rm{lon2} - \rm{lon1}$$ $$\rm{dlat} = \rm{lat2} - \rm{lat1}$$ $$\rm{a} = (\rm{sin}(\frac{dlat}{2}))^2 + cos(lat1) \times cos(lat2) \times (sin(\frac{dlon}{2}))^2$$ $$\rm{c} = 2 \times \rm{arctan}(\frac{\sqrt{a}}{\sqrt{1-a}})$$ $$\rm{d} = \rm{R} \times \rm{c}$$
In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import scipy.spatial
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import time
import seaborn as sns
from math import radians, sin, cos, asin, sqrt, pi, atan2
In [3]:
def getDistanceByHaversine(latitudes, longitudes):
'''Haversine formula - give coordinates as a 2D numpy array of
(lat_decimal,lon_decimal) pairs'''
# earth's mean radius = 6,371km
EARTHRADIUS = 6371.0
# create meshgrid:
lat, lon = np.meshgrid(latitudes, longitudes)
# convert to radians
lat *= np.pi / 180.0
lon *= np.pi / 180.0
# get transposed meshgrids for distances
lat_T = lat.T.copy()
lon_T = lon.T.copy()
dlon = lon_T - lon
dlat = lat_T - lat
a = (np.sin(dlat/2))**2 + np.cos(lat) * np.cos(lat_T) * (np.sin(dlon/2.0))**2
c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0-a))
km = EARTHRADIUS * c
return km
In [ ]:
In [4]:
from pandas import Series, DataFrame, Panel
#import data and labels
train_file = pd.read_csv('Waterpump-training-values.csv')
train_labels = pd.read_csv('Waterpump-training-labels.csv')
train_file['status_group'] = train_labels['status_group']
train_file = train_file[train_file['longitude'] > 1]
train_file = train_file[train_file['latitude'] < 0]
features = ['longitude','latitude','status_group']
trainLoc = train_file[features]
# pandas barplot
trainLoc['status_group'].value_counts().plot(kind='bar');
In [5]:
trainLoc = trainLoc[trainLoc['status_group'] != 'functional']
#trainLoc.head()
#subsetting data to just take 1% to make it easier to work with
np.random.seed(142)
test_idx = np.random.uniform(0, 1, len(trainLoc)) <= 0.01
trainLoc = trainLoc[test_idx==True]
In [6]:
x = np.array(trainLoc['longitude'])#.tolist()
y = np.array(trainLoc['latitude'])#.tolist()
#insert coordinates for Dar Es Salaam (39.2833 E 6.8000S)
x = np.insert(x, 0, 39.2833, 0)
y = np.insert(y, 0, -6.8000, 0)
DarEs = np.array([39.2833, -6.800])
#A = numpy.array((x, y, z), dtype=float)
#tmpx, tmpy = np.meshgrid(x,y)
cities = np.array((x, y), dtype=float)
#cities = np.array([tmpx, tmpy])
cities = np.reshape(cities, (2,-1)).T
#print cities.shape
#print cities
plt.scatter(cities[:,0], cities[:,1])
plt.scatter(DarEs[0],DarEs[1], s=50, color='red') #Highlight Dar Es Salaam on Map as HQ
Out[6]:
In [7]:
cities[:5]
Out[7]:
In [8]:
# the distance between two cities on a sphere is found using the Haversine formula
def get_distance(city1, city2):
'''Haversine formula - give coordinates as a 2D numpy array of
(lat_decimal,lon_decimal) pairs'''
#print city1[:], city2[:]
# earth's mean radius = 6,371km
EARTHRADIUS = 6371.0
# create meshgrid:
lat0, lon0 = city1[1], city1[0]
lat1, lon1 = city2[1], city2[0]
# convert to radians
lat0 *= np.pi / 180.0
lon0 *= np.pi / 180.0
lat1 *= np.pi / 180.0
lon1 *= np.pi / 180.0
# get transposed meshgrids for distances
#lat1_T = lat1.T.copy()
#lon1_T = lon1.T.copy()
#dlon = lon_T - lon
#dlat = lat_T - lat
dlon = lon1 - lon0
dlat = lat1 - lat0
a = (np.sin(dlat/2))**2 + np.cos(lat0) * np.cos(lat1) * (np.sin(dlon/2.0))**2
c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0-a))
km = EARTHRADIUS * c
return km
# the energy for the whole system corresponds to
# the total distance the salesman has to travel
def distance(pathcities):
distance = 0.
number_of_cities = pathcities.shape[0]
# loop over all cities
for j in xrange(number_of_cities):
if j == number_of_cities - 1: # FINAL POINT CONNECTS WITH THE FIRST ONE
distance += get_distance( pathcities[j,:], pathcities[0,:] )
else:
distance += get_distance( pathcities[j,:], pathcities[j+1,:] )
return distance
#get_distance(cities[0], cities[1])
In [9]:
# create a new path by swapping the connection between
# n_swaps cities randomly
def changepath(inputcities, n_swaps):
indices = range(1,inputcities.shape[0]) #Don't include starting city in swaps so that HQ is always the same
cities = inputcities.copy()
for i in range(n_swaps):
swappedCities = swapindex(cities)
cities=swappedCities.copy()
return cities
def swapindex(cities):
indices = range(cities.shape[0]) #Don't include starting city in swaps so that HQ is always the same
# take two random indices to swap
c1 = np.random.choice(indices[1:])
c2 = np.random.choice(indices[1:])
while c2 == c1:
c2 = np.random.choice(indices[1:])
# remember the cities to swap
tmp1 = cities[c1,:]
tmp2 = cities[c2,:]
# do the actual swapping
changedCities = cities.copy()
changedCities[c1,:] = tmp2
changedCities[c2,:] = tmp1
return changedCities
In [10]:
print DarEs[0], DarEs[1]
In [11]:
def plot_path(pcities):
plt.plot(pcities[:,0], pcities[:,1],'o')
plt.plot(pcities[:,0], pcities[:,1])
plt.scatter(DarEs[0],DarEs[1], s=250, color='red') #Highlight Dar Es Salaam on Map as HQ
plt.xlim(np.min(pcities[:,0])-1, np.max(pcities[:,0])+1)
plt.ylim(np.min(pcities[:,1])-1, np.max(pcities[:,1])+1)
In [12]:
# function for simulated annealing
# pathcities: array with our cities represented by their coordinates
# init_temp: initial temperature
# thermostat: linear factor to decrease the temperature
# ftol, itol, otol: tolerance values for stopping
# reannealing: schedule for reheating
def simulated_annealing( pathcities, init_temp, thermostat, ftol, itol, otol, reannealing):
# ===============
# SET THESE FOR DIAGNOSTICS
# ================
m=100000
distsize=otol+1
dist=[]
temperature = init_temp
N = pathcities.shape[0]
# number of accepted steps
it = 0
# DISTANCE HERE IS OUR ENERGY
prev_E = distance(pathcities)
# number of iterations
atp=0
didPlot = False
while it >=0:
#while otol < itol:
## NUMBER OF CORNERS IS L
L = np.max((np.floor(np.sqrt(temperature)).astype(int),1))
#print "L", L
#L = 2
propose_path = changepath(pathcities, L)
new_E = distance(propose_path)
deltaE = new_E -prev_E
if new_E < prev_E:
pathcities = propose_path
#dist[it] =new_E
dist.append(new_E)
prev_E = new_E
it = it+1
didPlot = False
elif np.random.rand() < np.exp( -deltaE/temperature):
pathcities = propose_path
#dist[it] =new_E
dist.append(new_E)
prev_E = new_E
it = it+1
didPlot = False
atp =atp +1; # NUMBER OF ITERATIONS
# check if it is time to cool down
if it % reannealing == 0:
temperature = thermostat * temperature;
#temperature = temperature/log(it);
compl_temp=0;
#if we get too cold, reheat
if temperature < 0.01:
temperature = 1
if False: #some optional plotting
if (it % 100 == 0) and not didPlot:
display.clear_output()
plt.plot( dist, '-r')
display.display(plt.gcf())
print len(dist)
print raw_input('Iteration: ' + np.str(atp))
plt.close()
didPlot = True
if len(dist)>m and np.std(dist[-m:])/np.mean(dist[-m:]) < ftol:
print 'ftol'
break
if atp >itol:
print 'itol'
break
if len(dist)> 0 and dist[-1] <= otol:
print 'otol'
print dist[-1]
break
s = pathcities
return s, dist
In [ ]:
In [ ]:
In [13]:
# simulated_annealing( pathcities, init_temp, thermostat, ftol, itol, otol, reannealing):
pcities, distances = simulated_annealing(cities, 10.0, 0.8, 0.01, 20000, 0, 10)
print len(distances)
In [14]:
plt.plot(distances)
Out[14]:
In [15]:
plt.subplot(2,1,1)
plot_path(cities)
plt.subplot(2,1,2)
plot_path(pcities)
In [16]:
#print distances[-1]
print cities[0:2], pcities[0:2]
1) Set Dar Es Salaam (39.2833 E 6.8000S) as starting/ending point. DONE
2) Set Maximum Distance that can be travelled in one day or try multiple maintenance crews. KIND OF DONE
3) Priortize pumps that have a small number of nearest neighbours. KIND OF DONE
4) Travelling Purchaser Problem where non-functioning pumps cost more to repair than 'needs repair'. NOT DONE
In [17]:
print pcities[pcities.shape[0]/2:pcities.shape[0]/2 + 10]
In [18]:
#create multiple maintenance crews by splitting annealed cities into 3, all leaving from same depot.
cities1 = pcities[:pcities.shape[0]/3+1]
cities2 = pcities[pcities.shape[0]/3+1:2*pcities.shape[0]/3+1]
cities3 = pcities[2*pcities.shape[0]/3+1:]
cities2 = np.insert(cities2, 0, [39.2833, -6.8000], 0)
cities3 = np.insert(cities3, 0, [39.2833, -6.8000], 0)
print cities1[0], cities2[0], cities3[0]
In [19]:
pcities1, distances1 = simulated_annealing(cities1, 10.0, 0.8, 0.01, 20000, 0, 10)
pcities2, distances2 = simulated_annealing(cities2, 10.0, 0.8, 0.01, 20000, 0, 10)
pcities3, distances3 = simulated_annealing(cities3, 10.0, 0.8, 0.01, 20000, 0, 10)
print "1: ", len(distances1), "2: ", len(distances2), "3: ", len(distances3)
In [20]:
#I would have expected to see greater segregation of cities into distinct regions but possibily density is too high
plt.subplot(4,1,1)
plot_path(pcities)
plt.subplot(4,1,2)
plot_path(pcities1)
plt.subplot(4,1,3)
plot_path(pcities2)
plt.subplot(4,1,4)
plot_path(pcities3)
In [21]:
print distances[-1]
print distances1[-1] + distances2[-1] + distances3[-1]
In [22]:
#attempt to make comparison between sim anneal and genetic algorithm
start_time = time.clock()
p_big_mutate = 0.05
p_small_mutate = 0.4
fitness_scale=-0.5
pop_size=100
generations = 10**4
std_big = 1
std_small= 0.05
#def ras_fitness(g):
# ans = 20+g[:,0]**2+g[:,1]**2-10.0*(np.cos(2*np.pi*g[:,0])+np.cos(2*np.pi*g[:,1]))
# return ans**fitness_scale
def distance(pathcities):
distance = 0.
number_of_cities = pathcities.shape[0]
# loop over all cities
for j in xrange(number_of_cities):
if j == number_of_cities - 1: # FINAL POINT CONNECTS WITH THE FIRST ONE
distance += get_distance( pathcities[j,:], pathcities[0,:] )
else:
distance += get_distance( pathcities[j,:], pathcities[j+1,:] )
return distance**fitness_scale
def transform(population_orig):
# select two individuals for recombination
population =population_orig.copy()
indices = range(pop_size)
np.random.shuffle(indices)
temp = population[indices[0],1]
population[indices[0],1] = population[indices[1],1]
population[indices[1],1] = temp
#perform mutation
for i in range(pop_size):
if np.random.rand() < p_big_mutate:
population[i,0] = population[i,0]+std_big*np.random.randn()
if np.random.rand()<p_small_mutate:
population[i,0] = population[i,0]+std_small*np.random.randn()
if np.random.rand()<p_big_mutate:
population[i,1] = population[i,1]+std_big*np.random.randn()
if np.random.rand()<p_small_mutate:
population[i,1] = population[i,1]+std_small*np.random.randn()
return population
#generates initial population
mean=[100,100]
cov=[[9,0],[0,9]]
#g_0 = np.random.multivariate_normal(mean,cov,pop_size)
g_0 = cities[:100]
generation_fitness = np.zeros(generations)
#put placeholder for optimal solution
optimal_sol = [-100,-100]
g_curr=g_0
for z in range(generations):
if not z==0:
g_curr = transform(g_curr)
fit_curr = distance(g_curr)
generation_fitness[z] = fit_curr.max()
if z==0:
optimal_sol = g_curr[np.argmax(fit_curr),:]
elif generation_fitness[z]>generation_fitness[z-1]:
optimal_sol = g_curr[np.argmax(fit_curr),:]
marg_fit = fit_curr.cumsum()/fit_curr.sum()
r=np.random.rand(pop_size)
counts=np.zeros(pop_size)
for i in range(pop_size):
counts[i] = np.sum(marg_fit<=r[i])
child_counts = counts
g_new = []
for i in range(pop_size):
g_new.append(g_curr[child_counts[i],:])
g_curr=np.array(g_new)
end_time = time.clock()
In [23]:
def cartesian_matrix(coords):
'''create a distance matrix for the city coords
that uses straight line distance'''
matrix={}
for i,(x1,y1) in enumerate(coords):
for j,(x2,y2) in enumerate(coords):
dx,dy=x1-x2,y1-y2
dist=sqrt(dx*dx + dy*dy)
matrix[i,j]=dist
return matrix
matrix = cartesian_matrix(pcities)
#print matrix
In [24]:
print optimal_sol
print end_time - start_time
plt.plot(generation_fitness)
plt.show()
Here we try to develop a weighting to encourage the maintenance crews to visit the most remote pumps first. We identify remoteness by performing KDTree analysis of the average distance between the 5 nearest neighbours. We then add this as a weight to the distance function that the SA uses to optimise the routes. Currently the implementation means that the distance reported are no longer true but the order will be reflective of this new priority and leads to longer routes as the crews no longer move to the optimal step as determined by simple distance.
In [76]:
kdt = scipy.spatial.cKDTree(pcities1)
k = 5 # number of nearest neighbors
dists, neighs = kdt.query(pcities1, k+1)
avg_dists = np.mean(dists[:, 1:], axis=1)
In [78]:
#avg_dists[:10]
#np.concatenate((a, b.T), axis=1)
avg_dists = avg_dists.reshape((pcities1.shape[0],1))
#avg_dists.shape
cities1 = np.concatenate((pcities1, avg_dists), axis=1)
print cities1.shape
In [79]:
avg_dists.sort()
plt.hist(avg_dists)
Out[79]:
In [80]:
def get_distance(city1, city2):
'''Haversine formula - give coordinates as a 2D numpy array of
(lat_decimal,lon_decimal) pairs'''
#print city1[:], city2[:]
# earth's mean radius = 6,371km
EARTHRADIUS = 6371.0
# retrieve coords:
lat0, lon0 = city1[1], city1[0]
lat1, lon1 = city2[1], city2[0]
# convert to radians
lat0 *= np.pi / 180.0
lon0 *= np.pi / 180.0
lat1 *= np.pi / 180.0
lon1 *= np.pi / 180.0
# get transposed meshgrids for distances
#lat1_T = lat1.T.copy()
#lon1_T = lon1.T.copy()
#dlon = lon_T - lon
#dlat = lat_T - lat
dlon = lon1 - lon0
dlat = lat1 - lat0
a = (np.sin(dlat/2))**2 + np.cos(lat0) * np.cos(lat1) * (np.sin(dlon/2.0))**2
c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0-a))
#add weight to encourage visits to remote pumps (i.e. higher avg_dist for kdt)
w = 1.0/(city1[2]*city2[2])
#print "w: ", w
km = EARTHRADIUS * c * w
return km
# the energy for the whole system corresponds to
# the total distance the salesman has to travel
def distance(pathcities):
distance = 0.
number_of_cities = pathcities.shape[0]
# loop over all cities
for j in xrange(number_of_cities):
if j == number_of_cities - 1: # FINAL POINT CONNECTS WITH THE FIRST ONE
distance += get_distance( pathcities[j,:], pathcities[0,:] )
else:
distance += get_distance( pathcities[j,:], pathcities[j+1,:] )
return distance
#get_distance(cities[0],cities[1])
In [81]:
cities1.shape
Out[81]:
In [88]:
pcities4, distances4 = simulated_annealing(cities1, 10.0, 0.8, 0.01, 20000, 0, 10)
print len(distances4)
In [89]:
plt.plot(distances)
#print distances[-1]
Out[89]:
In [90]:
plt.subplot(2,1,1)
plot_path(pcities1)
plt.subplot(2,1,2)
plot_path(pcities4)
In [91]:
print "distance optimised cities: ", pcities1[:10], "remote prioritized cities: ", pcities4[:10]
In [ ]:
In [ ]:
In [ ]: