Optimising Maintenance Schedule

In this part, we aim to determine the shortest route that will allow the maintenance crew to fix all the various wells that are either in need of repair or not functional. Some factors which we may wish to consider:

i) Can we assign a higher priority to wells which are not functional as opposed to those that are merely not working?

ii) Can we take advantage of route information to concentrate on higher quality roads?

Initially we will ignore differences in location, height etc. and assume that all points on the map are equally accessible. To calculate the pairwise distance between points we need to take account of the fact that the Earth is a sphere so we need to use the Haversine formula:

$$\rm{haversin} \Big(\frac{d}{r}\Big) = \rm{haversin}(\phi_2 - \phi_1) + \rm{cos}(\phi_1)\,\,\rm{cos}(\phi_2)\,\,\rm{haversin}(\lambda_1 - \lambda_2)$$$$\rm{haversin}(\theta) = \rm{sin}^2\Big(\frac{\theta}{2}\Big) = \frac{1 - \rm{cos}(\theta)}{2} $$

where $d$ is the distance between the two points, $r$ is the radius of the sphere (Earth), $\phi$ is the latitude and $\theta$ is the longitude. This can be rearranged to give the following formula as described in (R. W. Sinnott, "Virtues of the Haversine," Sky and Telescope, vol. 68, no. 2, 1984, p. 159):

$$\rm{dlon} = \rm{lon2} - \rm{lon1}$$ $$\rm{dlat} = \rm{lat2} - \rm{lat1}$$ $$\rm{a} = (\rm{sin}(\frac{dlat}{2}))^2 + cos(lat1) \times cos(lat2) \times (sin(\frac{dlon}{2}))^2$$ $$\rm{c} = 2 \times \rm{arctan}(\frac{\sqrt{a}}{\sqrt{1-a}})$$ $$\rm{d} = \rm{R} \times \rm{c}$$



In [2]:

    
%matplotlib inline
import numpy as np
import scipy as sp
import scipy.spatial
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import time
import seaborn as sns
from math import radians, sin, cos, asin, sqrt, pi, atan2



In [3]:

    
def getDistanceByHaversine(latitudes, longitudes):
    '''Haversine formula - give coordinates as a 2D numpy array of
    (lat_decimal,lon_decimal) pairs'''
    
    # earth's mean radius = 6,371km
    EARTHRADIUS = 6371.0
    
    # create meshgrid:
    lat, lon = np.meshgrid(latitudes, longitudes)
    
    # convert to radians
    lat *= np.pi / 180.0
    lon *= np.pi / 180.0
    
    # get transposed meshgrids for distances
    lat_T = lat.T.copy()
    lon_T = lon.T.copy()

    dlon = lon_T - lon
    dlat = lat_T - lat
    
    a = (np.sin(dlat/2))**2 + np.cos(lat) * np.cos(lat_T) * (np.sin(dlon/2.0))**2
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0-a))
    km = EARTHRADIUS * c
    return km



In [ ]:



In [4]:

    
from pandas import Series, DataFrame, Panel

#import data and labels
train_file = pd.read_csv('Waterpump-training-values.csv')
train_labels = pd.read_csv('Waterpump-training-labels.csv')

train_file['status_group'] = train_labels['status_group']
train_file = train_file[train_file['longitude'] > 1]
train_file = train_file[train_file['latitude'] < 0]

features = ['longitude','latitude','status_group']
trainLoc = train_file[features]


# pandas barplot
trainLoc['status_group'].value_counts().plot(kind='bar');



In [5]:

    
trainLoc = trainLoc[trainLoc['status_group'] != 'functional']
#trainLoc.head()

#subsetting data to just take 1% to make it easier to work with
np.random.seed(142)
test_idx = np.random.uniform(0, 1, len(trainLoc)) <= 0.01
trainLoc = trainLoc[test_idx==True]



In [6]:

    
x = np.array(trainLoc['longitude'])#.tolist()
y = np.array(trainLoc['latitude'])#.tolist()

#insert coordinates for Dar Es Salaam (39.2833 E 6.8000S)
x = np.insert(x, 0, 39.2833, 0)
y = np.insert(y, 0, -6.8000, 0)
DarEs = np.array([39.2833, -6.800])

#A = numpy.array((x, y, z), dtype=float)
#tmpx, tmpy =  np.meshgrid(x,y)

cities = np.array((x, y), dtype=float)
#cities = np.array([tmpx, tmpy])
cities = np.reshape(cities, (2,-1)).T
#print cities.shape
#print cities


plt.scatter(cities[:,0], cities[:,1])
plt.scatter(DarEs[0],DarEs[1], s=50, color='red') #Highlight Dar Es Salaam on Map as HQ









    Out[6]:





<matplotlib.collections.PathCollection at 0x10a201fd0>



In [7]:

    
cities[:5]









    Out[7]:





array([[ 39.2833    ,  -6.8       ],
       [ 30.12668109,  -4.23637808],
       [ 40.30198716, -10.32095944],
       [ 34.4797207 ,  -9.72147456],
       [ 31.51069241,  -7.83669775]])



In [8]:

    
# the distance between two cities on a sphere is found using the Haversine formula

def get_distance(city1, city2):

    '''Haversine formula - give coordinates as a 2D numpy array of
    (lat_decimal,lon_decimal) pairs'''
    #print city1[:], city2[:]
    # earth's mean radius = 6,371km
    EARTHRADIUS = 6371.0
    
    # create meshgrid:
    lat0, lon0 = city1[1], city1[0]
    lat1, lon1 = city2[1], city2[0]
    
    # convert to radians
    lat0 *= np.pi / 180.0
    lon0 *= np.pi / 180.0
    lat1 *= np.pi / 180.0
    lon1 *= np.pi / 180.0
    
    # get transposed meshgrids for distances
    #lat1_T = lat1.T.copy()
    #lon1_T = lon1.T.copy()

    #dlon = lon_T - lon
    #dlat = lat_T - lat
    dlon = lon1 - lon0
    dlat = lat1 - lat0
    
    a = (np.sin(dlat/2))**2 + np.cos(lat0) * np.cos(lat1) * (np.sin(dlon/2.0))**2
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0-a))
    km = EARTHRADIUS * c
    return km

# the energy for the whole system corresponds to
# the total distance the salesman has to travel
def distance(pathcities):
    distance = 0.
    number_of_cities = pathcities.shape[0]
    
    # loop over all cities
    for j in xrange(number_of_cities):
        if j == number_of_cities - 1: # FINAL POINT CONNECTS WITH THE FIRST ONE
            distance += get_distance( pathcities[j,:], pathcities[0,:] )
        else:
            distance += get_distance( pathcities[j,:], pathcities[j+1,:] )
    return distance

#get_distance(cities[0], cities[1])



In [9]:

    
# create a new path by swapping the connection between
# n_swaps cities randomly
def changepath(inputcities, n_swaps):
    indices = range(1,inputcities.shape[0]) #Don't include starting city in swaps so that HQ is always the same
    cities = inputcities.copy()
    for i in range(n_swaps):
        swappedCities = swapindex(cities)
        cities=swappedCities.copy()

    return cities

def swapindex(cities):
    indices = range(cities.shape[0]) #Don't include starting city in swaps so that HQ is always the same
    # take two random indices to swap
    c1 = np.random.choice(indices[1:])
    c2 = np.random.choice(indices[1:])
    
    while c2 == c1:
        c2 = np.random.choice(indices[1:]) 
    
    # remember the cities to swap
    tmp1 = cities[c1,:]
    tmp2 = cities[c2,:]
    
    # do the actual swapping
    changedCities = cities.copy()
    changedCities[c1,:] = tmp2
    changedCities[c2,:] = tmp1
    
    return changedCities



In [10]:

    
print DarEs[0], DarEs[1]









    



39.2833 -6.8



In [11]:

    
def plot_path(pcities):
    plt.plot(pcities[:,0], pcities[:,1],'o')
    plt.plot(pcities[:,0], pcities[:,1])
    plt.scatter(DarEs[0],DarEs[1], s=250, color='red') #Highlight Dar Es Salaam on Map as HQ
    plt.xlim(np.min(pcities[:,0])-1, np.max(pcities[:,0])+1)
    plt.ylim(np.min(pcities[:,1])-1, np.max(pcities[:,1])+1)



In [12]:

    
# function for simulated annealing
# pathcities: array with our cities represented by their coordinates
# init_temp: initial temperature 
# thermostat: linear factor to decrease the temperature 
# ftol, itol, otol: tolerance values for stopping
# reannealing: schedule for reheating

def simulated_annealing( pathcities, init_temp, thermostat, ftol, itol, otol, reannealing):
    # ===============
    # SET THESE FOR DIAGNOSTICS
    # ================

    m=100000     
    distsize=otol+1
    dist=[]
    temperature = init_temp   


    N = pathcities.shape[0]
    
    # number of accepted steps
    it = 0                    

    # DISTANCE HERE IS OUR ENERGY 
    prev_E = distance(pathcities)  
    
    # number of iterations
    atp=0
    
    didPlot = False
    
    while it >=0:
    #while otol < itol:
        ## NUMBER OF CORNERS IS L
        L = np.max((np.floor(np.sqrt(temperature)).astype(int),1))
        #print "L", L
        #L = 2
        propose_path = changepath(pathcities, L) 

        new_E = distance(propose_path)
        deltaE  =  new_E -prev_E 

        if new_E < prev_E:
            pathcities  = propose_path
            #dist[it] =new_E
            dist.append(new_E)
            prev_E = new_E  
            it = it+1
            didPlot = False
            
        elif np.random.rand() < np.exp( -deltaE/temperature):
            pathcities  = propose_path
            #dist[it] =new_E
            dist.append(new_E)
            prev_E = new_E 
            it = it+1
            didPlot = False

        atp =atp +1;  # NUMBER OF ITERATIONS
        
        # check if it is time to cool down
        if it % reannealing == 0:
            temperature = thermostat * temperature;
            #temperature =  temperature/log(it);
            compl_temp=0;
            
            #if we get too cold, reheat
            if temperature < 0.01:
                temperature = 1
        
        if False: #some optional plotting
            if (it % 100 == 0) and not didPlot:       
                display.clear_output()
                plt.plot( dist, '-r')
                display.display(plt.gcf())
                print len(dist)
                print raw_input('Iteration: ' + np.str(atp))
                plt.close()
                didPlot = True
    
        if len(dist)>m and np.std(dist[-m:])/np.mean(dist[-m:]) < ftol:
            print 'ftol'
            break
        if atp >itol:
            print 'itol'
            break
        if len(dist)> 0 and dist[-1] <= otol:
            print 'otol'
            print dist[-1]
            break

    s = pathcities
    return s, dist



In [ ]:



In [ ]:



In [13]:

    
# simulated_annealing( pathcities, init_temp, thermostat, ftol, itol, otol, reannealing):
pcities, distances = simulated_annealing(cities, 10.0, 0.8, 0.01, 20000, 0, 10)

print len(distances)









    



itol
659



In [14]:

    
plt.plot(distances)









    Out[14]:





[<matplotlib.lines.Line2D at 0x109cdb910>]



In [15]:

    
plt.subplot(2,1,1)
plot_path(cities)

plt.subplot(2,1,2)
plot_path(pcities)



In [16]:

    
#print distances[-1]
print cities[0:2], pcities[0:2]









    



[[ 39.2833      -6.8       ]
 [ 30.12668109  -4.23637808]] [[ 39.2833      -6.8       ]
 [ 37.99819119  -4.18340462]]

Things to do:

1) Set Dar Es Salaam (39.2833 E 6.8000S) as starting/ending point. DONE

2) Set Maximum Distance that can be travelled in one day or try multiple maintenance crews. KIND OF DONE

3) Priortize pumps that have a small number of nearest neighbours. KIND OF DONE

4) Travelling Purchaser Problem where non-functioning pumps cost more to repair than 'needs repair'. NOT DONE



In [17]:

    
print pcities[pcities.shape[0]/2:pcities.shape[0]/2 + 10]









    



[[ 37.94762989  -4.47267294]
 [ 38.24396213  -4.6110009 ]
 [ 37.96645293  -4.44512228]
 [ 37.94026949  -4.37973878]
 [ 38.07456542  -4.46613143]
 [ 36.16076882  -5.57077088]
 [ 35.86216981  -5.97312346]
 [ 34.90059147 -11.01263306]
 [ 35.12157682 -10.78742477]
 [ 34.96317847 -10.95592463]]



In [18]:

    
#create multiple maintenance crews by splitting annealed cities into 3, all leaving from same depot.
cities1 = pcities[:pcities.shape[0]/3+1]
cities2 = pcities[pcities.shape[0]/3+1:2*pcities.shape[0]/3+1]
cities3 = pcities[2*pcities.shape[0]/3+1:]
cities2 = np.insert(cities2, 0, [39.2833, -6.8000], 0)
cities3 = np.insert(cities3, 0, [39.2833, -6.8000], 0)
print cities1[0], cities2[0], cities3[0]









    



[ 39.2833  -6.8   ] [ 39.2833  -6.8   ] [ 39.2833  -6.8   ]



In [19]:

    
pcities1, distances1 = simulated_annealing(cities1, 10.0, 0.8, 0.01, 20000, 0, 10)

pcities2, distances2 = simulated_annealing(cities2, 10.0, 0.8, 0.01, 20000, 0, 10)

pcities3, distances3 = simulated_annealing(cities3, 10.0, 0.8, 0.01, 20000, 0, 10)
print "1: ", len(distances1), "2: ", len(distances2), "3: ", len(distances3)









    



itol
itol
itol
1:  92 2:  118 3:  90



In [20]:

    
#I would have expected to see greater segregation of cities into distinct regions but possibily density is too high
plt.subplot(4,1,1)
plot_path(pcities)

plt.subplot(4,1,2)
plot_path(pcities1)

plt.subplot(4,1,3)
plot_path(pcities2)

plt.subplot(4,1,4)
plot_path(pcities3)



In [21]:

    
print distances[-1]
print distances1[-1] + distances2[-1] + distances3[-1]









    



41078.9480051
34453.4200932



In [22]:

    
#attempt to make comparison between sim anneal and genetic algorithm

start_time = time.clock()

p_big_mutate = 0.05
p_small_mutate = 0.4

fitness_scale=-0.5
pop_size=100
generations = 10**4
std_big = 1
std_small= 0.05

#def ras_fitness(g):
#    ans = 20+g[:,0]**2+g[:,1]**2-10.0*(np.cos(2*np.pi*g[:,0])+np.cos(2*np.pi*g[:,1]))
#    return ans**fitness_scale

def distance(pathcities):
    distance = 0.
    number_of_cities = pathcities.shape[0]
    
    # loop over all cities
    for j in xrange(number_of_cities):
        if j == number_of_cities - 1: # FINAL POINT CONNECTS WITH THE FIRST ONE
            distance += get_distance( pathcities[j,:], pathcities[0,:] )
        else:
            distance += get_distance( pathcities[j,:], pathcities[j+1,:] )
    return distance**fitness_scale

def transform(population_orig):
    # select two individuals for recombination
    population =population_orig.copy()
    indices = range(pop_size)
    np.random.shuffle(indices)
    temp = population[indices[0],1]
    population[indices[0],1] = population[indices[1],1]
    population[indices[1],1] = temp
    
    #perform mutation
    for i in range(pop_size):
        if np.random.rand() < p_big_mutate:
            population[i,0] = population[i,0]+std_big*np.random.randn()
        if np.random.rand()<p_small_mutate:
            population[i,0] = population[i,0]+std_small*np.random.randn()
        if np.random.rand()<p_big_mutate:
            population[i,1] = population[i,1]+std_big*np.random.randn()
        if np.random.rand()<p_small_mutate:
            population[i,1] = population[i,1]+std_small*np.random.randn()
            
    return population


#generates initial population
mean=[100,100]
cov=[[9,0],[0,9]]
#g_0 = np.random.multivariate_normal(mean,cov,pop_size)
g_0 = cities[:100]


generation_fitness = np.zeros(generations)
#put placeholder for optimal solution
optimal_sol = [-100,-100]
    
g_curr=g_0

for z in range(generations):
    
    if not z==0:
        g_curr = transform(g_curr)
        
    fit_curr = distance(g_curr)
    generation_fitness[z] = fit_curr.max()
    
    if z==0:
        optimal_sol = g_curr[np.argmax(fit_curr),:]
    elif generation_fitness[z]>generation_fitness[z-1]:
        optimal_sol = g_curr[np.argmax(fit_curr),:]
       
    marg_fit = fit_curr.cumsum()/fit_curr.sum()
    r=np.random.rand(pop_size)
    counts=np.zeros(pop_size)

    for i in range(pop_size):
        counts[i] = np.sum(marg_fit<=r[i]) 

    child_counts = counts
    
    g_new = []
    
    for i in range(pop_size):
        g_new.append(g_curr[child_counts[i],:])
    
    g_curr=np.array(g_new)

end_time = time.clock()



In [23]:

    
def cartesian_matrix(coords):
    '''create a distance matrix for the city coords
      that uses straight line distance'''
    matrix={}
    for i,(x1,y1) in enumerate(coords):
        for j,(x2,y2) in enumerate(coords):
            dx,dy=x1-x2,y1-y2
            dist=sqrt(dx*dx + dy*dy)
            matrix[i,j]=dist
    return matrix

matrix = cartesian_matrix(pcities)
#print matrix



In [24]:

    
print optimal_sol
print end_time - start_time
plt.plot(generation_fitness)
plt.show()









    



[ 57.85838175 -35.95260026]
25.512422

Prioritising remote pumps

Here we try to develop a weighting to encourage the maintenance crews to visit the most remote pumps first. We identify remoteness by performing KDTree analysis of the average distance between the 5 nearest neighbours. We then add this as a weight to the distance function that the SA uses to optimise the routes. Currently the implementation means that the distance reported are no longer true but the order will be reflective of this new priority and leads to longer routes as the crews no longer move to the optimal step as determined by simple distance.



In [76]:

    
kdt = scipy.spatial.cKDTree(pcities1) 
k = 5 # number of nearest neighbors 
dists, neighs = kdt.query(pcities1, k+1)
avg_dists = np.mean(dists[:, 1:], axis=1)



In [78]:

    
#avg_dists[:10]
#np.concatenate((a, b.T), axis=1)

avg_dists = avg_dists.reshape((pcities1.shape[0],1))
#avg_dists.shape
cities1 = np.concatenate((pcities1, avg_dists), axis=1)
print cities1.shape



In [79]:

    
avg_dists.sort()
plt.hist(avg_dists)









    Out[79]:





(array([ 39.,  13.,  17.,   8.,   4.,   7.,   2.,   2.,   1.,   1.]),
 array([ 0.14298556,  0.35924264,  0.57549972,  0.7917568 ,  1.00801389,
         1.22427097,  1.44052805,  1.65678513,  1.87304222,  2.0892993 ,
         2.30555638]),
 <a list of 10 Patch objects>)



In [80]:

    
def get_distance(city1, city2):

    '''Haversine formula - give coordinates as a 2D numpy array of
    (lat_decimal,lon_decimal) pairs'''
    #print city1[:], city2[:]
    # earth's mean radius = 6,371km
    EARTHRADIUS = 6371.0
    
    # retrieve coords:
    lat0, lon0 = city1[1], city1[0]
    lat1, lon1 = city2[1], city2[0]
    
    # convert to radians
    lat0 *= np.pi / 180.0
    lon0 *= np.pi / 180.0
    lat1 *= np.pi / 180.0
    lon1 *= np.pi / 180.0
    
    # get transposed meshgrids for distances
    #lat1_T = lat1.T.copy()
    #lon1_T = lon1.T.copy()

    #dlon = lon_T - lon
    #dlat = lat_T - lat
    dlon = lon1 - lon0
    dlat = lat1 - lat0
    
    a = (np.sin(dlat/2))**2 + np.cos(lat0) * np.cos(lat1) * (np.sin(dlon/2.0))**2
    c = 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0-a))
    
    #add weight to encourage visits to remote pumps (i.e. higher avg_dist for kdt)
    w = 1.0/(city1[2]*city2[2])
    #print "w: ", w
    km = EARTHRADIUS * c * w
    return km

# the energy for the whole system corresponds to
# the total distance the salesman has to travel
def distance(pathcities):
    distance = 0.
    number_of_cities = pathcities.shape[0]
    
    # loop over all cities
    for j in xrange(number_of_cities):
        if j == number_of_cities - 1: # FINAL POINT CONNECTS WITH THE FIRST ONE
            distance += get_distance( pathcities[j,:], pathcities[0,:] )
        else:
            distance += get_distance( pathcities[j,:], pathcities[j+1,:] )
    return distance

#get_distance(cities[0],cities[1])



In [81]:

    
cities1.shape









    Out[81]:





(94, 3)



In [88]:

    
pcities4, distances4 = simulated_annealing(cities1, 10.0, 0.8, 0.01, 20000, 0, 10)

print len(distances4)









    



itol
79



In [89]:

    
plt.plot(distances)
#print distances[-1]









    Out[89]:





[<matplotlib.lines.Line2D at 0x10dbb6310>]



In [90]:

    
plt.subplot(2,1,1)
plot_path(pcities1)

plt.subplot(2,1,2)
plot_path(pcities4)



In [91]:

    
print "distance optimised cities: ", pcities1[:10], "remote prioritized cities: ", pcities4[:10]









    



distance optimised cities:  [[ 39.2833      -6.8       ]
 [ 37.99819119  -4.18340462]
 [ 37.65935713  -3.17139227]
 [ 37.5755451   -3.05176518]
 [ 37.53509114  -2.9685929 ]
 [ 37.56696274  -3.27538102]
 [ 37.5321024   -3.25402039]
 [ 37.40303025  -3.25510401]
 [ 37.30456498  -3.27151948]
 [ 33.15587322  -8.41892723]] remote prioritized cities:  [[ 39.2833      -6.8          0.26239635]
 [ 37.99819119  -4.18340462   0.65054886]
 [ 37.53509114  -2.9685929    0.24793928]
 [ 37.5755451   -3.05176518   0.18717272]
 [ 37.65935713  -3.17139227   0.18887841]
 [ 37.56696274  -3.27538102   0.15595046]
 [ 37.5321024   -3.25402039   0.14298556]
 [ 37.40303025  -3.25510401   0.15029584]
 [ 36.62769846  -3.29031742   0.53046426]
 [ 33.42840944  -3.02901348   1.39707482]]



In [ ]:



In [ ]:



In [ ]: