In [1]:
import time
import collections
import sys
sys.path.append('/Users/jay/github/pysal/')
import pysal as ps

%load_ext line_profiler

#Just to confirm that I am on the newest version
print ps.version


1.7.0dev

List based contiguity


In [22]:
def queen(fname):
    #ta = time.time()
    #t1 = time.time()
    #fname = '10000_poly.shp'
    shpFileObject = ps.open(fname)
    if shpFileObject.type != ps.cg.Polygon:
        print "FAILED"
    numPoly = len(shpFileObject)
    #t2 = time.time()
    #print "Opening took {} seconds.".format(t2-t1)
    
    #t1 = time.time()
      
    w = collections.defaultdict(set)    
    #w = {}
    #for i in range(numPoly):
        #w[i] = set()
    #t2 = time.time()
    #print "Preallocating the W took {} seconds.".format(t2-t1)
    
    #t1 = time.time()
    geoms = collections.deque()
    offsets = collections.deque()  #len(offsets) == len(geoms)
    c = 0  #PolyID Counter
    for n in range(numPoly):
        verts = shpFileObject.get(n).vertices
        offsets.extend( [c] * len(verts) )
        geoms.extend(verts)
        c += 1
    #t2 = time.time()
    #print "Extending took {} seconds.".format(t2-t1)
    
    #t1 = time.time()
    items = collections.defaultdict(set)
    for i, vertex in enumerate(geoms):
        items[vertex].add(offsets[i])
    shared_vertices = []
    for item, location in items.iteritems():
        if len(location) > 1:
            shared_vertices.append(location)
    #t2 = time.time()
    #print "Shared vertex identification took {} seconds".format(t2-t1)
    #Shared Vertices is a list, by index of those polys that share a vertex.
    #t1 = time.time()
    #print shared_vertices
    for vert_set in shared_vertices:
        for v in vert_set:
            w[v] = w[v] | vert_set
            #try:
                #w[v].remove(v)
            #except:
                #pass
    #t2 = time.time()
    #tb = time.time()
    #print "Total processing time was {} seconds.".format(tb-ta)
    return w
w = queen('2500_poly.shp')

Line profiler for original Queen


In [3]:
%lprun -f queen queen('2500_poly.shp')


Opening took 0.00869393348694 seconds.
Preallocating the W took 1.19209289551e-05 seconds.
Extending took 0.710283041 seconds.
Shared vertex identification took 0.288301944733 seconds
Total processing time was 1.22213506699 seconds.

Two pass algorithm

This is what Serge and I chatted about last week. We make two passes over the data.

  1. Read the vertices from the shapefile. As they are read, parse them into a dictionary of vertices with {(vertex):set[idA, idB, ..., idC]}
  2. Loop over the values in the vertex dictionary. Loop over each entry in the set and union any existing neighbors with the new entries.

Questions:

  1. Can the double loop in #2 be converted into a single using a set operation? I do not think so, because we need the keys.
  2. Self neighbors - how do we want to handle this?

In [23]:
def onestep(fname):
    shpFileObject = ps.open(fname)
    if shpFileObject.type != ps.cg.Polygon:
        print "FAILED"
    numPoly = len(shpFileObject)
    
    vertices = collections.defaultdict(set)
    for i, s in enumerate(shpFileObject):
        newvertices = s.vertices[:-1]
        for v in newvertices:
            vertices[v].add(i)
            
    w = collections.defaultdict(set)
    for neighbors in vertices.itervalues():
        for neighbor in neighbors:
            w[neighbor] = w[neighbor] | neighbors
        
    return w
    
neww = onestep('2500_poly.shp')

In [25]:
assert w == neww
print w[0]
print neww[0]


set([0, 1447, 1001, 2287, 272, 755, 180, 2326])
set([0, 1447, 1001, 2287, 272, 755, 180, 2326])

In [19]:
%lprun -f onestep onestep('2500_poly.shp')




In [3]:
import pysal as ps
t1 = time.time()
w = ps.queen_from_shapefile(fname)
t2 = time.time()
print t2-t1


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-964c2e86945a> in <module>()
      1 import pysal as ps
      2 t1 = time.time()
----> 3 w = ps.queen_from_shapefile(fname)
      4 t2 = time.time()
      5 print t2-t1

NameError: name 'fname' is not defined

In [41]:
#Check to see that these two dicts are identical
for k, v in w.neighbors.iteritems():
    if sorted(improved_w[k]) != sorted(v):
        print improved_w[k], v

Rook


In [43]:
def rook(fname):
    ta = time.time()
    t1 = time.time()
    #fname = '2500_poly.shp'
    shpFileObject = ps.open(fname)
    if shpFileObject.type != ps.cg.Polygon:
        print "FAILED"
    numPoly = len(shpFileObject)
    t2 = time.time()
    #print "Opening took {} seconds.".format(t2-t1)
    
    t1 = time.time()
        
    w = {}
    for i in range(numPoly):
        w[i] = set()
    t2 = time.time()
    #print "Preallocating the W took {} seconds.".format(t2-t1)
    
    t1 = time.time()
    geoms = []
    offsets = []  #len(offsets) == len(geoms)
    c = 0  #PolyID Counter
    for n in range(numPoly):
        verts = shpFileObject.get(n).vertices
        for v in range(len(verts)-1):
            geoms.append(tuple(sorted([verts[v], verts[v+1]])))
        offsets += [c] * (len(verts)-1)
        c += 1
    t2 = time.time()
    #print "Reading took {} seconds.".format(t2-t1)
    
    t1 = time.time()
    items = collections.defaultdict(set)
    for i, item in enumerate(geoms):
        items[item].add(offsets[i])
    shared_vertices = []
    for item, location in items.iteritems():
        if len(location) > 1:
            shared_vertices.append(location)
    t2 = time.time()
    #print "Shared vertex identification took {} seconds".format(t2-t1)
    #Shared Vertices is a list, by index of those polys that share a vertex.
    t1 = time.time()
    for vert_set in shared_vertices:
    
        for v in vert_set:
            w[v] = w[v] | vert_set
            try:
                w[v].remove(v)
            except:
                pass
    t2 = time.time()
    
    t1 = time.time()
    for k, v in w.iteritems():
        w[k] = list(v)
    t2 = time.time()
    
    tb = time.time()
    print "Total processing time was {} seconds.".format(tb-ta)

In [33]:
import pysal as ps
t1 = time.time()
w = ps.rook_from_shapefile(fname)
t2 = time.time()
print t2-t1
print w[2499]


2.30595707893
{2262: 1.0, 3080: 1.0, 2353: 1.0, 2514: 1.0, 2326: 1.0, 2615: 1.0}

In [34]:
#Check to see that these two dicts are identical
for k, v in w.neighbors.iteritems():
    if sorted(improved_w[k]) != sorted(v):
        print improved_w[k], v

In [18]:
fnames = ['TestData/62500_poly.shp']

for f in fnames:
    #Open the data source once to get the file pointer
    touch = open(f)
    touch.close()
    print f
    queen(f)
    t1 = time.time()
    ps.queen_from_shapefile(f)
    t2 = time.time()
    print "Original queen took {} seconds".format(t2 - t1)
    rook(f)
    t1 = time.time()
    ps.rook_from_shapefile(f)
    t2 = time.time()
    print "Original rook took {} seconds".format(t2 - t1)
    print


TestData/62500_poly.shp
Opening took 0.148154973984 seconds.
Preallocating the W took 0.082967042923 seconds.
Reading took 5.90046095848 seconds.
Shared vertex identification took 11.0241189003 seconds
Converting from shared points to W took 1.01461482048 seconds.
Total processing time was 18.1916129589 seconds.
Original queen took 9.17589497566 seconds
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-3709db48eeac> in <module>()
     11     t2 = time.time()
     12     print "Original queen took {} seconds".format(t2 - t1)
---> 13     rook(f)
     14     t1 = time.time()
     15     ps.rook_from_shapefile(f)

NameError: name 'rook' is not defined


In [1]:
queen_fast = np.array([0.45458817482,
                       1.27110695839,
                       3.097905159,
                       5.1691391468,
                       6.72036194801,
                       7.85625100136,
                       11.6823840141,
                       12.812087059])
queen_slow = np.array([1.07005596161,
                       4.87641596794,
                       16.385130167,
                       37.7573301792,
                       60.615033865,
                       80.6158120632,
                       158.111407995,
                       191.939512968])
rook_fast = np.array([0.727967023849,
                      1.79104304314,
                      3.43500900269,
                      6.94867897034,
                      8.01557397842,
                      9.91741108894,
                      14.2835550308,
                      16.4100689888])
rook_slow = np.array([1.16798591614,
                      6.33092308044,
                      18.7772500515,
                      44.9458069801,
                      60.5921809673,
                      88.5189139843,
                      162.452570915,
                      203.032716036])
x = np.array([2500,10000,22500,40000,50000,62500,90000,100000])

In [9]:
plot(x, queen_fast, label='Fast')
plot(x, queen_slow, label='Slow')
xlabel('Num. Polys')
ylabel('Time (sec.)')
title('Queen')
legend(loc=2)
grid()



In [11]:
plot(x, rook_fast, label='Fast')
plot(x, rook_slow, label='Slow')
xlabel('Num. Polys')
ylabel('Time (sec.)')
title('Rook')
legend(loc=2)
grid()



In [14]:
plot(x, queen_slow / queen_fast, label='Queen Speedup')
plot(x, rook_slow / rook_fast, label='Rook Speedup')
xlabel('Num. Polys')
ylabel('Speedup')
title('Rook')
legend(loc=4)
grid()



In [ ]: