In [1]:
%pylab inline
import numbapro
from numbapro import *


Populating the interactive namespace from numpy and matplotlib

In [4]:
%load https://raw.githubusercontent.com/Chiroptera/QCThesis/master/CUDA/K_Means.py

In [ ]:
"""
Created on Fri Mar 27 08:53:20 2015

@author: Diogo Silva
"""

import numpy as np
import numbapro
from numbapro import *

class K_Means:
    """K-means clustering with interchangeable distance backends.

    The point-to-centroid distance matrix can be computed either with
    vectorized NumPy (``_np_calc_dists``) or on the GPU via a numbapro
    CUDA kernel (``_cu_dist_kernel``).  Assignment and centroid
    recomputation always run on the host with NumPy.

    NOTE: Python 2 / numbapro code (``xrange``, legacy ``print`` in the
    surrounding notebook; numbapro is the deprecated predecessor of numba).
    """

    def __init__(self, N=None, D=None, K=None):
        # N: number of data points, D: dimensionality, K: number of
        # clusters.  All three are overwritten by fit() from the data.
        self.N = N
        self.D = D
        self.K = K

        # Cached device-side copy of the data so repeated CUDA calls do
        # not re-transfer the (N, D) array every iteration.
        self._cudaDataRef = None

    def fit(self, data, K, iters=3, cuda=False):
        """Run `iters` iterations of Lloyd's algorithm on `data`.

        Parameters
        ----------
        data  : (N, D) ndarray.  Should be float32 when cuda=True — the
                CUDA kernel is jit-compiled for float32 only.
        K     : number of clusters.
        iters : number of assign/update iterations; 0 only seeds the
                centroids.
        cuda  : compute the distance matrix on the GPU when True.

        The final centroids are stored in ``self.centroids``; nothing is
        returned.
        """
        N, D = data.shape

        self.N = N
        self.D = D
        self.K = K

        # _init_centroids also stores the seed centroids on self, so an
        # iters == 0 call still leaves self.centroids usable.
        centroids = self._init_centroids(data)

        if iters == 0:
            return

        for i in xrange(iters):
            dist_mat = self._calc_dists(data, centroids, cuda=cuda)
            assign, grouped_data = self._assign_data(data, dist_mat)
            centroids = self._np_recompute_centroids(grouped_data)
            self.centroids = centroids

    def _init_centroids(self, data):
        """Seed the K centroids by sampling K rows of `data` at random.

        The chosen row indices are saved in ``self.init_seed`` so a run
        can be reproduced/inspected.

        NOTE(review): sampling is done WITH replacement
        (np.random.randint), so two centroids can start identical;
        consider np.random.choice(self.N, self.K, replace=False).
        """
        centroids = np.empty((self.K, self.D), dtype=data.dtype)
        random_init = np.random.randint(0, self.N, self.K)
        self.init_seed = random_init

        for k in xrange(self.K):
            centroids[k] = data[random_init[k]]

        self.centroids = centroids

        return centroids

    def _calc_dists(self, data, centroids, cuda=False):
        """Dispatch distance computation to the CUDA or NumPy backend.

        Returns the (N, K) matrix of squared Euclidean distances from
        every point to every centroid.
        """
        if cuda:
            dist_mat = self._cu_calc_dists(data, centroids, gridDim=None,
                                           blockDim=None, memManage='manual')
        else:
            dist_mat = self._np_calc_dists(data, centroids)

        return dist_mat

    def _py_calc_dists(self, data, centroids):
        """Pure-Python reference implementation (very slow; for checking).

        Returns the (N, K) squared-Euclidean distance matrix.

        Fixed: the original version lacked ``self`` and referenced
        undefined names (``dim``, ``a``, ``b``, ``c``), so it could
        never run; it now mirrors the NumPy/CUDA backends' contract.
        """
        N, D = data.shape
        K, cD = centroids.shape

        dist_mat = np.empty((N, K), dtype=data.dtype)
        for n in range(N):
            for k in range(K):
                dist = 0
                for d in range(D):
                    diff = data[n, d] - centroids[k, d]
                    dist += diff ** 2
                dist_mat[n, k] = dist
        return dist_mat

    def _np_calc_dists(self, data, centroids):
        """NumPy implementation — much faster than vanilla Python.

        Broadcasts one centroid at a time against the whole data array,
        so peak temporary memory is one (N, D) array rather than
        (N, K, D).  Returns the (N, K) squared-distance matrix.
        """
        N, D = data.shape
        K, cD = centroids.shape

        dist_mat = np.empty((N, K), dtype=data.dtype)

        for k in xrange(K):
            dist = data - centroids[k]          # (N, D) differences
            dist = dist ** 2
            dist_mat[:, k] = dist.sum(axis=1)   # squared Euclidean dist

        return dist_mat

    def _cu_calc_dists(self, data, centroids, gridDim=None, blockDim=None,
                       memManage='manual', keepDataRef=True):
        """Compute the (N, K) squared-distance matrix on the GPU.

        memManage='manual' copies arrays to the device explicitly (and
        optionally caches the data array across calls via keepDataRef);
        'auto' lets numbapro manage transfers.

        TODO:
            - deal with gigantic data / distance matrix
            - deal with heavily asymmetric distance matrix
                - if the number of blocks on any given dimension of
                the grid > 2**16, divide that dimension by another dimension
                - don't forget to change the index computation in the kernel
        """
        N, D = data.shape
        K, cD = centroids.shape

        if memManage not in ('manual', 'auto'):
            raise ValueError("Invalid value for 'memManage'.")

        if gridDim is None or blockDim is None:
            # Heuristic launch configuration tuned for the author's GPU.
            MAX_THREADS_BLOCK = 28 * 20  # GT520M has 48 CUDA cores
            MAX_GRID_XYZ_DIM = 65535

            if K <= 28:
                # Narrow matrix: one block column per centroid, fill the
                # rest of the block with rows.
                blockWidth = K
                blockHeight = np.floor(MAX_THREADS_BLOCK / blockWidth)
                blockHeight = np.int(blockHeight)
            else:
                blockWidth = 20
                blockHeight = 28

            # grid width/height is the number of blocks necessary to fill
            # the columns/rows of the matrix
            gridWidth = np.ceil(np.float(K) / blockWidth)
            gridHeight = np.ceil(np.float(N) / blockHeight)

            blockDim = blockWidth, blockHeight
            gridDim = np.int(gridWidth), np.int(gridHeight)

        distShape = N, K
        dist_mat = np.empty(distShape, dtype=data.dtype)

        if memManage == 'manual':

            if keepDataRef:
                # Reuse the device copy of `data` across iterations; only
                # centroids change between calls.
                if self._cudaDataRef is None:
                    dData = cuda.to_device(data)
                    self._cudaDataRef = dData
                else:
                    dData = self._cudaDataRef
            else:
                dData = cuda.to_device(data)

            dCentroids = cuda.to_device(centroids)
            dDists = numbapro.cuda.device_array_like(dist_mat)

            self._cu_dist_kernel[gridDim, blockDim](dData, dCentroids, dDists)

            dDists.copy_to_host(ary=dist_mat)
            numbapro.cuda.synchronize()

        elif memManage == 'auto':
            # numbapro handles host<->device transfers implicitly.
            self._cu_dist_kernel[gridDim, blockDim](data, centroids, dist_mat)

        return dist_mat

    @numbapro.cuda.jit("void(float32[:,:], float32[:,:], float32[:,:])")
    def _cu_dist_kernel(a, b, c):
        # One thread per (point n, centroid k) cell of the output matrix:
        # c[n, k] = squared Euclidean distance between a[n] and b[k].
        k, n = numbapro.cuda.grid(2)

        ch, cw = c.shape  # c height (N) and width (K)

        # Guard threads that fall outside the matrix (grid is rounded up).
        if n >= ch or k >= cw:
            return

        dist = 0.0
        for d in range(a.shape[1]):
            diff = a[n, d] - b[k, d]
            dist += diff ** 2
        c[n, k] = dist

    def _assign_data(self, data, dist_mat):
        """Assign each point to its nearest centroid.

        Returns ``(assign, grouped_data)`` where ``assign`` is the (N,)
        array of cluster indices and ``grouped_data`` is a length-K list
        of arrays, one per cluster (possibly empty).
        """
        N, K = dist_mat.shape

        assign = np.argmin(dist_mat, axis=1)

        grouped_data = [[] for i in xrange(K)]

        for n in xrange(N):
            # add datum n to its assigned cluster assign[n]
            grouped_data[assign[n]].append(data[n])

        for k in xrange(K):
            grouped_data[k] = np.array(grouped_data[k])

        return assign, grouped_data

    def _np_recompute_centroids(self, grouped_data):
        """Recompute each centroid as the mean of its assigned points.

        NOTE(review): an empty cluster yields np.mean of an empty array
        (NaN plus a RuntimeWarning) — the empty-cluster case is not
        handled.  Also, np.empty without a dtype promotes centroids to
        float64 even when the data is float32 — confirm the CUDA path
        tolerates this.
        """
        # change to get dimension from class or search a non-empty cluster
        #dim = grouped_data[0][0].shape[1]
        dim = self.D
        K = len(grouped_data)

        centroids = np.empty((K, dim))
        for k in xrange(K):
            centroids[k] = np.mean(grouped_data[k], axis=0)

        return centroids

In [25]:
##generate data
n = 1e6
d = 2
k = 20

total_bytes = (n * d + k * d + n * k) * 4
print 'Memory used by arrays:\t',total_bytes/1024,'\tKBytes'
print '\t\t\t',total_bytes/(1024*1024),'\tMBytes'

print 'Memory used by data:  \t',n * d * 4 / 1024,'\t','KBytes'

data = np.random.random((n,d)).astype(np.float32)


Memory used by arrays:	85937.65625 	KBytes
			83.9234924316 	MBytes
Memory used by data:  	7812.5 	KBytes

In [4]:
from timeit import default_timer as timer

In [5]:
times=dict()

In [13]:
# Wall-clock the CUDA-backed run: 3 iterations of k-means on `data`
# (defined in the data-generation cell) with k clusters.
start = timer()
grouperCUDA = K_Means()
grouperCUDA.fit(data,k,iters=3,cuda=True)
# Elapsed seconds include host<->device transfers, not just kernel time.
times['cuda'] = timer() - start

In [7]:
#%debug
# Wall-clock the pure-NumPy run; fit() defaults to iters=3, matching
# the CUDA run above, so the two timings are comparable.
start = timer()
grouperNP = K_Means()
grouperNP.fit(data,k,cuda=False)
times['numpy'] = timer() - start

In [14]:
# Report both end-to-end timings (seconds) side by side.
print 'Times'
print 'CUDA ','\t',times['cuda']
print 'NumPy','\t',times['numpy']


Times
CUDA  	29.3555300236
NumPy 	32.5373690128

In [16]:
import cProfile
#from line_profiler import LineProfiler


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-16-a0b61c615a14> in <module>()
----> 1 from line_profiler import LineProfiler

ImportError: No module named line_profiler

In [34]:
#profile = LineProfiler(grouperCUDA.fit(data,k,iters=3,cuda=True))
#profile.print_stats()
#cProfile.run("grouperCUDA.fit(data,k,iters=3,cuda=True)")

In [ ]:
#profile = LineProfiler(grouperCUDA.fit(data,k,iters=3,cuda=True))
#profile.print_stats()
#cProfile.run("grouperCUDA.fit(data,k,iters=3,cuda=False)")

In [23]:
# NOTE(review): these numbers are hardcoded from a separate profiling
# run (presumably the cProfile output of the distance-computation step
# and the total run) — re-run the profiler to regenerate them.
print 'Speedup:','\t\t', 1.367/0.319
print 'Centroids portion','\t',25.2/31


Speedup: 		4.28526645768
Centroids portion 	0.812903225806

In [ ]:
Profiling both runs (CUDA and NumPy) shows a speedup, but a very small one overall. The optimized portion of the code runs about 4x faster, yet the remaining host-side work dominates so heavily that the end-to-end gain is marginal. Centroid recomputation alone accounts for roughly 80% of the total time, making it the obvious next target for optimization.