When I increase the number of dimensions from 8 to 9 in my kernel, it fails with an out-of-resources error.
I read in *How do I diagnose a failure to start CUDA due to lack of resources?* that a parameter mismatch or overuse of registers can cause this, but neither seems to apply here.
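This is how I checked the register usage for the kernel in the example below. I'm relying on the per-function resource attributes that, if I read the docs correctly, PyCUDA exposes on a compiled function:

```python
import pycuda.autoinit
from pycuda import compiler

mod = compiler.SourceModule(kernel_code)   # kernel_code as in the example below
func = mod.get_function("kmeans_kernel")
print func.num_regs           # registers per thread
print func.local_size_bytes   # local (spilled) memory per thread
print func.shared_size_bytes  # static shared memory per block
```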
My kernel computes the distances between n points and m centroids and selects the nearest centroid for each point. It works for 8 dimensions, but not for 9: when I set dimensions=9 and uncomment the two lines that add the ninth dimension to the distance, I get `pycuda._driver.LaunchError: cuLaunchGrid failed: launch out of resources`.
Do you think the extra distance computation can cause this behavior? What other issues can trigger a *launch out of resources* error?
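My current understanding (an assumption on my part) is that a launch can fail when `threads_per_block * registers_per_thread` exceeds what the device provides per block, so this is the arithmetic I tried; the `device_attribute` names are taken from the driver API:

```python
import pycuda.autoinit
from pycuda import driver

dev = pycuda.autoinit.device
regs_per_block = dev.get_attribute(driver.device_attribute.MAX_REGISTERS_PER_BLOCK)
max_threads = dev.get_attribute(driver.device_attribute.MAX_THREADS_PER_BLOCK)
print regs_per_block, max_threads

# with 512-thread blocks, every thread may use at most this many registers
# (8192 / 512 = 16 on a compute 1.1 device like mine, if my numbers are right)
block_size = 512
print regs_per_block // block_size
```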
I am using a Quadro FX 580. Here is a minimal(ish) example; in the real code I use templates.
```python
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
import pycuda.autoinit

## preferences
np.random.seed(20)
points = 512
dimensions = 8
nclusters = 1

## init data
data = np.random.randn(points, dimensions).astype(np.float32)
clusters = data[:nclusters]

## init cuda
kernel_code = """
// the kernel definition
__device__ __constant__ float centroids[16384];

__global__ void kmeans_kernel(float *idata, float *g_centroids,
                              int *cluster, float *min_dist,
                              int numClusters, int numDim)
{
    int valindex = blockIdx.x * blockDim.x + threadIdx.x;
    float increased_distance, distance, minDistance;
    minDistance = 10000000;
    int nearestCentroid = 0;
    for (int k = 0; k < numClusters; k++) {
        distance = 0.0;
        increased_distance = idata[valindex*numDim]   - centroids[k*numDim];
        distance = distance + (increased_distance * increased_distance);
        increased_distance = idata[valindex*numDim+1] - centroids[k*numDim+1];
        distance = distance + (increased_distance * increased_distance);
        increased_distance = idata[valindex*numDim+2] - centroids[k*numDim+2];
        distance = distance + (increased_distance * increased_distance);
        increased_distance = idata[valindex*numDim+3] - centroids[k*numDim+3];
        distance = distance + (increased_distance * increased_distance);
        increased_distance = idata[valindex*numDim+4] - centroids[k*numDim+4];
        distance = distance + (increased_distance * increased_distance);
        increased_distance = idata[valindex*numDim+5] - centroids[k*numDim+5];
        distance = distance + (increased_distance * increased_distance);
        increased_distance = idata[valindex*numDim+6] - centroids[k*numDim+6];
        distance = distance + (increased_distance * increased_distance);
        increased_distance = idata[valindex*numDim+7] - centroids[k*numDim+7];
        distance = distance + (increased_distance * increased_distance);
        // uncommenting these two lines (with dimensions = 9) triggers the error
        //increased_distance = idata[valindex*numDim+8] - centroids[k*numDim+8];
        //distance = distance + (increased_distance * increased_distance);
        if (distance < minDistance) {
            minDistance = distance;
            nearestCentroid = k;
        }
    }
    cluster[valindex] = nearestCentroid;
    min_dist[valindex] = sqrt(minDistance);
}
"""

mod = compiler.SourceModule(kernel_code)
centroids_adrs = mod.get_global('centroids')[0]
kmeans_kernel = mod.get_function("kmeans_kernel")

clusters_gpu = gpuarray.to_gpu(clusters)
cluster = gpuarray.zeros(points, dtype=np.int32)
min_dist = gpuarray.zeros(points, dtype=np.float32)
driver.memcpy_htod(centroids_adrs, clusters)
distortion = gpuarray.zeros(points, dtype=np.float32)
block_size = 512

## start kernel
kmeans_kernel(
    driver.In(data), driver.In(clusters), cluster, min_dist,
    np.int32(nclusters), np.int32(dimensions),
    grid=(points / block_size, 1),
    block=(block_size, 1, 1),
)

print cluster
print min_dist
```
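For completeness, this is the plain NumPy version I compare the GPU results against (just a sketch; `kmeans_assign_cpu` is a helper name I made up for this post):

```python
import numpy as np

def kmeans_assign_cpu(data, centroids):
    # squared Euclidean distance from every point to every centroid
    diff = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    dist2 = (diff * diff).sum(axis=2)     # shape: (points, nclusters)
    cluster = dist2.argmin(axis=1).astype(np.int32)
    min_dist = np.sqrt(dist2.min(axis=1)).astype(np.float32)
    return cluster, min_dist

cluster_ref, min_dist_ref = kmeans_assign_cpu(data, clusters)
print np.all(cluster_ref == cluster.get())          # gpuarray.get() copies back to host
print np.allclose(min_dist_ref, min_dist.get(), atol=1e-5)
```

For 8 dimensions the results match, which is why I believe the kernel logic itself is correct.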