Has cudaMalloc been changed to be asynchronous?

Question

Has cudaMalloc been changed to be asynchronous?

I have read elsewhere that cudaMalloc synchronizes across kernels (see, for example, "Will cudaMalloc synchronize host and device?"). However, I just tested the code below, and based on what I see in the Visual Profiler, it looks like cudaMalloc is not synchronizing. If you add the cudaFree call to the loop, that does synchronize. I am using CUDA 7.5. Does anyone know whether cudaMalloc has changed its behavior? Or am I missing some subtlety? Many thanks!

// Long-running filler kernel: repeatedly scales a thread-local value one
// million times. The literal below has no `f` suffix, so each multiply is
// carried out in double precision and narrowed back to float. Nothing is
// written to memory; the kernel exists only to keep its stream occupied.
__global__ void slowKernel()
{
  float acc = 5;
  int step = 0;
  while( step < 1000000 ){
    acc *= .9999999;
    ++step;
  }
}

// Short-running filler kernel: the same scaling loop as slowKernel but with
// one hundred thousand iterations. The unsuffixed literal makes each multiply
// a double-precision operation whose result is narrowed back to float.
// Nothing is written to memory.
__global__ void fastKernel()
{
  float acc = 5;
  int step = 0;
  while( step < 100000 ){
    acc *= .9999999;
    ++step;
  }
}

// Repro for the question: queue one slowKernel on its own stream, then run
// ten rounds of "cudaMalloc followed by a fastKernel launch" on a second
// stream. The allocations are deliberately never released here; the
// commented-out cudaFree is the switch that changes the observed behaviour.
void mallocSynchronize(){
  cudaStream_t slowStream, fastStream;
  cudaStreamCreate( &slowStream );
  cudaStreamCreate( &fastStream );

  // Occupy the first stream before any allocation is attempted.
  slowKernel <<<1, 1, 0, slowStream >>>();

  const size_t allocBytes = 4 * 1024 * 1024;
  int *dev_a = 0;
  int round = 0;
  while( round < 10 ){
    cudaMalloc( &dev_a, allocBytes );
    fastKernel <<<1, 1, 0, fastStream >>>();
    // cudaFree( dev_a ); // If you uncomment this, the second fastKernel launch will wait until slowKernel completes
    ++round;
  }
}

+4

cuda

hildy Mar 15 '16 at 1:55

source share

1 answer

talonmies · Answer 1 · 2016-03-16T16:01:04+0000

, ( , , , , ).

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Long-running filler kernel (ten million scaling steps).
//
// output: location updated only when `write` is true; callers that leave
//         `write` at its default may pass a null pointer.
// write:  when true, subtracts the accumulated value from *output.
//
// The unsuffixed literal is a double, so each multiply is done in double
// precision and narrowed back to float.
__global__ void slowKernel(float *output, bool write=false)
{
    float acc = 5;
#pragma unroll
    for( int step = 0; step < 10000000; ++step ){
        acc *= .9999999;
    }
    if (!write) {
        return;
    }
    *output -= acc;
}

// Short-running filler kernel (one hundred thousand scaling steps).
//
// output: location updated only when `write` is true; callers that leave
//         `write` at its default may pass a null pointer.
// write:  when true, subtracts the accumulated value from *output.
//
// The unsuffixed literal is a double, so each multiply is done in double
// precision and narrowed back to float.
__global__ void fastKernel(float *output, bool write=false)
{
    float acc = 5;
#pragma unroll
    for( int step = 0; step < 100000; ++step ){
        acc *= .9999999;
    }
    if (!write) {
        return;
    }
    *output -= acc;
}

// Blocks the calling host thread for at least `val` nanoseconds.
//
// val: requested delay in nanoseconds; zero or negative means "do not wait".
//
// nanosleep() only accepts a tv_nsec in [0, 999999999] and fails with EINVAL
// otherwise, so the request is split into whole seconds plus a sub-second
// remainder. If a signal interrupts the sleep (EINTR), the wait is resumed
// with the time nanosleep() reports as still outstanding.
void burntime(long val) {
    if (val <= 0) {
        return;  // nothing to wait for; never hand nanosleep a negative field
    }

    const long kNanosPerSecond = 1000000000L;
    struct timespec request;
    request.tv_sec = (time_t)(val / kNanosPerSecond);
    request.tv_nsec = val % kNanosPerSecond;

    struct timespec remaining = request;
    while (nanosleep(&request, &remaining) != 0 && errno == EINTR) {
        request = remaining;
    }
}

// Prints a diagnostic and terminates the process if a CUDA runtime call did
// not return cudaSuccess. `what` names the call for the error message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Experiment driver: starts slowKernel on one stream, waits 500 ms on the
// host, then performs ten rounds of "cudaMalloc + fastKernel launch" on a
// second stream so a profiler timeline shows whether the allocations and
// fast kernels overlap the still-running slow kernel.
//
// Every runtime call and launch is checked, because a silently failing
// cudaMalloc would return early and distort the timeline being studied.
// Buffers and streams are released only after all work has been queued and
// the device has been drained, so the cleanup cannot affect the overlap
// under observation. As a consequence this function returns only once all
// queued kernels have finished.
void mallocSynchronize(){
    const int kRounds = 10;
    const size_t sz = 1 << 21;

    cudaStream_t stream1, stream2;
    checkCuda(cudaStreamCreate( &stream1 ), "cudaStreamCreate(stream1)");
    checkCuda(cudaStreamCreate( &stream2 ), "cudaStreamCreate(stream2)");

    // A null output pointer is fine: `write` defaults to false in the kernel.
    slowKernel <<<1, 1, 0, stream1 >>>((float *)(0));
    checkCuda(cudaGetLastError(), "slowKernel launch");
    burntime(500000000L); // 500ms wait - slowKernel around 1300ms

    // One slot per round so no allocation is lost when the next one is made.
    int *dev_a[kRounds] = {0};
    for( int i = 0; i < kRounds; i++ ){
        checkCuda(cudaMalloc( &dev_a[i], sz ), "cudaMalloc");
        fastKernel <<<1, 1, 0, stream2 >>>((float *)(0));
        checkCuda(cudaGetLastError(), "fastKernel launch");
        burntime(1000000L); // 1ms wait - fastKernel around 15ms
    }

    // Drain the device first; this also surfaces any asynchronous kernel error.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    for( int i = 0; i < kRounds; i++ ){
        checkCuda(cudaFree( dev_a[i] ), "cudaFree");
    }
    checkCuda(cudaStreamDestroy( stream1 ), "cudaStreamDestroy(stream1)");
    checkCuda(cudaStreamDestroy( stream2 ), "cudaStreamDestroy(stream2)");
}

// Runs the experiment, waits for all queued GPU work, and resets the device
// so profiling data is flushed. Returns 0 on success and 1 if either the
// final synchronization or the device reset reports an error.
int main()
{
    mallocSynchronize();

    // Failures inside a kernel are reported by a later synchronizing call,
    // so this result must not be discarded.
    int exitCode = 0;
    cudaError_t syncStatus = cudaDeviceSynchronize();
    if (syncStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize failed: %s\n",
                cudaGetErrorString(syncStatus));
        exitCode = 1;
    }

    // Reset even after a failed sync so the device is left in a clean state.
    cudaError_t resetStatus = cudaDeviceReset();
    if (resetStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed: %s\n",
                cudaGetErrorString(resetStatus));
        exitCode = 1;
    }
    return exitCode;
}

[Note: this uses the POSIX time function nanosleep, so it will not build on Windows as-is.]

(GTX970) , cudaMalloc slowKernel , fastKernel . , , . 0,5 . , burntime, , , .

, : , , Linux CUDA 7.5 Maxwell. , , , , , / . CUDA , , Fermi Kepler.

Has cudaMalloc been changed to be asynchronous?

More articles: