Nvidia OpenMP offload gives wrong reduction result

I am interested in offloading work on the GPU using OpenMP.

The code below gives the correct value of sum on the CPU

//g++ -O3 -Wall foo.cpp -fopenmp
#pragma omp parallel for reduction(+:sum)                                                                                                                                    
for(int i = 0 ; i < 2000000000; i++) sum += i%11;

It also runs on a GPU with OpenACC as follows

//g++ -O3 -Wall foo.cpp -fopenacc   
#pragma acc parallel loop reduction(+:sum)                                                                                                                                    
for(int i = 0 ; i < 2000000000; i++) sum += i%11;

nvprof shows that it runs on the GPU and is also faster than OpenMP on the CPU.

However, when I try to offload to the GPU using OpenMP like this,

//g++ -O3 -Wall foo.cpp -fopenmp -fno-stack-protector
#pragma omp target teams distribute parallel for reduction(+:sum)
for(int i = 0 ; i < 2000000000; i++) sum += i%11;

it gets the wrong result for sum (it just returns zero). nvprof seems to show that it runs on the GPU, but it is much slower than OpenMP on the CPU.

Why doesn't the reduction work with OpenMP on the GPU?

Here is the complete code I used to verify this

#include <stdio.h>
//g++ -O3 -Wall acc2.cpp -fopenmp -fno-stack-protector                                                                                                                           
//sudo nvprof ./a.out                                                                                                                                                            
int main (void) {
  // Accumulator for the reduction.
  // NOTE(review): under OpenMP 4.5 a scalar referenced in a target region is
  // implicitly firstprivate, so without an explicit map(tofrom:sum) the value
  // computed on the device is never copied back -- this is why the program
  // prints 0 (see the answer below).
  int sum = 0;
  // CPU version (correct result):
  //#pragma omp parallel for reduction(+:sum)
  // OpenACC GPU version (correct result):
  //#pragma acc parallel loop reduction(+:sum)
  // OpenMP GPU offload version under test -- prints 0 as written:
  #pragma omp target teams distribute parallel for reduction(+:sum)
  for(int i = 0 ; i < 2000000000; i++) {
    sum += i%11;
  }
  printf("sum = %d\n",sum);
  return 0;
}

Using GCC 7.2.0, Ubuntu 17.10, along with gcc-offload-nvptx

+6
1

The fix turned out to be simple: add map(tofrom:sum):

//g++ -O3 -Wall foo.cpp -fopenmp -fno-stack-protector
#pragma omp target teams distribute parallel for reduction(+:sum) map(tofrom:sum)
for(int i = 0 ; i < 2000000000; i++) sum += i%11;

Without that clause the value of sum is never copied back from the device; this is a difference between how OpenACC and OpenMP target treat scalar variables.

Note: for better performance you can also add simd to the directive.


There are other ways to fix it as well. For example:

#pragma omp target data map(tofrom: sum)
#pragma omp target teams distribute parallel for reduction(+:sum)
for(int i = 0 ; i < 2000000000; i++) sum += i%11;

Another option is defaultmap(tofrom:scalar):

#pragma omp target teams distribute parallel for reduction(+:sum) defaultmap(tofrom:scalar)

The root cause is that, since OpenMP 4.5, scalar variables referenced in a target region are implicitly firstprivate instead of being mapped. See https://developers.redhat.com/blog/2016/03/22/what-is-new-in-openmp-4-5-3/

defaultmap(tofrom:scalar) restores the pre-4.5 default mapping behavior for scalars, so sum is copied back automatically.


I also found a way to see how the work is distributed across teams and threads, and to distribute it manually (each team takes a contiguous slice of the iterations, and each thread a slice of its team's range).

#include <omp.h>
#include <stdio.h>

//g++ -O3 -Wall acc2.cpp -fopenmp -fno-stack-protector
//sudo nvprof ./a.out

/*
 * End boundary of partition slice `a` when `b` items are split as evenly as
 * possible into `c` chunks.  Symmetric in its first two arguments; equals
 * floor(a*b/c) computed without forming the product a*b, provided
 * (max(a,b)%c)*min(a,b) does not overflow int.
 */
static inline int foo(int a, int b, int c) {
  int lo = a < b ? a : b;  /* smaller of the pair */
  int hi = a < b ? b : a;  /* larger of the pair  */
  /* split hi = (hi/c)*c + hi%c so the multiplication stays small */
  return (hi / c) * lo + (hi % c) * lo / c;
}

// Demonstrates how the runtime lays out teams/threads on the device, then
// reproduces the reduction by partitioning the iteration space by hand
// (per team, then per thread within the team) and combining with an atomic.
int main (void) {
  int nteams = 0, nthreads = 0;

  // First target region: only query the device's team/thread geometry.
  #pragma omp target teams map(tofrom: nteams) map(tofrom:nthreads)
  {
    nteams = omp_get_num_teams();
    #pragma omp parallel
    #pragma omp single
    nthreads = omp_get_num_threads();
  }
  int N = 2000000000;
  int sum = 0;

  // Make foo callable from device code.
  // NOTE(review): this declarative directive sits inside main, after foo's
  // definition; it is conventionally written at file scope next to foo --
  // confirm the target compiler accepts this placement.
  #pragma omp declare target(foo)  

  // Second target region: manual two-level partition of the N iterations.
  #pragma omp target teams map(tofrom: sum)
  {
    int nteams = omp_get_num_teams();
    int iteam = omp_get_team_num();
    int start  = foo(iteam+0, N, nteams);  // first iteration of this team
    int finish = foo(iteam+1, N, nteams);  // one past this team's last
    int n2 = finish - start;               // iterations owned by this team
    #pragma omp parallel
    {
      int sum_team = 0;                    // per-thread partial sum
      int ithread = omp_get_thread_num();
      int nthreads = omp_get_num_threads();
      int start2  = foo(ithread+0, n2, nthreads) + start;
      int finish2 = foo(ithread+1, n2, nthreads) + start;
      for(int i=start2; i<finish2; i++) sum_team += i%11;
      // Serialize only the final combine, not the inner loop.
      #pragma omp atomic
      sum += sum_team;
    }   
  }   

  printf("devices %d\n", omp_get_num_devices());
  printf("default device %d\n", omp_get_default_device());
  printf("device id %d\n", omp_get_initial_device());
  printf("nteams %d\n", nteams);
  printf("nthreads per team %d\n", nthreads);
  printf("total threads %d\n", nteams*nthreads);
  printf("sum %d\n", sum);
  return 0;
}

nvprof shows most of the time is spent in cuCtxSynchronize. This version was still slower than OpenACC.


Update: I got much better performance by adding simd:

#pragma omp target teams distribute parallel for simd reduction(+:sum) map(tofrom:sum).

or, equivalently:

#pragma omp target map(tofrom:sum)
#pragma omp teams distribute parallel for simd reduction(+:sum)

OMP_GPU    0.25 s
ACC        0.47 s
OMP_CPU    0.64 s

With simd, OpenMP on the GPU is now faster than OpenACC and than OpenMP on the CPU. I don't know why OpenACC is slower.

Note that on Ubuntu 18.04 with gcc-offload-nvptx, -fno-stack-protector is still required.

0

All Articles