Strange behavior when mixing openMP with openMPI

I have code that is parallelized using OpenMP (over a for loop). I now want to repeat that work several times and use MPI to submit to a cluster of machines, keeping all of the intra-node work in OpenMP.

When I use only OpenMP, I get the speed-up I expect (using twice the number of processors/cores takes half the time). When I add MPI and submit to only one MPI process, I do not get this speed-up. I created a toy problem to check this and it shows the same issue. Here is the code:

#include <iostream>
#include <stdio.h>
#include <unistd.h>
#include "mpi.h"

#include <omp.h>


int main(int argc, char *argv[]) {
    int iam=0, np = 1;
    long i;
    int numprocs, rank, namelen;
    char processor_name[MPI_MAX_PROCESSOR_NAME];

    double t1 = MPI_Wtime();
    std::cout << "!!!Hello World!!!" << std::endl; // prints !!!Hello World!!!

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(processor_name, &namelen);

    int nThread = omp_get_num_procs();//omp_get_num_threads here returns 1??
    printf("nThread = %d\n", nThread);

    int *total = new int[nThread];
    for (int j=0;j<nThread;j++) {
        total[j]=0;
    }
#pragma omp parallel num_threads(nThread) default(shared) private(iam, i)
    {
        np = omp_get_num_threads();

#pragma omp for schedule(dynamic, 1)
        for (i=0; i<10000000; i++) {
            iam = omp_get_thread_num();
            total[iam]++;
        }
        printf("Hello from thread %d out of %d from process %d out of %d on %s\n",
                iam, np, rank, numprocs,processor_name);
    }

    int grandTotal=0;
    for (int j=0;j<nThread;j++) {
        printf("Total=%d\n",total[j]);
        grandTotal += total[j];
    }
    printf("GrandTotal= %d\n", grandTotal);

    MPI_Finalize();

    double t2 = MPI_Wtime();

    printf("time elapsed with MPI clock=%f\n", t2-t1);
    return 0;
}

I am compiling with openmpi-1.8/bin/mpic++ using the -fopenmp flag. Here is my PBS script:

#PBS -l select=1:ncpus=12

setenv OMP_NUM_THREADS 12

/util/mpi/openmpi-1.8/bin/mpirun -np 1 -hostfile $PBS_NODEFILE --map-by node:pe=$OMP_NUM_THREADS /workspace/HelloWorldMPI/HelloWorldMPI

I have also tried with #PBS -l nodes=1:ppn=12 and get the same results.

When I run this, adding cores does not make it any faster (!). Whatever I set ncpus and OMP_NUM_THREADS to, the wall time barely changes (the 10^7 loop iterations take roughly the same 10 seconds either way). The printf output shows the expected number of threads is being launched, and while it runs top shows all of the cores (matching ncpus) at close to 100%. If I submit with -np 2, both MPI processes behave the same way, so MPI itself seems to be working as expected; it is the OpenMP speed-up that is missing.

I am out of ideas for what else to try. What am I doing wrong?


I hate to say it, but there is quite a lot wrong here, and you really should spend some more time getting comfortable with OpenMP and MPI on their own first. That said, I'll go through your code and try to point out the problems I see.

double t1 = MPI_Wtime();

Starting off: calling MPI_Wtime() before MPI_Init() is undefined. I'll also add that when you want to do this kind of timing with MPI, a good idea is to put an MPI_Barrier() right before the Wtime call, so that all the tasks enter the timed section at the same moment.
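
A minimal sketch of that timing pattern (the work between the barriers is just a placeholder for whatever you actually want to measure):

#include <cstdio>
#include <mpi.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    MPI_Barrier(MPI_COMM_WORLD);    // line all tasks up before starting the clock
    double t_start = MPI_Wtime();

    // ... the work you actually want to time goes here ...

    MPI_Barrier(MPI_COMM_WORLD);    // wait for the slowest task before stopping
    double t_end = MPI_Wtime();
    printf("elapsed: %f s\n", t_end - t_start);

    MPI_Finalize();                 // only after the last call to MPI_Wtime()
    return 0;
}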

//omp_get_num_threads here returns 1??

The reason omp_get_num_threads() returns 1 here is that at this point you are not inside a parallel region.
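
A small sketch to see the difference; it assumes OMP_NUM_THREADS is set in the environment and deliberately omits the num_threads clause:

#include <cstdio>
#include <omp.h>

int main()
{
    int outside = omp_get_num_threads();     // always 1: we are still in the sequential part

    #pragma omp parallel
    {
        #pragma omp master
        printf("outside: %d, inside: %d\n",  // "inside" is the real team size, taken
               outside,                      // from OMP_NUM_THREADS because no
               omp_get_num_threads());       // num_threads clause is given
    }
    return 0;
}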

#pragma omp parallel num_threads(nThread)

You set num_threads to nThread, which, since nThread comes from omp_get_num_procs(), effectively ignores whatever you request through the OMP_NUM_THREADS environment variable. For a simplified problem like this you can usually just leave the num_threads clause out and let OMP_NUM_THREADS decide, as in the small example above.

default(shared)

Variables are shared by default inside a parallel region, so default(shared) does nothing here.

private(iam, i)

This may just be your coding style, but instead of marking iam and i as private you could simply declare them inside the parallel region, which makes them private automatically (and since you don't really use either of them outside of it, there is little reason not to).
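
For example, the loop above could look like this with the variables declared where they are used (a sketch reusing the total array from your code):

#pragma omp parallel                        // no private(...) clause needed
{
    int iam = omp_get_thread_num();         // block-local, so private automatically

    #pragma omp for
    for (long i = 0; i < 10000000; i++)     // the loop variable of an omp for is private anyway
        total[iam]++;
}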

#pragma omp for schedule(dynamic, 1)

Also, schedule(dynamic, 1) is a poor fit for this particular loop: every iteration does a trivial, identical amount of work, so handing the iterations out one at a time adds far more scheduling overhead than the work it distributes.
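
To make that concrete, here is the same loop sketched with the two schedules; for work this cheap the static version is the sensible choice:

// schedule(static): each thread gets one contiguous block of iterations up
// front, so there is essentially no scheduling overhead
#pragma omp for schedule(static)
for (i = 0; i < 10000000; i++)
    total[iam]++;

// schedule(dynamic, 1) instead hands out iterations one at a time through
// shared bookkeeping, which costs far more here than the trivial loop body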

int grandTotal=0;
for (int j=0;j<nThread;j++) {
    printf("Total=%d\n",total[j]);
    grandTotal += total[j];
}

There is nothing wrong with this block as such, but you could have summed total with an OpenMP reduction instead of keeping a per-thread array and adding it up by hand.
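
A sketch of the same count done with a reduction instead of the per-thread array:

int grandTotal = 0;

#pragma omp parallel for reduction(+:grandTotal)
for (long i = 0; i < 10000000; i++)
    grandTotal++;                           // each thread increments its own private copy

printf("GrandTotal= %d\n", grandTotal);     // the private copies are summed together at the end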

double t2 = MPI_Wtime();

Just as with MPI_Init(), calling MPI_Wtime() after MPI_Finalize() is undefined, so the final timing call should come before finalization, not after it.

Note: if you are at all familiar with OpenMP, a good OpenMP reference covers essentially everything I have pointed out about it here.

With that out of the way, notice that you never actually do anything with MPI beyond printing the rank and the communicator size. Every MPI task performs the same fixed amount of work no matter how many tasks there are, and since the work per task never shrinks, why would you expect any speed-up from adding tasks? (Note: this is what is called weak scaling, and because there is no MPI communication here at all, there is no reason for it not to scale perfectly.)

Here is the code rewritten with those changes, plus an MPI reduction so the tasks actually combine their results:

#include <iostream>
#include <cstdio>
#include <cstdlib>

#include <mpi.h>
#include <omp.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);

    int world_size,
        world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int name_len;
    char proc_name[MPI_MAX_PROCESSOR_NAME];
    MPI_Get_processor_name(proc_name, &name_len);

    MPI_Barrier(MPI_COMM_WORLD);
    double t_start = MPI_Wtime();

    // we need to scale the work per task by the number of MPI tasks,
    // otherwise we actually do more total work the more tasks we have
    const int n_iterations = 1e7 / world_size;

    // actually we also need some dummy data to add so the compiler doesn't just
    // optimize out the work loop with -O3 on
    int data[16];
    for (int i = 0; i < 16; ++i)
        data[i] = rand() % 16;

    // reduction(+:total) means that all threads will make a private
    // copy of total at the beginning of this construct and then
    // do a reduction operation with the + operator at the end (aka sum them
    // all together)
    unsigned int total = 0;
    #pragma omp parallel reduction(+:total)
    {
        // both of these calls will execute properly since we
        // are in an omp parallel region
        int n_threads = omp_get_num_threads(),
            thread_id = omp_get_thread_num();

        // note: this code will only execute on a single thread (per mpi task)
        #pragma omp master
        {
            printf("nThread = %d\n", n_threads);
        }

        #pragma omp for
        for (int i = 0; i < n_iterations; i++)
            total += data[i % 16];

        printf("Hello from thread %d out of %d from process %d out of %d on %s\n",
                thread_id, n_threads, world_rank, world_size, proc_name);
    }

    // do a reduction with MPI, otherwise the data we just calculated is useless
    unsigned int grand_total;
    MPI_Allreduce(&total, &grand_total, 1, MPI_UNSIGNED, MPI_SUM, MPI_COMM_WORLD);

    // another barrier to make sure we wait for the slowest task
    MPI_Barrier(MPI_COMM_WORLD);
    double t_end = MPI_Wtime();

    // output each MPI task's local total (already reduced across its OpenMP threads)
    printf("Thread total = %d\n", total);

    // output results from a single thread
    if (world_rank == 0)
    {
        printf("Grand Total = %d\n", grand_total);
        printf("Time elapsed with MPI clock = %f\n", t_end - t_start);
    }

    MPI_Finalize();
    return 0;
}

One more note: for what it's worth, I also tried this example with schedule(dynamic, 1) added back onto the loop, and it ran roughly 22 times slower, which is why it does not appear in the rewrite.

Since I don't have a PBS cluster to test on, only one running SLURM, here is the sbatch file I used to run this on a cluster with 6-core nodes:

#!/bin/bash
#SBATCH --job-name=TestOrSomething
#SBATCH --export=ALL
#SBATCH --partition=debug
#SBATCH --nodes=3
#SBATCH --ntasks-per-socket=1

# use 6 OpenMP threads per MPI task
export OMP_NUM_THREADS=6

# note that this will end up running 3 * (however many cpus 
#   are on a single node) mpi tasks, not just 3. Additionally
#   the below line might use `mpirun` instead depending on the
#   cluster
srun ./a.out

Finally, here is the scaling I measured for the example over MPI tasks and OMP threads (plot below):

[Plot: scaling (time) for the example code; it's basically perfect.]

To read the plot: the points from 1 to 16 cores are 1 MPI task with 1 to 16 OMP threads, and the points from 16 to 256 cores are 1 to 16 MPI tasks with 16 threads each, so the scaling holds across both the MPI and the OMP dimension.

