Global loading / saving of memory and sharing of global memory

I have the following simple code:

#include<stdio.h>

#define BLOCKSIZE_X 32
#define BLOCKSIZE_Y 1

int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void kernel0(float *d_a, float *d_b, const unsigned int M, const unsigned int N)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((tidx < M)&&(tidy < N)) {

        d_b[tidy * M + tidx] = d_a[tidy * M + tidx];

    }

}

void main()
{
    const unsigned int M = 32;
    const unsigned int N = 1;

    float *d_a; cudaMalloc((void**)&d_a, M*N*sizeof(float));
    float *d_b; cudaMalloc((void**)&d_b, M*N*sizeof(float));

    dim3 dimGrid(iDivUp(M, BLOCKSIZE_X), iDivUp(N, BLOCKSIZE_Y));
    dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y);

    kernel0<<<dimGrid, dimBlock>>>(d_a, d_b, M, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaDeviceReset();

}

which performs the assignment between two arrays 32 floats. I am trying to understand the relationship between global collaborations for accessing global memory and the efficiency of loading / storing global memory, as well as other metrics / events.

Visual Profiler shows the following indicators:

Global Memory Load Efficiency = 50%
Global Memory Store Efficiency = 100%

The value of Global Memory Load Efficiency surprises me. I would expect performance 100%in both cases, since I believe that I am performing perfectly unified memory access. So my question is:

50% - , , 100% - ?

/, :

gld_inst_32bit = 32 (Number of 32-bit global memory load transactions)
gst_inst_32bit = 32 (Number of 32-bit global memory store transactions)

, / 32 float s.

uncached global load transaction = 0 (Number of uncached global load transactions)
l1 global load miss = 2 (Number of global load misses in L1 cache)

, (, ) . l1 , 0.

gld_request = 1 (Number of executed global load instructions per warp in a SM)
gst_request = 1 (Number of executed global store instructions per warp in a SM)

, , , .

:

/*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
/*0008*/         S2R R3, SR_CTAID.Y;                             /* 0x2c0000009800dc04 */
/*0010*/         S2R R4, SR_TID.Y;                               /* 0x2c00000088011c04 */
/*0018*/         IMAD R4, R3, c[0x0][0xc], R4;                   /* 0x2008400030311ca3 */
/*0020*/         S2R R0, SR_CTAID.X;                             /* 0x2c00000094001c04 */
/*0028*/         ISETP.LT.U32.AND P0, PT, R4, c[0x0][0x2c], PT;  /* 0x188e4000b041dc03 */
/*0030*/         S2R R2, SR_TID.X;                               /* 0x2c00000084009c04 */
/*0038*/         IMAD R0, R0, c[0x0][0x8], R2;                   /* 0x2004400020001ca3 */
/*0040*/         ISETP.LT.U32.AND P0, PT, R0, c[0x0][0x28], P0;  /* 0x18804000a001dc03 */
/*0048*/    @!P0 BRA.U 0x78;                                     /* 0x40000000a000a1e7 */
/*0050*/     @P0 IMAD R2, R4, c[0x0][0x28], R0;                  /* 0x20004000a04080a3 */
/*0058*/     @P0 ISCADD R0, R2, c[0x0][0x20], 0x2;               /* 0x4000400080200043 */
/*0060*/     @P0 ISCADD R2, R2, c[0x0][0x24], 0x2;               /* 0x4000400090208043 */
/*0068*/     @P0 LD R0, [R0];                                    /* 0x8000000000000085 */
/*0070*/     @P0 ST [R2], R0;                                    /* 0x9000000000200085 */
/*0078*/         EXIT;                                           /* 0x8000000000001de7 */

: CUDA 6.5, GeForce GT540M, Windows 7.

M 32 64, , 100%, /:

gld_inst_32bit = 64 
gst_inst_32bit = 64 

uncached global load transaction = 0 
l1 global load miss = 2 

gld_request = 2
gst_request = 2

gld_inst_32bit, gst_inst_32bit, gld_request gst_request , 64 float 2 / . , uncached global load transaction l1 global load miss , , 100%.

K20c M=32:

Global Memory Load Efficiency = 100%
Global Memory Store Efficiency = 100%
gld_inst_32bit = 64 
gst_inst_32bit = 64 
gld_request = 1
gst_request = 1
uncached global load transaction = 1
l1 global load miss = 0
l1 global load hit = 0

Visual Profiler , l1.

, M BLOCKSIZE_X.

, GT540M , , 100%, 100% . 100%, .

l1, -Xptxas -dlcm=cg, @Jez, 100%, . , l1, L2.

, M,

M=32

enter image description here

enter image description here

M=64

enter image description here

enter image description here

M=96

enter image description here

enter image description here

M=128

enter image description here

M=160

enter image description here

M=192

enter image description here

, M , 32, .

l1, :

M=32

enter image description here

M=64

enter image description here

M=96

<411 >

- TESLA C2050

M = 32    33.3%
M = 64    28.6%
M = 96    42.9%
M = 128   57.1%
M = 160   71.4%
M = 192   85.7%
M = 224  100%
M = 256  114%
M = 288   90%

, l1, 100% .

+4
1

NVIDIA , . , ​​ . , , , .

Fermi HWPM, L1 , 1 L1 GPC. GF100 (C2050) 25% .

, , .

HWPM L1 L1, L2, .

Maxwell , , L1/TEX.

CACHED VS. Uncached

Fermi / L1. / L1, LSU , . Atomics - L1, L1.

Kepler . , . GK110 GK208 LDG TEX.

SM L1 128 .

L1 L2 4 32B-.

L1 L2 32B.

+4

All Articles