Inline ptx load function parameters

I have the following function with built-in build that works fine in debug mode in 32-bit Visual Studio 2008:

__device__ void add(int* pa, int* pb) { asm(".reg .u32 s<3>;"::); asm(".reg .u32 r<14>;"::); asm("ld.global.b32 s0, [%0];"::"r"(&pa)); //load addresses of pa, pb printf(...); asm("ld.global.b32 s1, [%0];"::"r"(&pb)); printf(...); asm("ld.global.b32 r1, [s0+8];"::); printf(...); asm("ld.global.b32 r2, [s1+8];"::); printf(...); ...// perform some operations } 

pa and pb are distributed globally on the device, e.g.

 __device__ int pa[3] = {0, 0x927c0000, 0x20000011}; __device__ int pb[3] = {0, 0xbb900000, 0x2000000b}; 

However, this code crashes in release mode, on the line asm("ld.global.b32 r1, [s0+8];"::); How to load function parameters using the built-in ptx in release mode?

PS creating a release mode with the -G flag (generates GPU debugging information) makes the code work correctly in release mode. Thanks,

+2
source share
1 answer

Hope this code helps. I still guess what you are trying to do exactly, but I started with your code and decided to add some values โ€‹โ€‹to the pa and pb arrays and save them back to pa[0] and pb[0] .

This code is written for a 64-bit machine, but converting it to 32-bit pointers should not be difficult. I marked the lines that need to be changed for 32-bit pointers with comment. I hope this answers your question about how to use the functional parameters pointing to the deviceโ€™s memory:

 #include <stdio.h> __device__ int pa[3] = {0, 0x927c0000, 0x20000011}; __device__ int pb[3] = {0, 0xbb900000, 0x2000000b}; __device__ void add(int* mpa, int* mpb) { asm(".reg .u64 s<2>;"::); // change to .u32 for 32 bit pointers asm(".reg .u32 r<6>;"::); asm("mov.u64 s0, %0;"::"l"(mpa)); //change to .u32 and "r" for 32 bit asm("mov.u64 s1, %0;"::"l"(mpb)); //change to .u32 and "r" for 32 bit asm("ld.global.u32 r0, [s0+4];"::); asm("ld.global.u32 r1, [s1+4];"::); asm("ld.global.u32 r2, [s0+8];"::); asm("ld.global.u32 r3, [s1+8];"::); asm("add.u32 r4, r0, r2;"::); asm("add.u32 r5, r1, r3;"::); asm("st.global.u32 [s0], r4;"::); asm("st.global.u32 [s1], r5;"::); } __global__ void mykernel(){ printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]); add(pa, pb); printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]); } int main() { mykernel<<<1,1>>>(); cudaDeviceSynchronize(); return 0; } 

When I run this code, I get:

 $ ./t128 pa[0] = 0, pb[0] = 0 pa[0] = b27c0011, pb[0] = db90000b $ 

which, I believe, is the right way out.

I compiled it with

 nvcc -O3 -arch=sm_20 -o t128 t128.cu 
+1
source

All Articles