I hope this code helps. I am still guessing at exactly what you are trying to do, but I started with your code and decided to add some of the values in the pa and pb arrays together and store the results back in pa[0] and pb[0].
This code is written for a 64-bit machine, but converting it to 32-bit pointers should not be difficult. I marked the lines that need to be changed for 32-bit pointers with comments. I hope this answers your question about how to use function parameters that point to device memory:
#include <stdio.h>

// Demo operands: element 0 of each array receives the sum of elements 1 and 2.
// The casts make the conversion of the hex literals that do not fit in a
// signed int explicit; brace-initializing an int directly from them is a
// narrowing conversion.
__device__ int pa[3] = {0, static_cast<int>(0x927c0000u), 0x20000011};
__device__ int pb[3] = {0, static_cast<int>(0xbb900000u), 0x2000000b};

// Computes mpa[0] = mpa[1] + mpa[2] and mpb[0] = mpb[1] + mpb[2] using inline
// PTX, to show how pointer parameters are handed to PTX instructions.
//
// mpa, mpb: pointers to at least three 32-bit values each. The PTX uses
//           ld.global / st.global, so both must refer to global memory.
//
// The whole sequence is one asm statement so the compiler cannot schedule
// other code between the instructions that share the hand-declared registers:
//   * `volatile` keeps the statement from being deleted or moved,
//   * the "memory" clobber tells the compiler that memory is read and written
//     behind its back, so values cached before the call are reloaded,
//   * the enclosing { } gives the .reg declarations their own PTX scope, so
//     the names do not collide if this function is inlined more than once.
__device__ void add(int* mpa, int* mpb) {
    asm volatile(
        "{\n\t"
        ".reg .u64 s<2>;\n\t"                  // change to .u32 for 32-bit pointers
        ".reg .u32 r<6>;\n\t"
        // Pointer arguments arrive as generic addresses; convert them to
        // global-space addresses before using them with ld/st.global.
        "cvta.to.global.u64 s0, %0;\n\t"       // change to .u32 for 32-bit pointers
        "cvta.to.global.u64 s1, %1;\n\t"       // change to .u32 for 32-bit pointers
        "ld.global.u32 r0, [s0+4];\n\t"        // r0 = mpa[1]
        "ld.global.u32 r1, [s1+4];\n\t"        // r1 = mpb[1]
        "ld.global.u32 r2, [s0+8];\n\t"        // r2 = mpa[2]
        "ld.global.u32 r3, [s1+8];\n\t"        // r3 = mpb[2]
        "add.u32 r4, r0, r2;\n\t"
        "add.u32 r5, r1, r3;\n\t"
        "st.global.u32 [s0], r4;\n\t"          // mpa[0] = r4
        "st.global.u32 [s1], r5;\n\t"          // mpb[0] = r5
        "}"
        :
        : "l"(mpa), "l"(mpb)                   // change "l" to "r" for 32-bit pointers
        : "memory");
}

// Prints pa[0] and pb[0], runs add() on the two device arrays, then prints
// them again. main() launches this with a single thread (<<<1,1>>>); every
// thread of a larger launch would write the same two locations.
__global__ void mykernel() {
    printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]);
    add(pa, pb);
    printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]);
}

// Launches the demo kernel and reports any launch or execution error.
// Returns 0 on success, 1 if the CUDA runtime reported a failure.
int main() {
    mykernel<<<1, 1>>>();

    // A kernel launch returns no status itself; bad launch configurations
    // surface through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Faults inside the kernel are reported at the next synchronizing call;
    // the sync also flushes the device-side printf buffer.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
When I run this code, I get:
$ ./t128
pa[0] = 0, pb[0] = 0
pa[0] = b27c0011, pb[0] = db90000b
$
which, I believe, is the correct output.
I compiled it with
nvcc -O3 -arch=sm_20 -o t128 t128.cu