radix , , , . , .
. , 4 . 1, 4, 7 14. , :
Element
Value: 7 14 4 1
Binary: 0111 1110 0100 0001
0:
Element # 1 2 3 4
Value: 7 14 4 1
Binary: 0111 1110 0100 0001
bit 0: 1 0 0 1
, , ( 0) , . , . :
Element # 2 3 1 4
Value: 14 4 7 1
Binary: 1110 0100 0111 0001
bit 0: 0 0 1 1
radix . - () :
Element # 3 2 1 4
Value: 4 14 7 1
Binary: 0100 1110 0111 0001
bit 1: 0 1 1 0
, ( 1) :
Element # 3 4 2 1
Value: 4 1 14 7
Binary: 0100 0001 1110 0111
bit 1: 0 0 1 1
:
Element # 3 4 2 1
Value: 4 1 14 7
Binary: 0100 0001 1110 0111
bit 2: 1 0 1 1
:
Element # 4 3 2 1
Value: 1 4 14 7
Binary: 0001 0100 1110 0111
bit 2: 0 1 1 1
( ):
Element # 4 3 2 1
Value: 1 4 14 7
Binary: 0001 0100 1110 0111
bit 3: 0 0 1 0
:
Element # 4 3 1 2
Value: 1 4 7 14
Binary: 0001 0100 0111 1110
bit 3: 0 0 0 1
. , , , , " , ?" " , ?" , , , . 0:
Element # 1 2 3 4
Value: 7 14 4 1
Binary: 0111 1110 0100 0001
bit 0: 1 0 0 1
0 1 0:
bit 0: 1 0 0 1
zero bits one bits
--------- --------
1)histogram: 2 2
:
zero bits one bits
--------- --------
1)histogram: 2 2
2)prefix sum: 0 2
- - . , 2 ( 0 0). , , 0 , - :
bit 0: 1 0 0 1
3)offset: 0 0 1 1
- , 0 1 , 1:
0 bit 0: 1 1
3)ex. psum: 0 1
1 bit 0: 1 1
3)ex. psum: 0 1
, 4 :
4) 2 3,
, -, (0 1), , , :
Element # 1 2 3 4
Value: 7 14 4 1
Binary: 0111 1110 0100 0001
bit 0: 1 0 0 1
hist psum: 2 0 0 2
offset: 0 0 1 1
new index: 2 0 1 3
" ":
Element
Value: 14 4 7 1
Binary: 0111 1110 0111 0001
, , . 1, ( ) ; , .
:
- Radix-, , . , , 2,3 4 .
- , radix, - , . , 32- , , 1023 (2 ^ 10-1), 32 . , , 10 .
- ? . - , , - . radix , , .
. radix. , , 32- warp, warp, .. 32 . , , , , CUDA. , , .., .
#include <stdio.h>
#include <stdlib.h>
#define WSIZE 32
#define LOOPS 100000
#define UPPER_BIT 31
#define LOWER_BIT 0
__device__ unsigned int ddata[WSIZE];
__global__ void mykernel(){
__shared__ volatile unsigned int sdata[WSIZE*2];
sdata[threadIdx.x] = ddata[threadIdx.x];
unsigned int bitmask = 1<<LOWER_BIT;
unsigned int offset = 0;
unsigned int thrmask = 0xFFFFFFFFU << threadIdx.x;
unsigned int mypos;
for (int i = LOWER_BIT; i <= UPPER_BIT; i++){
unsigned int mydata = sdata[((WSIZE-1)-threadIdx.x)+offset];
unsigned int mybit = mydata&bitmask;
unsigned int ones = __ballot(mybit);
unsigned int zeroes = ~ones;
offset ^= WSIZE;
if (!mybit)
mypos = __popc(zeroes&thrmask);
else
mypos = __popc(zeroes)+__popc(ones&thrmask);
sdata[mypos-1+offset] = mydata;
bitmask <<= 1;
}
ddata[threadIdx.x] = sdata[threadIdx.x+offset];
}
int main(){
unsigned int hdata[WSIZE];
for (int lcount = 0; lcount < LOOPS; lcount++){
unsigned int range = 1U<<UPPER_BIT;
for (int i = 0; i < WSIZE; i++) hdata[i] = rand()%range;
cudaMemcpyToSymbol(ddata, hdata, WSIZE*sizeof(unsigned int));
mykernel<<<1, WSIZE>>>();
cudaMemcpyFromSymbol(hdata, ddata, WSIZE*sizeof(unsigned int));
for (int i = 0; i < WSIZE-1; i++) if (hdata[i] > hdata[i+1]) {printf("sort error at loop %d, hdata[%d] = %d, hdata[%d] = %d\n", lcount,i, hdata[i],i+1, hdata[i+1]); return 1;}
}
printf("Success!\n");
return 0;
}