Repeat the vector several times using CUDA Thrust

Question

Repeat the vector several times using CUDA Thrust

I am trying to solve a problem using CUDA Thrust.

I have a host array with 3 elements. Is it possible, using Thrust, to create a device array of 384 elements in which the 3 elements of my host array are repeated 128 times (128 x 3 = 384)?

More generally, starting from an array of 3 elements, how can I use Thrust to generate a device array of size X, where X = Y x 3 and Y is the number of repetitions?

+3

cuda thrust

fahad Jun 03 '13 at 15:52

source share

3 answers

Robert Crovella has already answered this question using strided ranges. He also pointed out the possibility of using the expand operator.

Below I provide a worked example using the expand operator. In contrast to the strided-range approach, it avoids the need for loops.

 #include <thrust/advance.h>
 #include <thrust/device_vector.h>
 #include <thrust/distance.h>
 #include <thrust/functional.h>
 #include <thrust/gather.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/permutation_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
 #include <thrust/scan.h>
 #include <thrust/scatter.h>
 #include <thrust/sequence.h>

 #include <stdio.h>
 #include <iostream>   // std::cout is used in main()

 using namespace thrust::placeholders;

 /*************************************/
 /* CONVERT LINEAR INDEX TO ROW INDEX */
 /*************************************/
 // Maps a linear index i to i / Ncols.
 // Not used by main() below; kept as part of the sample.
 template <typename T>
 struct linear_index_to_row_index : public thrust::unary_function<T,T> {

     T Ncols; // --- Number of columns

     __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

     __host__ __device__ T operator()(T i) { return i / Ncols; }
 };

 /*******************/
 /* EXPAND OPERATOR */
 /*******************/
 // Writes first2[k] to the output count[k] times, for every k in [first1, last1),
 // in order of increasing k.
 //
 //   first1, last1 : range of repetition counts
 //   first2        : values to replicate (one per count)
 //   output        : destination; receives sum(counts) elements
 //
 // Returns the output iterator advanced past the last element written.
 // The values read through first2 must not overlap the destination range
 // (this is a precondition of thrust::gather).
 template <typename InputIterator1, typename InputIterator2, typename OutputIterator>
 OutputIterator expand(InputIterator1 first1, InputIterator1 last1,
                       InputIterator2 first2, OutputIterator output)
 {
     typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;

     difference_type input_size  = thrust::distance(first1, last1);
     difference_type output_size = thrust::reduce(first1, last1);

     // scan the counts to obtain output offsets for each input element
     thrust::device_vector<difference_type> output_offsets(input_size, 0);
     thrust::exclusive_scan(first1, last1, output_offsets.begin());

     // scatter the nonzero counts into their corresponding output positions
     thrust::device_vector<difference_type> output_indices(output_size, 0);
     thrust::scatter_if(thrust::counting_iterator<difference_type>(0),
                        thrust::counting_iterator<difference_type>(input_size),
                        output_offsets.begin(),
                        first1,
                        output_indices.begin());

     // compute max-scan over the output indices, filling in the holes
     thrust::inclusive_scan(output_indices.begin(), output_indices.end(),
                            output_indices.begin(),
                            thrust::maximum<difference_type>());

     // gather input values according to index array (output = first2[output_indices])
     thrust::gather(output_indices.begin(), output_indices.end(), first2, output);

     // return output + output_size
     thrust::advance(output, output_size);
     return output;
 }

 /**************************/
 /* STRIDED RANGE OPERATOR */
 /**************************/
 // View over every stride-th element of [first, last), starting at first.
 // Not used by main() below; kept as part of the sample.
 template <typename Iterator>
 class strided_range
 {
     public:

     typedef typename thrust::iterator_difference<Iterator>::type difference_type;

     // Maps a position i in the strided view to the offset stride * i in the underlying range.
     struct stride_functor : public thrust::unary_function<difference_type,difference_type>
     {
         difference_type stride;

         stride_functor(difference_type stride) : stride(stride) {}

         __host__ __device__
         difference_type operator()(const difference_type& i) const
         {
             return stride * i;
         }
     };

     typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
     typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
     typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

     // type of the strided_range iterator
     typedef PermutationIterator iterator;

     // construct strided_range for the range [first,last)
     strided_range(Iterator first, Iterator last, difference_type stride)
         : first(first), last(last), stride(stride) {}

     iterator begin(void) const
     {
         return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
     }

     // Number of elements in the view is ceil((last - first) / stride).
     iterator end(void) const
     {
         return begin() + ((last - first) + (stride - 1)) / stride;
     }

     protected:
     Iterator first;
     Iterator last;
     difference_type stride;
 };

 /********/
 /* MAIN */
 /********/
 int main(){

     /**************************/
     /* SETTING UP THE PROBLEM */
     /**************************/

     const int Nrows = 10; // --- Number of objects
     const int Ncols = 3;  // --- Number of centroids

     // Values to replicate: 0, 1, ..., Ncols - 1. They live in their own vector so that
     // the data gathered from does not overlap the destination written by expand().
     thrust::device_vector<int> d_values(Ncols);
     thrust::sequence(d_values.begin(), d_values.end());

     thrust::device_vector<int> d_sequence(Nrows * Ncols);

     // Every value is repeated Nrows times.
     thrust::device_vector<int> d_counts(Ncols, Nrows);

     // expand() emits each value Nrows times consecutively; the permutation iterator sends
     // the k-th emitted element to position (k % Nrows) * Ncols + k / Nrows, so that
     // d_sequence ends up as Nrows consecutive copies of the Ncols values.
     expand(d_counts.begin(), d_counts.end(), d_values.begin(),
            thrust::make_permutation_iterator(
                d_sequence.begin(),
                thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                                                (_1 % Nrows) * Ncols + _1 / Nrows)));

     printf("\n\nCentroid indices\n");
     for(int i = 0; i < Nrows; i++) {
         std::cout << " [ ";
         for(int j = 0; j < Ncols; j++)
             std::cout << d_sequence[i * Ncols + j] << " ";
         std::cout << "]\n";
     }

     return 0;
 }

+1

Jackolantern May 28 '15 at 5:46

source share

As a seemingly simpler alternative to using CUDA Thrust, I am posting below a worked example that implements the classic Matlab meshgrid function in CUDA.

In matlab

 x = [1 2 3]; y = [4 5 6 7]; [X, Y] = meshgrid(x, y);

produces

 X =
      1 2 3
      1 2 3
      1 2 3
      1 2 3

and

 Y =
      4 4 4
      5 5 5
      6 6 6
      7 7 7

X is exactly the four-fold replication of the array x, which is what the OP asked for and what Robert Crovella's first approach produces, while Y is the three-fold consecutive replication of each element of the array y, which is the second interpretation considered in Robert Crovella's answer.

Here is the code:

 #include <cstdio> #include <thrust/pair.h> #include "Utilities.cuh" #define BLOCKSIZE_MESHGRID_X 16 #define BLOCKSIZE_MESHGRID_Y 16 #define DEBUG /*******************/ /* MESHGRID KERNEL */ /*******************/ template <class T> __global__ void meshgrid_kernel(const T * __restrict__ x, size_t Nx, const float * __restrict__ y, size_t Ny, T * __restrict__ X, T * __restrict__ Y) { unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int tidy = blockIdx.y * blockDim.y + threadIdx.y; if ((tidx < Nx) && (tidy < Ny)) { X[tidy * Nx + tidx] = x[tidx]; Y[tidy * Nx + tidx] = y[tidy]; } } /************/ /* MESHGRID */ /************/ template <class T> thrust::pair<T *,T *> meshgrid(const T *x, const unsigned int Nx, const T *y, const unsigned int Ny) { T *X; gpuErrchk(cudaMalloc((void**)&X, Nx * Ny * sizeof(T))); T *Y; gpuErrchk(cudaMalloc((void**)&Y, Nx * Ny * sizeof(T))); dim3 BlockSize(BLOCKSIZE_MESHGRID_X, BLOCKSIZE_MESHGRID_Y); dim3 GridSize (iDivUp(Nx, BLOCKSIZE_MESHGRID_X), iDivUp(BLOCKSIZE_MESHGRID_Y, BLOCKSIZE_MESHGRID_Y)); meshgrid_kernel<<<GridSize, BlockSize>>>(x, Nx, y, Ny, X, Y); #ifdef DEBUG gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); #endif return thrust::make_pair(X, Y); } /********/ /* MAIN */ /********/ int main() { const int Nx = 3; const int Ny = 4; float *h_x = (float *)malloc(Nx * sizeof(float)); float *h_y = (float *)malloc(Ny * sizeof(float)); float *h_X = (float *)malloc(Nx * Ny * sizeof(float)); float *h_Y = (float *)malloc(Nx * Ny * sizeof(float)); for (int i = 0; i < Nx; i++) h_x[i] = i; for (int i = 0; i < Ny; i++) h_y[i] = i + 4.f; float *d_x; gpuErrchk(cudaMalloc(&d_x, Nx * sizeof(float))); float *d_y; gpuErrchk(cudaMalloc(&d_y, Ny * sizeof(float))); gpuErrchk(cudaMemcpy(d_x, h_x, Nx * sizeof(float), cudaMemcpyHostToDevice)); gpuErrchk(cudaMemcpy(d_y, h_y, Ny * sizeof(float), cudaMemcpyHostToDevice)); thrust::pair<float *, float *> meshgrid_pointers = meshgrid(d_x, Nx, d_y, Ny); float *d_X = (float 
*)meshgrid_pointers.first; float *d_Y = (float *)meshgrid_pointers.second; gpuErrchk(cudaMemcpy(h_X, d_X, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost)); gpuErrchk(cudaMemcpy(h_Y, d_Y, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost)); for (int j = 0; j < Ny; j++) { for (int i = 0; i < Nx; i++) { printf("i = %i; j = %i; x = %f; y = %f\n", i, j, h_X[j * Nx + i], h_Y[j * Nx + i]); } } return 0; }

+1

Jackolantern Sep 08 '15 at 7:12

source share

Robert Crovella · Accepted Answer · 2013-06-03T16:22:41+0000

One possible approach:

1. create a device vector of the appropriate size
2. create 3 strided ranges, one for each position of the elements {1, 2, 3} in the final output (device) vector
3. use thrust::fill to fill each of the 3 strided ranges with the corresponding (host vector) element {1, 2, 3}

This code is a trivial modification of the strided range example, for demonstration. You can change the definition of REPS to 128 to see the full expansion to 384 output elements:

 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/permutation_iterator.h>
 #include <thrust/functional.h>
 #include <thrust/fill.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>

 // for printing
 #include <thrust/copy.h>
 #include <iostream>   // std::cout
 #include <iterator>   // std::ostream_iterator
 #include <ostream>

 #define STRIDE 3
 #define REPS 15  // change to 128 if you like
 #define DSIZE (STRIDE*REPS)

 // this example illustrates how to make strided access to a range of values
 // examples:
 //   strided_range([0, 1, 2, 3, 4, 5, 6], 1) -> [0, 1, 2, 3, 4, 5, 6]
 //   strided_range([0, 1, 2, 3, 4, 5, 6], 2) -> [0, 2, 4, 6]
 //   strided_range([0, 1, 2, 3, 4, 5, 6], 3) -> [0, 3, 6]
 //   ...
 template <typename Iterator>
 class strided_range
 {
     public:

     typedef typename thrust::iterator_difference<Iterator>::type difference_type;

     // Maps a position i in the strided view to the offset stride * i in the underlying range.
     struct stride_functor : public thrust::unary_function<difference_type,difference_type>
     {
         difference_type stride;

         stride_functor(difference_type stride) : stride(stride) {}

         __host__ __device__
         difference_type operator()(const difference_type& i) const
         {
             return stride * i;
         }
     };

     typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
     typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
     typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

     // type of the strided_range iterator
     typedef PermutationIterator iterator;

     // construct strided_range for the range [first,last)
     strided_range(Iterator first, Iterator last, difference_type stride)
         : first(first), last(last), stride(stride) {}

     iterator begin(void) const
     {
         return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
     }

     // Number of elements in the view is ceil((last - first) / stride).
     iterator end(void) const
     {
         return begin() + ((last - first) + (stride - 1)) / stride;
     }

     protected:
     Iterator first;
     Iterator last;
     difference_type stride;
 };

 // Builds a device vector of DSIZE elements holding REPS consecutive copies of the
 // STRIDE-element host vector {1, 2, ..., STRIDE}, then prints it.
 int main(void)
 {
     // Host pattern to repeat: 1, 2, ..., STRIDE.
     thrust::host_vector<int> h_data(STRIDE);
     for (int i = 0; i < STRIDE; ++i)
         h_data[i] = i + 1;

     thrust::device_vector<int> data(DSIZE);

     typedef thrust::device_vector<int>::iterator Iterator;

     // One strided range per position of the pattern: the range starting at offset pos
     // visits elements pos, pos + STRIDE, pos + 2*STRIDE, ... and is filled with
     // h_data[pos]. Looping over the positions keeps the code valid if STRIDE changes.
     for (int pos = 0; pos < STRIDE; ++pos) {
         strided_range<Iterator> range(data.begin() + pos, data.end(), STRIDE);
         thrust::fill(range.begin(), range.end(), h_data[pos]);
     }

     // print the generated data
     std::cout << "data: ";
     thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, " "));
     std::cout << std::endl;

     return 0;
 }

Repeat the vector several times using CUDA Thrust

More articles: