The original traction example that you linked performed the sum of the row in the base dataset that had a row store. Your question basically is how to do the same when the underlying storage is the main column.
We can use essentially the same method, but we must use permutation iterators to convert the main column store to a row store with a large share of on-the-fly.
For this, we can characterize the functor that I described here .
Here is a fully processed example:
$ cat t466.cu #include <thrust/host_vector.h> #include <thrust/device_vector.h> #include <thrust/reduce.h> #include <thrust/functional.h> #include <thrust/sequence.h> #include <thrust/iterator/transform_iterator.h> #include <thrust/iterator/permutation_iterator.h> #include <thrust/iterator/counting_iterator.h> #include <iostream> #define COLS 3 #define ROWS 3 #define DSIZE (COLS*ROWS) #define INIT 10 #define STEP 10 // convert a linear index to a row index template <typename T> struct linear_index_to_row_index : public thrust::unary_function<T,T> { TC; // number of columns __host__ __device__ linear_index_to_row_index(TC) : C(C) {} __host__ __device__ T operator()(T i) { return i % C; } }; struct rm2cm_idx_functor : public thrust::unary_function<int, int> { int r; int c; rm2cm_idx_functor(int _r, int _c) : r(_r), c(_c) {}; __host__ __device__ int operator() (int idx) { unsigned my_r = idx/c; unsigned my_c = idx%c; return (my_c * r) + my_r; } }; int main(void) { int C = COLS; // number of columns int R = ROWS; // number of rows thrust::host_vector<int> h_vals(DSIZE); // initialize data thrust::sequence(h_vals.begin(), h_vals.end(), INIT, STEP); thrust::device_vector<int> vals = h_vals; std::cout << " Initial data: " << std::endl; thrust::copy(h_vals.begin(), h_vals.end(), std::ostream_iterator<int>(std::cout, ",")); std::cout << std::endl; // allocate storage for row sums and indices thrust::device_vector<int> row_sums(R); thrust::device_vector<int> row_indices(R); // compute row sums by summing values with equal row indices thrust::reduce_by_key (thrust::make_permutation_iterator(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(R)), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), rm2cm_idx_functor(R, C))), thrust::make_permutation_iterator(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(R)) + (R*C), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), rm2cm_idx_functor(R, C)) + (R*C)), thrust::make_permutation_iterator(vals.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), rm2cm_idx_functor(R, C))), row_indices.begin(), row_sums.begin(), thrust::equal_to<int>(), thrust::plus<int>()); // print data thrust::host_vector<int> h_row_sums = row_sums; std::cout << " Results: " << std::endl; thrust::copy(h_row_sums.begin(), h_row_sums.end(), std::ostream_iterator<int>(std::cout, ",")); std::cout << std::endl; return 0; } $ nvcc -arch=sm_20 -o t466 t466.cu $ ./t466 Initial data: 10,20,30,40,50,60,70,80,90, Results: 120,150,180, $
Note that I also modified the linear_index_to_row_index functor to give me a row index suitably organized for the underlying column storage (the previous functor returned the index when the underlying storage was considered string storage). This included modifying the division operation by modulo operation and passing R instead of C to initialize the functor, so pay attention to the subtle difference.