Reducing rows or columns of a matrix in CUDA

I am using CUDA with cuBLAS to perform matrix operations.

I need to add the rows (or columns) of the matrix. I am currently doing this by multiplying the matrix by a unit vector, but this is not so efficient.

Is there a better way? I could not find anything suitable in cuBLAS.

+5
source share
2 answers

In fact, multiplying the matrix by a vector of ones using cublas_gemv() is a very efficient way, unless you plan to write your own kernel by hand.

You can easily profile the memory bandwidth achieved by cublas_gemv() . It is very close to simply reading the entire matrix data once, which can be regarded as the theoretical peak performance of summing the rows/columns of a matrix.

The extra "×1.0" multiply operations will not lead to a significant decrease in performance, because:

  • cublas_gemv() is basically a bandwidth binding operation; additional arithmetic instructions will not be a bottleneck;
  • the FMA (fused multiply-add) instruction keeps the extra multiply from adding instructions: multiply and add execute as a single instruction;
  • the memory footprint of the vector is usually much smaller than that of the matrix, so it can easily be cached by the GPU to reduce the memory-bandwidth cost.

cublas_gemv() also helps you deal with the memory-layout problem: it handles both row-major and column-major storage (via the transpose flag) and arbitrary padding (the leading dimension).

I also asked a similar question about this. My experiment shows that cublas_gemv() performs better than a segmented reduction using thrust::reduce_by_key , which is another approach for summing matrix rows.

+5
source

Posts related to this, containing helpful answers on the same topic, are available at

Reduce matrix rows with CUDA

and

Reduce matrix columns using CUDA .

Here I just want to point out how the approach of reducing the columns of a matrix by multiplying it by a row vector can be generalized to perform a linear combination of an ensemble of vectors. In other words, suppose you want to calculate the following basis expansion

enter image description here

where f(x_m) are the samples of the function f(x) , \psi_n are the basis functions, and c_n are the expansion coefficients. Then the \psi_n can be organized in an N x M matrix and the coefficients c_n in a row vector, and the matrix × vector product can be computed with cublas<t>gemv .

Below I report a fully worked example:

 #include <cublas_v2.h> #include <thrust/device_vector.h> #include <thrust/random.h> #include <stdio.h> #include <iostream> #include "Utilities.cuh" /********************************************/ /* LINEAR COMBINATION FUNCTION - FLOAT CASE */ /********************************************/ void linearCombination(const float * __restrict__ d_coeff, const float * __restrict__ d_basis_functions_real, float * __restrict__ d_linear_combination, const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) { float alpha = 1.f; float beta = 0.f; cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points, d_coeff, 1, &beta, d_linear_combination, 1)); } void linearCombination(const double * __restrict__ d_coeff, const double * __restrict__ d_basis_functions_real, double * __restrict__ d_linear_combination, const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) { double alpha = 1.; double beta = 0.; cublasSafeCall(cublasDgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points, d_coeff, 1, &beta, d_linear_combination, 1)); } /********/ /* MAIN */ /********/ int main() { const int N_basis_functions = 5; // --- Number of rows -> Number of basis functions const int N_sampling_points = 8; // --- Number of columns -> Number of sampling points of the basis functions // --- Random uniform integer distribution between 10 and 99 thrust::default_random_engine rng; thrust::uniform_int_distribution<int> dist(10, 99); // --- Matrix allocation and initialization thrust::device_vector<float> d_basis_functions_real(N_basis_functions * N_sampling_points); for (size_t i = 0; i < d_basis_functions_real.size(); i++) d_basis_functions_real[i] = (float)dist(rng); thrust::device_vector<double> d_basis_functions_double_real(N_basis_functions * N_sampling_points); for (size_t i = 0; i < 
d_basis_functions_double_real.size(); i++) d_basis_functions_double_real[i] = (double)dist(rng); /************************************/ /* COMPUTING THE LINEAR COMBINATION */ /************************************/ cublasHandle_t handle; cublasSafeCall(cublasCreate(&handle)); thrust::device_vector<float> d_linear_combination_real(N_sampling_points); thrust::device_vector<double> d_linear_combination_double_real(N_sampling_points); thrust::device_vector<float> d_coeff_real(N_basis_functions, 1.f); thrust::device_vector<double> d_coeff_double_real(N_basis_functions, 1.); linearCombination(thrust::raw_pointer_cast(d_coeff_real.data()), thrust::raw_pointer_cast(d_basis_functions_real.data()), thrust::raw_pointer_cast(d_linear_combination_real.data()), N_basis_functions, N_sampling_points, handle); linearCombination(thrust::raw_pointer_cast(d_coeff_double_real.data()), thrust::raw_pointer_cast(d_basis_functions_double_real.data()), thrust::raw_pointer_cast(d_linear_combination_double_real.data()), N_basis_functions, N_sampling_points, handle); /*************************/ /* DISPLAYING THE RESULT */ /*************************/ std::cout << "Real case \n\n"; for(int j = 0; j < N_sampling_points; j++) { std::cout << "Column " << j << " - [ "; for(int i = 0; i < N_basis_functions; i++) std::cout << d_basis_functions_real[i * N_sampling_points + j] << " "; std::cout << "] = " << d_linear_combination_real[j] << "\n"; } std::cout << "\n\nDouble real case \n\n"; for(int j = 0; j < N_sampling_points; j++) { std::cout << "Column " << j << " - [ "; for(int i = 0; i < N_basis_functions; i++) std::cout << d_basis_functions_double_real[i * N_sampling_points + j] << " "; std::cout << "] = " << d_linear_combination_double_real[j] << "\n"; } return 0; } 
+1
source

All Articles