I am updating my question with some new benchmarking results (I also reformulated the question to be more specific and I updated the code) ...
I implemented the matrix vector multiplication kernel in CUDA C, following the CUDA C Programming Guide using shared memory. Let me first introduce some of the benchmarking results that I did on the Jetson TK1 (GPU: Tegra K1, compute capability 3.2) and the comparison with cuBLAS:


Here I assume that cuBLAS does some kind of magic, since it seems that its execution time is not affected by the number of columns of A, which, in turn, implies that there is some kind of parallelization across the columns of A.
Now, here is the source code of my kernel and the host function to call it (file: mv.cuh):
I use this during my execution (file: cuda_timer.cuh):
and my main file (main.cu):
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "cublas_v2.h"
#include <math.h>
#include <curand.h>
#include <stdbool.h>
#include "mv.cuh"
#include "cuda_timer.cuh"
#include "error_handles.cuh"

typedef float real_t;

/* Abort with file/line information whenever a CUDA / cuBLAS / cuRAND
 * API call does not return its library's success status. */
#define _CUDA(x) do { if((x)!=cudaSuccess) { \
        printf("Error at %s:%d\n",__FILE__,__LINE__);\
        exit(EXIT_FAILURE);}} while(0)

#define _CUBLAS(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \
        printf("Error at %s:%d\n",__FILE__,__LINE__);\
        exit(EXIT_FAILURE);}} while(0)

#define _CURAND(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
        printf("Error at %s:%d\n",__FILE__,__LINE__);\
        exit(EXIT_FAILURE);}} while(0)

#define TEST_COLUMNS 1
#define TEST_ROWS 0

/**
 * If `TEST_WRT_` is set to `TEST_COLUMNS`, then a benchmark
 * will be performed with respect to columns (with a fixed
 * number of rows). If it is set to `TEST_ROWS`, then a benchmark will
 * run with respect to rows (fixed number of columns).
 */
#define TEST_WRT_ TEST_ROWS
#define CONSTANT_COLS 300
#define CONSTANT_ROWS 256

/**
 * In order to estimate the execution time, every
 * kernel is run `RUNS` times and the average is taken.
 */
#define RUNS 50

/**
 * Copy the custom-kernel result and the cuBLAS result from the device to
 * the host and verify that they agree element-wise within an absolute
 * tolerance of 1e-3. Terminates the process on the first mismatch.
 *
 * @param dev_y_cublas  device pointer to the cuBLAS result (nrows elements)
 * @param dev_y         device pointer to the custom-kernel result (nrows elements)
 * @param nrows         number of elements to compare
 */
void compare_results(real_t *dev_y_cublas, real_t *dev_y, unsigned int nrows)
{
    real_t *hst_y_cublas;
    real_t *hst_y;
    const size_t s = nrows * sizeof(real_t);

    hst_y_cublas = (real_t*) malloc(s);
    hst_y = (real_t*) malloc(s);
    /* cudaMemcpy is blocking, so any preceding kernel/cuBLAS work on the
     * default stream has completed before the host buffers are read. */
    _CUDA(cudaMemcpy(hst_y, dev_y, s, cudaMemcpyDeviceToHost));
    _CUDA(cudaMemcpy(hst_y_cublas, dev_y_cublas, s, cudaMemcpyDeviceToHost));
    for (unsigned int i = 0; i < nrows; ++i) {
        if (fabsf(hst_y_cublas[i] - hst_y[i]) > 0.001) {
            printf("ERROR ------ %f\n", fabsf(hst_y_cublas[i] - hst_y[i]));
            exit(EXIT_FAILURE);
        }
    }
    if (hst_y_cublas) free(hst_y_cublas);
    if (hst_y) free(hst_y);
}

/**
 * Run the matvec benchmark: generate random data once with cuRAND, check
 * the custom kernel against cublasSgemv, then time both over a sweep of
 * either rows or columns (selected by TEST_WRT_) and log CSV results to
 * a file named after the fixed dimension.
 */
void do_benchmark()
{
    curandGenerator_t gen;
    real_t *dev_rand_data = NULL; /* Random data will be allocated here! */
    real_t *dev_y = NULL;
    real_t *dev_y_cublas = NULL;
    real_t t;
    real_t t_cublas;

    /* Layout of dev_rand_data: the first n_cols_max entries serve as the
     * vector x; the remaining n_rows_max * n_cols_max entries serve as the
     * (column-major) matrix A with leading dimension nrows. */
    const size_t n_rows_max = 1500;
    const size_t n_cols_max = 300;
    const size_t ntot = n_cols_max * (1 + n_rows_max);
    const size_t size_tot = sizeof(real_t) * ntot;

    float alpha = 1.0, beta = 0.0; /* beta was initially set to 1.0 by mistake */
    cublasHandle_t handle;
    _CUBLAS(cublasCreate(&handle));

    start_tictoc();

    _CUDA(cudaMalloc((void** )&dev_rand_data, size_tot));
    _CUDA(cudaMalloc((void** )&dev_y, n_rows_max * sizeof(real_t)));
    _CUDA(cudaMalloc((void** )&dev_y_cublas, n_rows_max * sizeof(real_t)));

    _CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
    /* Fixed seed for reproducible benchmark inputs. */
    _CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
    tic();
    _CURAND(curandGenerateUniform(gen, dev_rand_data, ntot));
    t = toc();
    printf("RNG in %f ms\n", t);

    _CURAND(curandDestroyGenerator(gen));

    size_t ncols = CONSTANT_COLS;
    size_t nrows = CONSTANT_ROWS;
    size_t runs = RUNS;

    /* FIX: the original called cudaMemset without the _CUDA error check,
     * inconsistent with every other CUDA call in this file. */
    _CUDA(cudaMemset(dev_y_cublas, 0, n_rows_max * sizeof(real_t)));
    /* Correctness check: custom kernel vs. cuBLAS on one configuration. */
    matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows, ncols);
    _CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha,
            dev_rand_data + ncols, nrows, dev_rand_data, 1, &beta,
            dev_y_cublas, 1));
    /* Compare results */
    compare_results(dev_y_cublas, dev_y, nrows);

    FILE * pFile;
    char filename[50];
#if (TEST_WRT_ == TEST_COLUMNS)
    sprintf(filename, "times_rows%lu_cols.txt", nrows);
#else
    sprintf(filename, "times_cols%lu_rows.txt", ncols);
#endif

    printf("Logging to : '%s'\n", filename);
    pFile = fopen(filename, "w");
    if (pFile == NULL) {
        perror("Error opening file.");
        exit(79);
    }

    /* CSV header row: flag (0 = sweep over columns, 1 = sweep over rows)
     * followed by the size of the fixed dimension. */
#if (TEST_WRT_ == TEST_COLUMNS)
    fprintf(pFile, "0, %lu, 0, 0\n", nrows);
    for (ncols = 32; ncols < n_cols_max; ncols += 32) {
#else
    fprintf(pFile, "1, %lu, 0, 0\n", ncols);
    for (nrows = 32; nrows < n_rows_max; nrows += 32) {
#endif
        /* NOTE(review): kernel launches and cublasSgemv are asynchronous;
         * these timings are only meaningful if tic()/toc() in cuda_timer.cuh
         * synchronize the device internally (e.g. via CUDA events) — confirm. */
        tic();
        for (short i = 0; i < runs; i++) {
            matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y,
                    nrows, ncols);
        }
        t = toc() / runs;
        tic();
        for (short i = 0; i < runs; i++) {
            _CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha,
                    dev_rand_data + ncols, nrows, dev_rand_data, 1, &beta,
                    dev_y_cublas, 1));
        }
        t_cublas = toc() / runs;
#if (TEST_WRT_ == TEST_COLUMNS)
        fprintf(pFile, "%lu, %f, %f\n", ncols, t, t_cublas);
#else
        fprintf(pFile, "%lu, %f, %f\n", nrows, t, t_cublas);
#endif
    }
    _CUBLAS(cublasDestroy(handle));
    fclose(pFile);

    if (dev_rand_data != NULL)
        _CUDA(cudaFree(dev_rand_data));
    /* FIX: dev_y and dev_y_cublas were leaked in the original — allocated
     * with cudaMalloc but never freed. */
    if (dev_y != NULL)
        _CUDA(cudaFree(dev_y));
    if (dev_y_cublas != NULL)
        _CUDA(cudaFree(dev_y_cublas));

    stop_tictoc();
}

int main(void)
{
    do_benchmark();
    return EXIT_SUCCESS;
}
Finally, this is the MATLAB script I use to build runtime:
% Fetch a benchmark log from the Jetson board via scp and plot the
% custom-kernel and cuBLAS runtimes it contains.
remote_file = 'times_cols512_rows.txt';
user = 'ubuntu';
host = 'jetson';
% Do not modify below this line
scp_cmd = ['! scp ' user '@' host ':~/mv/Debug/' remote_file ' .'];
eval(scp_cmd)
set(0, 'DefaultAxesFontSize', 14);
data = csvread(remote_file);
% First CSV row is a header: [sweep-flag, fixed-dimension-size, 0, 0].
hdr = data(1,:);
% Times appear to be logged in ms (cf. main.cu's printf); scale to us.
plot(data(2:end,1), data(2:end,2)*1000, '-');
hold on
plot(data(2:end,1), data(2:end,3)*1000, '-r');
grid on;
title_fmt = 'Matvec on Tegra K1 - %d %s';
if (hdr(1)==1)
    % Flag 1: sweep over rows, columns fixed.
    xlabel('Number of rows');
    title(sprintf(title_fmt, hdr(2), 'columns'));
else
    % Flag 0: sweep over columns, rows fixed.
    xlabel('Number of columns');
    title(sprintf(title_fmt, hdr(2), 'rows'));
end
ylabel('Computation time [us]');
legend('Kernel', 'cuBLAS');
axis tight
I am concerned about the performance and scalability of my kernel, so first I would like to know how to improve scalability with respect to the number of rows of matrix A. Secondly, I know that it is not good practice to have such a discrepancy between my code and an industry-standard implementation (cuBLAS), so I would appreciate some tips on how to improve my code's performance.
UPDATE: Thanks to all your comments and suggestions, I have come to the conclusion that, firstly, cudaDeviceSynchronize() caused some peculiarities in my timing, so my initial measurements were inaccurate. Row-major ordering leads to worse results. The block size is an important tuning parameter, and changing it from 16 to 32 or 64 improves the runtime. Further benchmarking is needed to select the best block size. For this purpose, the following API can be used for the kernel:
/**
 * Matrix-vector multiplication kernel (declaration only; definition not
 * shown here). The block size is a compile-time template parameter `blk`
 * so that it can be tuned per device.
 *
 * @tparam T      element type
 * @tparam blk    thread-block size (compile-time constant)
 * @param dA      device pointer to the matrix A
 * @param dx      device pointer to the input vector x
 * @param dy      device pointer to the output vector y
 * @param nRows   number of rows of A
 * @param nx      number of columns of A (length of x)
 */
template<typename T, const uint_t blk>
__global__ void matvec_kernel(const T * RESTRICT dA, const T * RESTRICT dx,
        T * RESTRICT dy, const uint_t nRows, const uint_t nx);
and call it like this:
/**
 * Host-side dispatcher: selects a compile-time block size and forwards to
 * the corresponding matvec_engine instantiation.
 *
 * @tparam T      element type
 * @param dA      device pointer to the matrix A
 * @param dx      device pointer to the input vector x
 * @param dy      device pointer to the output vector y
 * @param nRows   number of rows of A
 * @param nx      number of columns of A (length of x)
 */
template<typename T>
__host__ void matvec(const T * RESTRICT dA, const T * RESTRICT dx,
        T * RESTRICT dy, const uint_t nRows, const uint_t nx)
{
    /* Empirically best block size on the Tegra K1 (see benchmarks). */
    const uint_t blk_size_opt = 64;
    switch (blk_size_opt) {
    case 32:
        matvec_engine<T, 32>(dA, dx, dy, nRows, nx);
        break;
    case 128:
        matvec_engine<T, 128>(dA, dx, dy, nRows, nx);
        break;
    case 256:
        matvec_engine<T, 256>(dA, dx, dy, nRows, nx);
        break;
    case 64:
    default:
        /* FIX: the original if/else chain silently did nothing (dy left
         * unwritten) for any unlisted block size; fall back to 64. */
        matvec_engine<T, 64>(dA, dx, dy, nRows, nx);
        break;
    }
}
Let me give you some benchmarking results. First, a comparison with cublasSgemv:

and the effect of block size on runtime:
