Solution of linear systems AX = B with CUDA

Question

Solution of linear systems AX = B with CUDA

Is it possible to use the new cuSOLVER library (CUDA 7) to solve linear systems of the form

AX = B

where A , X and B are NxN dense matrices?

+5

gpu cuda cusolver

Shay S. Mar 01 '15 at 13:20

source share

1 answer

Jackolantern · Answer 1 · 2015-03-01T21:45:39+0000

Yes.

Approach No. 1

Within cuSOLVER you can use QR decomposition, see QR decomposition for solving linear systems in CUDA .

Approach No. 2

Alternatively, you can calculate the inverse of the sequence involution

  cublas<t>getrfBatched()

which calculates the LU decomposition of the matrix, and

 cublas<t>getriBatched()

which computes the inverse matrix starting with its LU decomposition.

Approach No. 3

The ultimate opportunity is to use

 cublas<t>getrfBatched()

followed by a double call

 cublas<t>trsm()

which solves the upper or lower triangular linear systems.

As Robert Kroella noted, the answer may vary depending on the size and type of matrices involved.

Code for the nr approach. 1

Please see the QR decomposition for solving linear systems in CUDA .

Code for nr approaches. 2 and nr. 3

Below I report a working example for implementing nr approaches. 2 and 3 . Hankel matrices are used to power approaches with well-conditioned, reversible matrices. Please note that the approach is nr. 3 requires a permutation (permutation) of the vector of system coefficients in accordance with the summary array obtained after calling cublas<t>getrfBatched() . This permutation can be performed on the CPU.

 #include <stdio.h> #include <fstream> #include <iomanip> #include <stdlib.h> /* srand, rand */ #include <time.h> /* time */ #include "cuda_runtime.h" #include "device_launch_parameters.h" #include "cublas_v2.h" #include "Utilities.cuh" #include "TimingGPU.cuh" #define prec_save 10 #define BLOCKSIZE 256 #define BLOCKSIZEX 16 #define BLOCKSIZEY 16 /************************************/ /* SAVE REAL ARRAY FROM CPU TO FILE */ /************************************/ template <class T> void saveCPUrealtxt(const T * h_in, const char *filename, const int M) { std::ofstream outfile; outfile.open(filename); for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n"; outfile.close(); } /************************************/ /* SAVE REAL ARRAY FROM GPU TO FILE */ /************************************/ template <class T> void saveGPUrealtxt(const T * d_in, const char *filename, const int M) { T *h_in = (T *)malloc(M * sizeof(T)); gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost)); std::ofstream outfile; outfile.open(filename); for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n"; outfile.close(); } /***************************************************/ /* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */ /***************************************************/ // --- https://en.wikipedia.org/wiki/Hankel_matrix void setHankelMatrix(double * __restrict h_A, const int N) { double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double)); // --- Initialize random seed srand(time(NULL)); // --- Generate random numbers for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand(); // --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent. for (int i = 0; i < N; i++) for (int j = 0; j < N; j++) h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2]; free(h_atemp); } /***********************************************/ /* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */ /***********************************************/ void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref, double * __restrict h_y, const int N) { for (int k = 0; k < N; k++) h_y[k] = 0.f; for (int m = 0; m < N; m++) for (int n = 0; n < N; n++) h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n]; } /************************************/ /* COEFFICIENT REARRANGING FUNCTION */ /************************************/ void rearrange(double *vec, int *pivotArray, int N){ for (int i = 0; i < N; i++) { double temp = vec[i]; vec[i] = vec[pivotArray[i] - 1]; vec[pivotArray[i] - 1] = temp; } } /********/ /* MAIN */ /********/ int main() { const unsigned int N = 1000; const unsigned int Nmatrices = 1; // --- CUBLAS initialization cublasHandle_t cublas_handle; cublasSafeCall(cublasCreate(&cublas_handle)); TimingGPU timerLU, timerApproach1, timerApproach2; double timingLU, timingApproach1, timingApproach2; /***********************/ /* SETTING THE PROBLEM */ /***********************/ // --- Matrices to be inverted (only one in this example) double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double)); // --- Setting the Hankel matrix setHankelMatrix(h_A, N); // --- Defining the solution double *h_xref = (double *)malloc(N * sizeof(double)); for (int k = 0; k < N; k++) h_xref[k] = 1.f; // --- Coefficient vectors (only one in this example) double *h_y = (double *)malloc(N * sizeof(double)); computeCoefficientsVector(h_A, h_xref, h_y, N); // --- Result (only one in this example) double *h_x = (double *)malloc(N * sizeof(double)); // --- Allocate device space for the input matrices double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double))); double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double))); double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double))); // --- Move the relevant matrices from host to device gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice)); gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(double), cudaMemcpyHostToDevice)); /**********************************/ /* COMPUTING THE LU DECOMPOSITION */ /**********************************/ timerLU.StartCounter(); // --- Creating the array of pointers needed as input/output to the batched getrf double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *)); for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N; double **d_inout_pointers; gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *))); gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice)); free(h_inout_pointers); int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int))); int *d_InfoArray; gpuErrchk(cudaMalloc(&d_InfoArray, Nmatrices * sizeof(int))); int *h_InfoArray = (int *)malloc(Nmatrices * sizeof(int)); cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices)); //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices)); gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); for (int i = 0; i < Nmatrices; i++) if (h_InfoArray[i] != 0) { fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i); cudaDeviceReset(); exit(EXIT_FAILURE); } timingLU = timerLU.GetCounter(); printf("Timing LU decomposition %f [ms]\n", timingLU); /*********************************/ /* CHECKING THE LU DECOMPOSITION */ /*********************************/ saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N); saveCPUrealtxt(h_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N); saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N); saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N); /******************************************************************************/ /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */ /******************************************************************************/ timerApproach1.StartCounter(); // --- Allocate device space for the inverted matrices double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double))); // --- Creating the array of pointers needed as output to the batched getri double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *)); for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double)); double **d_out_pointers; gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *))); gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice)); free(h_out_pointers); cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices)); gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); for (int i = 0; i < Nmatrices; i++) if (h_InfoArray[i] != 0) { fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i); cudaDeviceReset(); exit(EXIT_FAILURE); } double alpha1 = 1.f; double beta1 = 0.f; cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1)); timingApproach1 = timingLU + timerApproach1.GetCounter(); printf("Timing approach 1 %f [ms]\n", timingApproach1); /**************************/ /* CHECKING APPROACH NR.1 */ /**************************/ saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N); /*************************************************************/ /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */ /*************************************************************/ timerApproach2.StartCounter(); double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double))); gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int)); gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost)); rearrange(h_y, h_pivotArray, N); gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice)); // --- Now P*A=L*U // Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y // --- 1st phase - solve Ly = b const double alpha = 1.f; // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result // --- Lower triangular part cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N)); // --- Upper triangular part cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N)); timingApproach2 = timingLU + timerApproach2.GetCounter(); printf("Timing approach 2 %f [ms]\n", timingApproach2); /**************************/ /* CHECKING APPROACH NR.2 */ /**************************/ saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N); return 0; }

The Utilities.cu and Utilities.cuh files needed to run this example are supported on this github page. TimingGPU.cu and TimingGPU.cuh supported on this github page.

Some useful links to the third approach:

NAG Fortran Training Document

Computational Library Library (SCSL) User Guide

https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

EDIT

Dates (in ms) for nr approaches. 2 and 3 (tests performed on a GTX960 card, see 5.2).

 N LU decomposition Approach nr. 2 Approach nr. 3 100 1.08 2.75 1.28 500 45.4 161 45.7 1000 302 1053 303

As it arises, we approach nr. 3 is more convenient, and its cost is, in fact, the cost of calculating the factorization of LU. Moreover:

Solving linear systems using LU decomposition is faster than using QR decomposition (see QR decomposition for solving linear systems in CUDA );
LU decomposition is limited to square linear systems, and QR decomposition helps with non-square linear systems .

Below is the matlab code to check the results

 clear all close all clc warning off N = 1000; % --- Setting the problem solution x = ones(N, 1); %%%%%%%%%%%%%%%%%%%%% % NxN HANKEL MATRIX % %%%%%%%%%%%%%%%%%%%%% % --- https://en.wikipedia.org/wiki/Hankel_matrix load A.txt load y.txt A = reshape(A, N, N); yMatlab = A * x; fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2)))); %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % COMPUTATION OF THE LU DECOMPOSITION % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% [Lmatlab, Umatlab] = lu(A); load Adecomposed.txt Adecomposed = reshape(Adecomposed, N, N); L = eye(N); for k = 1 : N L(k + 1 : N, k) = Adecomposed(k + 1 : N, k); end U = zeros(N); for k = 1 : N U(k, k : N) = Adecomposed(k, k : N); end load pivotArray.txt Pj = eye(N); for j = 1 : N tempVector = Pj(j, :); Pj(j, :) = Pj(pivotArray(j), :); Pj(pivotArray(j), :) = tempVector; end fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2)))); xprime = inv(Lmatlab) * yMatlab; xMatlab = inv(Umatlab) * xprime; fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2)))); load xApproach1.txt fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2)))); load xApproach2.txt fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

Solution of linear systems AX = B with CUDA

More articles: