, -, , . (, - BLAS 3- - , - . , d squareroot). , ; - .
, CUDA. , , , .
, NxM (N * M); . , N * M. ( , float , SGEMM SAXPY.)
, CUDA, , . ( squareroot) . ( , , ). , . , - B_ij = (A_ij) ^ 2; inplace, A_ij = (A_ij) ^ 2, :
// Element-wise square: writes b[i] = a[i] * a[i] for every i in [0, N).
//
// Parameters:
//   a - input array of at least N floats (device-accessible).
//   b - output array of at least N floats (device-accessible). May be the
//       same pointer as `a`: each element is read and then written by the
//       same thread, so in-place use is safe.
//   N - number of elements to process; N <= 0 does nothing.
//
// Launch: 1D grid of 1D blocks (only the .x dimensions are used). Any grid
// size is valid: each thread starts at its global index and advances by the
// total number of launched threads, so a grid smaller than N still covers
// every element.
__global__ void squareElements(float *a, float *b, int N) {
    // 64-bit signed index: blockIdx.x * blockDim.x is evaluated in 32-bit
    // unsigned arithmetic otherwise and can wrap for very large launches.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < N; i += stride) {
        const float v = a[i];  // single load, reused for both factors
        b[i] = v * v;
    }
}
// Element-wise square root: writes b[i] = sqrtf(a[i]) for every i in [0, N).
//
// Parameters:
//   a - input array of at least N floats (device-accessible).
//   b - output array of at least N floats (device-accessible). May be the
//       same pointer as `a`: each element is read and then written by the
//       same thread, so in-place use is safe.
//   N - number of elements to process; N <= 0 does nothing.
//
// Launch: 1D grid of 1D blocks (only the .x dimensions are used). Any grid
// size is valid: each thread starts at its global index and advances by the
// total number of launched threads, so a grid smaller than N still covers
// every element.
__global__ void sqrtElements(float *a, float *b, int N) {
    // 64-bit signed index: blockIdx.x * blockDim.x is evaluated in 32-bit
    // unsigned arithmetic otherwise and can wrap for very large launches.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < N; i += stride) {
        // sqrtf makes the single-precision intent explicit instead of
        // relying on overload resolution of sqrt() for a float argument.
        b[i] = sqrtf(a[i]);
    }
}
, , sqrtf(), 3 ulp ( ), .
, , . CUBLAS , , .