#pragma once
#include <stdio.h>
#include <stdlib.h>

// CHECK(call): evaluates `call` exactly once and compares the resulting
// cudaError_t with cudaSuccess. On any other value it prints the source file,
// the line of the CHECK invocation, the numeric error code and the text from
// cudaGetErrorString() to stdout, then terminates the process with exit(1).
// The do { ... } while (0) wrapper makes the macro behave like one statement,
// so it is safe inside an unbraced if/else.
#define CHECK(call)                                                         \
    do {                                                                    \
        /* Parenthesised so any expression can be passed as `call`. */      \
        const cudaError_t error_code = (call);                              \
        if (error_code != cudaSuccess) {                                    \
            printf("CUDA Error:\n");                                        \
            printf(" File: %s\n", __FILE__);                                \
            printf(" Line: %d\n", __LINE__);                                \
            printf(" Error code: %d\n", error_code);                        \
            printf(" Error text: %s\n", cudaGetErrorString(error_code));    \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

// Usage: when calling CUDA functions or launching kernels, wrap the call (or
// the error query issued after the launch) in CHECK to detect whether an error
// occurred.
// nvcc ./ -Xcompiler -fPIC -o ./test_error && ./test_error
#include "error.cuh"
#include <stdio.h>

// Allocates one device buffer of N doubles and frees it again. Both CUDA
// runtime calls are wrapped in CHECK (from error.cuh), so a failing call is
// reported and ends the program; otherwise "Success!" is printed.
int main(void) {
    const int N = 100000000;
    // sizeof yields size_t and cudaMalloc takes its size argument as size_t,
    // so the byte count is kept in that type instead of being narrowed to int.
    const size_t M = sizeof(double) * N;
    double *d_x = NULL;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaFree(d_x));
    printf("Success!\n");
    return 0;
}
// Result: this program runs without reporting any error.
// nvcc ./ -Xcompiler -fPIC -o ./test_error && ./test_error
#include "error.cuh"
#include <stdio.h>
// Failing-allocation demo: the cudaMalloc request below is rejected and CHECK reports it.
int main(void) {
    const int N = 1000000000;
    const int M = sizeof(double) * N;  // 8000000000 does not fit in int: truncated to -589934592
    double *d_x;
    CHECK(cudaMalloc((void **)&d_x, M));  // the negative M becomes a huge unsigned size here
    CHECK(cudaFree(d_x));
    printf("Success!\n");
    return 0;
}
// NOTE(review): M is intentionally left as `int`, and the statements are kept
// on their original line positions, so that the compiler warnings (line 7) and
// the runtime error location (line 9) quoted for this example stay valid. The
// out-of-memory failure is a consequence of the overflowed size; real code
// should hold byte counts in size_t.
// Running this version reports an out-of-memory (OOM) error:
./ warning #69-D: integer conversion resulted in truncation
    const int M = sizeof(double) * N;
    ^
Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"
./ warning #68-D: integer conversion resulted in a change of sign
    do{const cudaError_t error_code = cudaMalloc((void **)&d_x, M); if (error_code != cudaSuccess){printf("CUDA Error:\n"); printf(" File: %s\n", "./"); printf(" Line: %d\n", 9); printf(" Error code: %d\n", error_code); printf(" Error text: %s\n", cudaGetErrorString(error_code)); exit(1);}} while (0);
    ^
./ warning #69-D: integer conversion resulted in truncation
    const int M = sizeof(double) * N;
    ^
Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"
./ warning #68-D: integer conversion resulted in a change of sign
    do{const cudaError_t error_code = cudaMalloc((void **)&d_x, M); if (error_code != cudaSuccess){printf("CUDA Error:\n"); printf(" File: %s\n", "./"); printf(" Line: %d\n", 9); printf(" Error code: %d\n", error_code); printf(" Error text: %s\n", cudaGetErrorString(error_code)); exit(1);}} while (0);
    ^
./ In function 'int main()':
./ warning: overflow in conversion from 'long unsigned int' to 'int' changes value from '8000000000' to '-589934592' [-Woverflow]
    7 | const int M = sizeof(double) * N;
      | ~~~~~~~~~~~~~~~~^~~~
CUDA Error:
 File: ./
 Line: 9
 Error code: 2
 Error text: out of memory
当然,中间因为整型溢出,还有一些其他的warning信息,但是这里主要要展现的是OOM报错问题。需要注意的是,这里的OOM正是由溢出引起的:溢出后的负数M在传给cudaMalloc时被转换成一个极大的无符号数,因此申请必然失败。
// nvcc ./ -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
#include "error.cuh"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Element-wise addition kernel: z[i] = x[i] + y[i] for every i < N.
// Launch layout: 1D grid of 1D blocks, one thread per element.
//   x, y : input arrays (device pointers), at least N elements each
//   z    : output array (device pointer), at least N elements
//   N    : number of elements to process
__global__ void add(const double *x, const double *y, double *z, const int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // The grid is rounded up to whole blocks, so threads past the end of the
    // arrays must not touch memory.
    if (idx < N) {
        z[idx] = x[idx] + y[idx];
    }
}

// Host driver: fills a host array with the constant `a`, copies it to the
// device, launches `add` with that array as both inputs, and releases all
// buffers. Every CUDA runtime call goes through CHECK (from error.cuh).
int main(void) {
    const int N = 10;
    const size_t M = sizeof(double) * N;  // buffer size in bytes
    const double a = 1.23;

    double *h_x = (double *)malloc(M);
    if (h_x == NULL) {
        printf("Host allocation failed\n");
        return 1;
    }
    for (int n = 0; n < N; ++n) {
        h_x[n] = a;
    }

    double *d_x, *d_z;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaMalloc((void **)&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

    const int block_size = 1024;
    const int grid_size = (N + block_size - 1) / block_size;  // ceiling division
    add<<<grid_size, block_size>>>(d_x, d_x, d_z, N);
    // A kernel launch returns no status itself: query the launch error
    // explicitly, then synchronize so errors raised during execution surface.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_z));
    free(h_x);
    printf("Success!\n");
    return 0;
}
// This CUDA program performs an array addition. Output of a run:
$ nvcc ./ -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
Success!
调整一下block_size参数:
// nvcc ./ -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
#include "error.cuh"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// Kernel: 1D launch, one thread per element; writes z[i] = x[i] + y[i] for every i < N.
__global__ void add(const double *x, const double *y, double *z, const int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {  // threads past the end of the arrays do nothing
        z[idx] = x[idx] + y[idx];
    }
}
// Host driver: fills a host array with `a`, copies it to the device and launches `add` on it.
int main(void) {
    const int N = 10;
    const size_t M = sizeof(double) * N;  // buffer size in bytes
    const double a = 1.23;
    double *h_x = (double *)malloc(M);
    if (h_x == NULL) { printf("Host allocation failed\n"); return 1; }
    for (int n = 0; n < N; ++n) { h_x[n] = a; }
    double *d_x, *d_z;
    CHECK(cudaMalloc((void **)&d_x, M));
    CHECK(cudaMalloc((void **)&d_z, M));
    CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    // block_size is intentionally 1025, one more than the 1024-thread block limit, to force a launch error.
    const int block_size = 1025;
    const int grid_size = (N + block_size - 1) / block_size;  // ceiling division
    add<<<grid_size, block_size>>>(d_x, d_x, d_z, N);
    CHECK(cudaGetLastError());       // reports the rejected launch configuration
    CHECK(cudaDeviceSynchronize());  // would surface errors raised while the kernel ran
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_z));
    free(h_x);
    printf("Success!\n");
    return 0;
}
// A block in a CUDA program can be at most 1024 threads, so exceeding that
// number raises an error. Without an error-checking call, however, the program
// would simply keep executing and the error would stay pending in the program.
// (The statements above are kept on their original line positions so that the
// reported line number of the failing CHECK stays valid.)
// Output of a run:
$ nvcc ./ -Xcompiler -fPIC -o ./test_error && chmod +x ./test_error && ./test_error
CUDA Error:
 File: ./
 Line: 29
 Error code: 9
 Error text: invalid configuration argument
因为加上了cudaGetLastError()函数,并使用了异常捕获的宏,所以这里就会提示参数配置异常。