背景
在上一篇中,笔者配好了 ROCm 的基础 LLVM 编译环境,并成功编译了 HIP 程序,然而实际运行时会出现 invalidDeviceFunction
错误。本文详细介绍该错误的排查过程。
问题详述
笔者询问 GLM-4 得到了一个基础的 HIP 程序:
#include <hip/hip_runtime.h>
#include <iostream>
// 定义kernel
__global__ void vector_add(int *a, int *b, int *c, int n) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
int main() {
int n = 1024; // 数组大小
int *a, *b, *c;
// 获取设备数量
int deviceCount;
hipError_t err = hipGetDeviceCount(&deviceCount);
if (err != hipSuccess) {
std::cerr << "hipGetDeviceCount failed: " << hipGetErrorString(err) << std::endl;
return -1;
}
// 如果有设备,选择第一个设备
if (deviceCount > 0) {
err = hipSetDevice(0); // 选择第一个设备
if (err != hipSuccess) {
std::cerr << "hipSetDevice failed: " << hipGetErrorString(err) << std::endl;
return -1;
} else {
std::cout << "Using device 0." << std::endl;
}
} else {
std::cerr << "No HIP capable devices found." << std::endl;
return -1;
}
// 分配内存
err = hipMalloc((void**)&a, n * sizeof(int));
if (err != hipSuccess) {
std::cerr << "hipMalloc failed for a: " << hipGetErrorString(err) << std::endl;
return -1;
}
err = hipMalloc((void**)&b, n * sizeof(int));
if (err != hipSuccess) {
std::cerr << "hipMalloc failed for b: " << hipGetErrorString(err) << std::endl;
return -1;
}
err = hipMalloc((void**)&c, n * sizeof(int));
if (err != hipSuccess) {
std::cerr << "hipMalloc failed for c: " << hipGetErrorString(err) << std::endl;
return -1;
}
// 初始化数组
int *host_a = new int[n];
int *host_b = new int[n];
int *host_c = new int[n];
for (int i = 0; i < n; i++) {
host_a[i] = i;
host_b[i] = i * 2;
host_c[i] = 0;
}
// 将数据复制到设备内存
err = hipMemcpy(a, host_a, n * sizeof(int), hipMemcpyHostToDevice);
if (err != hipSuccess) {
std::cerr << "hipMemcpy failed for a: " << hipGetErrorString(err) << std::endl;
return -1;
}
err = hipMemcpy(b, host_b, n * sizeof(int), hipMemcpyHostToDevice);
if (err != hipSuccess) {
std::cerr << "hipMemcpy failed for b: " << hipGetErrorString(err) << std::endl;
return -1;
}
// 启动kernel
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
vector_add<<<blocksPerGrid, threadsPerBlock>>>(a, b, c, n);
err = hipGetLastError();
if (err != hipSuccess) {
std::cerr << "hipLaunchKernel failed: " << hipGetErrorString(err) << std::endl;
return -1;
}
// 同步以等待kernel完成
err = hipDeviceSynchronize();
if (err != hipSuccess) {
std::cerr << "hipDeviceSynchronize failed: " << hipGetErrorString(err) << std::endl;
return -1;
}
// 从设备内存复制结果回主机
err = hipMemcpy(host_c, c, n * sizeof(int), hipMemcpyDeviceToHost);
if (err != hipSuccess) {
std::cerr << "hipMemcpy failed for c: " << hipGetErrorString(err) << std::endl;
return -1;
}
// 输出结果
for (int i = 0; i < 10; i++) {
std::cout << "c[" << i << "] = " << host_c[i] << std::endl;
}
// 释放内存
err = hipFree(a);
if (err != hipSuccess) {
std::cerr << "hipFree failed for a: " << hipGetErrorString(err) << std::endl;
}
err = hipFree(b);
if (err != hipSuccess) {
std::cerr << "hipFree failed for b: " << hipGetErrorString(err) << std::endl;
}
err = hipFree(c);
if (err != hipSuccess) {
std::cerr << "hipFree failed for c: " << hipGetErrorString(err) << std::endl;
}
delete[] host_a;
delete[] host_b;
delete[] host_c;
return 0;
}
使用 hipcc
编译:hipcc -o vectoradd vectoradd.cpp
。可以正常编译,但是运行时,会在 hipLaunchKernel
处报错,这就很奇怪了。
- 如果是编译环境有问题,应该是编译不过,而不是运行时出错。
- 如果运行库有问题,应该一开始运行就出错,而不是
hipMalloc
hipMemcpy
这些函数调用正常,hipLaunchKernel
调用出错。 - 使用旧版 ROCm 5.7 的
hipcc
编译可以正常运行,所以硬件没有问题。
参考教程,出现 invalid device function
的常见错误是编译的架构和运行设备的架构不匹配,但是笔者已经指定了架构为 gfx1031
,所以应该不是这个问题。教程进一步提到,可以使用 --save-temps
参数保存中间结果。运行后产生了 host 和 device 的中间文件,检查汇编文件也没有发现问题。
寻找了一番资料,找到了官方的调试指南,可以使用 AMD_LOG_LEVEL
环境变量来设置日志级别,于是尝试设置 AMD_LOG_LEVEL=5
,再次运行,日志如下:
$ AMD_LOG_LEVEL=5 ./vectoradd
:3:rocdevice.cpp :468 : 89870210473 us: [pid:1089444 tid:0x76cc16cb11c0] Initializing HSA stack.
:3:rocdevice.cpp :554 : 89870224328 us: [pid:1089444 tid:0x76cc16cb11c0] Enumerated GPU agents = 1
:3:rocdevice.cpp :234 : 89870224364 us: [pid:1089444 tid:0x76cc16cb11c0] Numa selects cpu agent[0]=0x5c301dce7ac0(fine=0x5c301dcea210,coarse=0x5c301dce5a80) for gpu agent=0x5c301dcf28c0 CPU<->GPU XGMI=0
:3:rocsettings.cpp :290 : 89870224371 us: [pid:1089444 tid:0x76cc16cb11c0] Using dev kernel arg wa = 0
:3:comgrctx.cpp :33 : 89870224375 us: [pid:1089444 tid:0x76cc16cb11c0] Loading COMGR library.
:3:comgrctx.cpp :126 : 89870224406 us: [pid:1089444 tid:0x76cc16cb11c0] Loaded COMGR library version 2.8.
:3:rocdevice.cpp :1810: 89870224918 us: [pid:1089444 tid:0x76cc16cb11c0] Gfx Major/Minor/Stepping: 10/3/1
:3:rocdevice.cpp :1812: 89870224922 us: [pid:1089444 tid:0x76cc16cb11c0] HMM support: 1, XNACK: 0, Direct host access: 0
:3:rocdevice.cpp :1814: 89870224926 us: [pid:1089444 tid:0x76cc16cb11c0] Max SDMA Read Mask: 0x3, Max SDMA Write Mask: 0x3
:4:rocdevice.cpp :2222: 89870225341 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc14900000, size 0x101000, numa_node = 0
:4:rocdevice.cpp :2222: 89870226242 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc14700000, size 0x101000, numa_node = 0
:4:rocdevice.cpp :2222: 89870227212 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc0fa00000, size 0x400000, numa_node = 0
:4:rocdevice.cpp :2222: 89870227568 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc16ca3000, size 0x38, numa_node = 0
:4:runtime.cpp :85 : 89870227639 us: [pid:1089444 tid:0x76cc16cb11c0] init
:3:hip_context.cpp :49 : 89870227642 us: [pid:1089444 tid:0x76cc16cb11c0] Direct Dispatch: 1
:3:hip_device_runtime.cpp :651 : 89870227658 us: [pid:1089444 tid:0x76cc16cb11c0] hipGetDeviceCount ( 0x7ffc16ba1714 )
:3:hip_device_runtime.cpp :653 : 89870227663 us: [pid:1089444 tid:0x76cc16cb11c0] hipGetDeviceCount: Returned hipSuccess :
:3:hip_device_runtime.cpp :666 : 89870227668 us: [pid:1089444 tid:0x76cc16cb11c0] hipSetDevice ( 0 )
:3:hip_device_runtime.cpp :670 : 89870227671 us: [pid:1089444 tid:0x76cc16cb11c0] hipSetDevice: Returned hipSuccess :
Using device 0.
:3:hip_memory.cpp :615 : 89870227932 us: [pid:1089444 tid:0x76cc16cb11c0] hipMalloc ( 0x7ffc16ba1708, 4096 )
:4:rocdevice.cpp :2379: 89870228289 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa device memory 0x76cc14400000, size 0x1000
:3:rocdevice.cpp :2418: 89870228294 us: [pid:1089444 tid:0x76cc16cb11c0] Device=0x5c301dd2a920, freeMem_ = 0x2fefff000
:3:hip_memory.cpp :617 : 89870228299 us: [pid:1089444 tid:0x76cc16cb11c0] hipMalloc: Returned hipSuccess : 0x76cc14400000: duration: 367 us
:3:hip_memory.cpp :615 : 89870228303 us: [pid:1089444 tid:0x76cc16cb11c0] hipMalloc ( 0x7ffc16ba1700, 4096 )
:4:rocdevice.cpp :2379: 89870228307 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa device memory 0x76cc14401000, size 0x1000
:3:rocdevice.cpp :2418: 89870228312 us: [pid:1089444 tid:0x76cc16cb11c0] Device=0x5c301dd2a920, freeMem_ = 0x2feffe000
:3:hip_memory.cpp :617 : 89870228318 us: [pid:1089444 tid:0x76cc16cb11c0] hipMalloc: Returned hipSuccess : 0x76cc14401000: duration: 15 us
:3:hip_memory.cpp :615 : 89870228323 us: [pid:1089444 tid:0x76cc16cb11c0] hipMalloc ( 0x7ffc16ba16f8, 4096 )
:4:rocdevice.cpp :2379: 89870228328 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa device memory 0x76cc14402000, size 0x1000
:3:rocdevice.cpp :2418: 89870228332 us: [pid:1089444 tid:0x76cc16cb11c0] Device=0x5c301dd2a920, freeMem_ = 0x2feffd000
:3:hip_memory.cpp :617 : 89870228336 us: [pid:1089444 tid:0x76cc16cb11c0] hipMalloc: Returned hipSuccess : 0x76cc14402000: duration: 13 us
:3:hip_memory.cpp :690 : 89870228511 us: [pid:1089444 tid:0x76cc16cb11c0] hipMemcpy ( 0x76cc14400000, 0x5c301dd38510, 4096, hipMemcpyHostToDevice )
:3:rocdevice.cpp :3030: 89870228522 us: [pid:1089444 tid:0x76cc16cb11c0] Number of allocated hardware queues with low priority: 0, with normal priority: 0, with high priority: 0, maximum per priority is: 4
:3:rocdevice.cpp :3106: 89870239538 us: [pid:1089444 tid:0x76cc16cb11c0] Created SWq=0x76cc16c6a000 to map on HWq=0x76cc0dd00000 with size 16384 with priority 1, cooperative: 0
:3:rocdevice.cpp :3198: 89870239560 us: [pid:1089444 tid:0x76cc16cb11c0] acquireQueue refCount: 0x76cc0dd00000 (1)
:4:rocdevice.cpp :2222: 89870239969 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc0db00000, size 0x100000, numa_node = 0
:3:devprogram.cpp :2648: 89870467601 us: [pid:1089444 tid:0x76cc16cb11c0] Using Code Object V5.
:4:command.cpp :348 : 89870473159 us: [pid:1089444 tid:0x76cc16cb11c0] Command (CopyHostToDevice) enqueued: 0x5c301e04e270
:4:rocblit.cpp :834 : 89870474035 us: [pid:1089444 tid:0x76cc16cb11c0] HSA Async Copy staged H2D dst=0x76cc14400000, src=0x76cc14900000, size=4096, completion_signal=0x76cc161ff700
:4:rocvirtual.cpp :572 : 89870474042 us: [pid:1089444 tid:0x76cc16cb11c0] Host wait on completion_signal=0x76cc161ff700
:3:rocvirtual.hpp :67 : 89870474048 us: [pid:1089444 tid:0x76cc16cb11c0] Host active wait for Signal = (0x76cc161ff700) for -1 ns
:4:command.cpp :287 : 89870474055 us: [pid:1089444 tid:0x76cc16cb11c0] Queue marker to command queue: 0x5c301dbcc110
:4:command.cpp :348 : 89870474059 us: [pid:1089444 tid:0x76cc16cb11c0] Command (InternalMarker) enqueued: 0x5c301df8c530
:4:command.cpp :177 : 89870474066 us: [pid:1089444 tid:0x76cc16cb11c0] Command 0x5c301e04e270 complete
:4:command.cpp :175 : 89870474070 us: [pid:1089444 tid:0x76cc16cb11c0] Command 0x5c301df8c530 complete (Wall: 89870474069, CPU: 0, GPU: 0 us)
:4:command.cpp :252 : 89870474075 us: [pid:1089444 tid:0x76cc16cb11c0] Waiting for event 0x5c301e04e270 to complete, current status 0
:4:command.cpp :266 : 89870474078 us: [pid:1089444 tid:0x76cc16cb11c0] Event 0x5c301e04e270 wait completed
:3:hip_memory.cpp :691 : 89870474082 us: [pid:1089444 tid:0x76cc16cb11c0] hipMemcpy: Returned hipSuccess : : duration: 245571 us
:3:hip_memory.cpp :690 : 89870474091 us: [pid:1089444 tid:0x76cc16cb11c0] hipMemcpy ( 0x76cc14401000, 0x5c301dd39520, 4096, hipMemcpyHostToDevice )
:4:command.cpp :348 : 89870474098 us: [pid:1089444 tid:0x76cc16cb11c0] Command (CopyHostToDevice) enqueued: 0x5c301e04e270
:4:rocblit.cpp :834 : 89870474105 us: [pid:1089444 tid:0x76cc16cb11c0] HSA Async Copy staged H2D dst=0x76cc14401000, src=0x76cc14900000, size=4096, completion_signal=0x76cc161ff680
:4:rocvirtual.cpp :572 : 89870474108 us: [pid:1089444 tid:0x76cc16cb11c0] Host wait on completion_signal=0x76cc161ff680
:3:rocvirtual.hpp :67 : 89870474113 us: [pid:1089444 tid:0x76cc16cb11c0] Host active wait for Signal = (0x76cc161ff680) for -1 ns
:4:command.cpp :287 : 89870474117 us: [pid:1089444 tid:0x76cc16cb11c0] Queue marker to command queue: 0x5c301dbcc110
:4:command.cpp :348 : 89870474121 us: [pid:1089444 tid:0x76cc16cb11c0] Command (InternalMarker) enqueued: 0x5c301df8c530
:4:command.cpp :177 : 89870474125 us: [pid:1089444 tid:0x76cc16cb11c0] Command 0x5c301e04e270 complete
:4:command.cpp :175 : 89870474129 us: [pid:1089444 tid:0x76cc16cb11c0] Command 0x5c301df8c530 complete (Wall: 89870474128, CPU: 0, GPU: 0 us)
:4:command.cpp :252 : 89870474133 us: [pid:1089444 tid:0x76cc16cb11c0] Waiting for event 0x5c301e04e270 to complete, current status 0
:4:command.cpp :266 : 89870474138 us: [pid:1089444 tid:0x76cc16cb11c0] Event 0x5c301e04e270 wait completed
:3:hip_memory.cpp :691 : 89870474140 us: [pid:1089444 tid:0x76cc16cb11c0] hipMemcpy: Returned hipSuccess : : duration: 49 us
:3:hip_platform.cpp :225 : 89870474148 us: [pid:1089444 tid:0x76cc16cb11c0] __hipPushCallConfiguration ( {4,1,1}, {256,1,1}, 0, stream:<null> )
:3:hip_platform.cpp :229 : 89870474152 us: [pid:1089444 tid:0x76cc16cb11c0] __hipPushCallConfiguration: Returned hipSuccess :
:3:hip_platform.cpp :234 : 89870474159 us: [pid:1089444 tid:0x76cc16cb11c0] __hipPopCallConfiguration ( {500405536,23600,0}, {4294966920,4294967295,381294696}, 0x7ffc16ba1680, 0x7ffc16ba1678 )
:3:hip_platform.cpp :243 : 89870474163 us: [pid:1089444 tid:0x76cc16cb11c0] __hipPopCallConfiguration: Returned hipSuccess :
:3:hip_module.cpp :677 : 89870474178 us: [pid:1089444 tid:0x76cc16cb11c0] hipLaunchKernel ( 0x5c2ff28dd840, {4,1,1}, {256,1,1}, 0x7ffc16ba16c0, 0, stream:<null> )
:3:hip_module.cpp :678 : 89870474184 us: [pid:1089444 tid:0x76cc16cb11c0] hipLaunchKernel: Returned hipErrorInvalidDeviceFunction :
:3:hip_error.cpp :36 : 89870474189 us: [pid:1089444 tid:0x76cc16cb11c0] hipGetLastError ( )
hipLaunchKernelGGL failed: invalid device function
:3:hip_device_runtime.cpp :620 : 89870474224 us: [pid:1089444 tid:0x76cc16cb11c0] hipDeviceSynchronize ( )
:4:commandqueue.cpp :147 : 89870474228 us: [pid:1089444 tid:0x76cc16cb11c0] HW Event not ready, awaiting completion instead
:4:commandqueue.cpp :163 : 89870474232 us: [pid:1089444 tid:0x76cc16cb11c0] All commands finished
:3:hip_device_runtime.cpp :624 : 89870474237 us: [pid:1089444 tid:0x76cc16cb11c0] hipDeviceSynchronize: Returned hipSuccess :
:4:command.cpp :348 : 89870474379 us: [pid:1089444 tid:0x76cc16cb11c0] Command (Marker) enqueued: 0x5c301df8c530
:3:rocvirtual.cpp :476 : 89870474391 us: [pid:1089444 tid:0x76cc16cb11c0] Set Handler: handle(0x76cc161ff600), timestamp(0x5c301e5076f0)
:4:rocvirtual.cpp :1091: 89870474397 us: [pid:1089444 tid:0x76cc16cb11c0] SWq=0x76cc16c6a000, HWq=0x76cc0dd00000, id=1, BarrierAND Header = 0x1503 (type=3, barrier=1, acquire=2, release=2), dep_signal=[0x0, 0x0, 0x0, 0x0, 0x0], completion_signal=0x76cc161ff600
:4:command.cpp :252 : 89870474403 us: [pid:1089444 tid:0x76cc16cb11c0] Waiting for event 0x5c301df8c530 to complete, current status 2
:3:rocvirtual.cpp :223 : 89870474533 us: [pid:1089444 tid:0x76cc0e8006c0] Handler: value(0), timestamp(0x5c301e1a33e0), handle(0x76cc161ff600)
:4:command.cpp :266 : 89870474549 us: [pid:1089444 tid:0x76cc16cb11c0] Event 0x5c301df8c530 wait completed
:4:command.cpp :175 : 89870474549 us: [pid:1089444 tid:0x76cc0e8006c0] Command 0x5c301df8c530 complete (Wall: 89870474548, CPU: 0, GPU: 162 us)
:4:rocdevice.cpp :2395: 89870474788 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc0db00000
:4:rocdevice.cpp :2395: 89870474793 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory (nil)
:3:rocdevice.cpp :3210: 89870474797 us: [pid:1089444 tid:0x76cc16cb11c0] releaseQueue refCount:0x76cc0dd00000 (0)
:4:runtime.cpp :93 : 89870474810 us: [pid:1089444 tid:0x76cc16cb11c0] tearDown
:4:rocdevice.cpp :2395: 89870474957 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc0fa00000
:4:rocdevice.cpp :2395: 89870475057 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc16ca3000
:3:rocdevice.cpp :286 : 89870475063 us: [pid:1089444 tid:0x76cc16cb11c0] Deleting hardware queue 0x76cc0dd00000 with refCount 0
:4:rocdevice.cpp :2395: 89870482134 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc14700000
:4:rocdevice.cpp :2395: 89870482356 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc14900000
可惜的是,这个日志最多只记录到了 hipLaunchKernel
返回错误代码,并没有更详细的错误信息。目前来看,唯一的办法就是检查 hipLaunchKernel
的源码了。
源码溯源
hipLaunchKernel
位于 hipamd/src/hip_module.cpp ,内容很简单:
hipError_t hipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim,
void** args, size_t sharedMemBytes, hipStream_t stream) {
HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
HIP_RETURN(hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream));
}
hipLaunchKernel_common
就在上头:
hipError_t hipLaunchKernel_common(const void* hostFunction, dim3 gridDim, dim3 blockDim,
void** args, size_t sharedMemBytes,
hipStream_t stream) {
STREAM_CAPTURE(hipLaunchKernel, stream, hostFunction, gridDim, blockDim, args, sharedMemBytes);
return ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, nullptr,
nullptr, 0);
}
溯源到 ihipLaunchKernel
,位于 hipamd/src/hip_platform.cpp ,这里实现了主要逻辑:
hipError_t ihipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim, void** args,
size_t sharedMemBytes, hipStream_t stream, hipEvent_t startEvent,
hipEvent_t stopEvent, int flags) {
hipFunction_t func = nullptr;
int deviceId = hip::Stream::DeviceId(stream);
hipError_t hip_error = PlatformState::instance().getStatFunc(&func, hostFunction, deviceId);
if ((hip_error != hipSuccess) || (func == nullptr)) {
if (hip_error == hipErrorNoBinaryForGpu) {
return hip_error;
} else {
return hipErrorInvalidDeviceFunction;
}
}
size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
return hipErrorInvalidConfiguration;
}
return ihipModuleLaunchKernel(
func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
static_cast<uint32_t>(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes,
stream, args, nullptr, startEvent, stopEvent, flags);
}
可能抛出 invalid device function
的只有 PlatformState::instance().getStatFunc
这个函数,位于 hipamd/src/hip_global.cpp:
hipError_t Function::getStatFunc(hipFunction_t* hfunc, int deviceId) {
guarantee(modules_ != nullptr, "Module not initialized");
if (dFunc_.size() != g_devices.size()) {
return hipErrorNoBinaryForGpu;
}
hipModule_t hmod = nullptr;
IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId));
IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &hmod));
if (dFunc_[deviceId] == nullptr) {
dFunc_[deviceId] = new DeviceFunc(name_, hmod);
}
*hfunc = dFunc_[deviceId]->asHipFunction();
return hipSuccess;
}
这里可能出问题的在与 BuildProgram
和 GetModule
函数。
BuildProgram
位于 hipamd/src/hip_fatbin.cpp:
hipError_t FatBinaryInfo::BuildProgram(const int device_id) {
// Device Id Check and Add DeviceProgram if not added so far
DeviceIdCheck(device_id);
IHIP_RETURN_ONFAIL(AddDevProgram(device_id));
// If Program was already built skip this step and return success
FatBinaryDeviceInfo* fbd_info = fatbin_dev_info_[device_id];
if (fbd_info->prog_built_ == false) {
if(CL_SUCCESS != fbd_info->program_->build(g_devices[device_id]->devices(),
nullptr, nullptr, nullptr,
kOptionChangeable, kNewDevProg)) {
return hipErrorNoBinaryForGpu;
}
fbd_info->prog_built_ = true;
}
if (!fbd_info->program_->load()) {
return hipErrorNoBinaryForGpu;
}
return hipSuccess;
}
这里有可能抛出 invalid device function
错误的地方只有 AddDevProgram
函数,就在 BuildProgram
上方代码中:
hipError_t FatBinaryInfo::AddDevProgram(const int device_id) {
// Device Id bounds Check
DeviceIdCheck(device_id);
FatBinaryDeviceInfo* fbd_info = fatbin_dev_info_[device_id];
if (fbd_info == nullptr) {
return hipErrorInvalidKernelFile;
}
// If fat binary was already added, skip this step and return success
if (fbd_info->add_dev_prog_ == false) {
amd::Context* ctx = g_devices[device_id]->asContext();
if (CL_SUCCESS != fbd_info->program_->addDeviceProgram(*ctx->devices()[0],
fbd_info->binary_image_,
fbd_info->binary_size_, false,
nullptr, nullptr, fdesc_,
fbd_info->binary_offset_, uri_)) {
return hipErrorInvalidKernelFile;
}
fbd_info->add_dev_prog_ = true;
}
return hipSuccess;
}
这里有可能出问题的地方在与 addDeviceProgram
函数,位于 rocclr/platform/program.cpp :
int32_t Program::addDeviceProgram(Device& device, const void* image, size_t length,
bool make_copy, amd::option::Options* options,
const amd::Program* same_prog, amd::Os::FileDesc fdesc,
size_t foffset, std::string uri) {
if (image != NULL && !amd::Elf::isElfMagic((const char*)image)) {
if (device.settings().useLightning_) {
return CL_INVALID_BINARY;
}
#if defined(WITH_COMPILER_LIB)
else if (!amd::Hsail::ValidateBinaryImage(
image, length, language_ == SPIRV ? BINARY_TYPE_SPIRV : BINARY_TYPE_ELF | BINARY_TYPE_LLVM)) {
return CL_INVALID_BINARY;
}
#endif // !defined(WITH_COMPILER_LIB)
}
// Check if the device is already associated with this program
if (deviceList_.find(&device) != deviceList_.end()) {
return CL_INVALID_VALUE;
}
Device& rootDev = device;
// if the rootDev is already associated with a program
if (devicePrograms_[&rootDev] != NULL) {
return CL_SUCCESS;
}
#if defined(WITH_COMPILER_LIB)
bool emptyOptions = (options == nullptr);
#endif
amd::option::Options emptyOpts;
if (options == NULL) {
options = &emptyOpts;
}
#if defined(WITH_COMPILER_LIB)
if (image != NULL && length != 0 && amd::Hsail::ValidateBinaryImage(image, length, BINARY_TYPE_ELF)) {
acl_error errorCode;
aclBinary* binary = amd::Hsail::ReadFromMem(image, length, &errorCode);
if (errorCode != ACL_SUCCESS) {
return CL_INVALID_BINARY;
}
const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
assert(symbol && "symbol not found");
std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
size_t symSize = 0;
const void* opts = amd::Hsail::ExtractSymbol(device.binCompiler(), binary, &symSize, aclCOMMENT,
symName.c_str(), &errorCode);
// if we have options from binary and input options was not specified
if (opts != NULL && emptyOptions) {
std::string sBinOptions = std::string((char*)opts, symSize);
if (!amd::option::parseAllOptions(sBinOptions, *options, false, false)) {
programLog_ = options->optionsLog();
LogError("Parsing compilation options from binary failed.");
return CL_INVALID_COMPILER_OPTIONS;
}
}
options->oVariables->Legacy = !device.settings().useLightning_ ?
isAMDILTarget(*amd::aclutGetTargetInfo(binary)) :
isHSAILTarget(*amd::aclutGetTargetInfo(binary));
amd::Hsail::BinaryFini(binary);
}
#endif // defined(WITH_COMPILER_LIB)
options->oVariables->BinaryIsSpirv = language_ == SPIRV;
device::Program* program = rootDev.createProgram(*this, options);
if (program == NULL) {
return CL_OUT_OF_HOST_MEMORY;
}
if (image != NULL) {
const uint8_t* memory = std::get<0>(binary(rootDev));
// clone 'binary' (it is owned by the host thread).
if (memory == NULL) {
if (make_copy) {
auto *image_copy = new (std::nothrow) uint8_t[length];
if (image_copy == NULL) {
delete program;
return CL_OUT_OF_HOST_MEMORY;
}
::memcpy(image_copy, image, length);
memory = image_copy;
} else {
memory = static_cast<const uint8_t*>(image);
}
// Save the original image
binary_[&rootDev] = std::make_tuple(memory, length, make_copy);
}
const device::Program* same_dev_prog = nullptr;
if ((amd::IS_HIP) && (same_prog != nullptr)) {
const auto &same_dev_prog_map_ = same_prog->devicePrograms();
guarantee(same_dev_prog_map_.size() == 1, "For same_prog, devicePrograms size != 1");
same_dev_prog = same_dev_prog_map_.begin()->second;
}
if (!program->setBinary(reinterpret_cast<const char*>(memory), length, same_dev_prog,
fdesc, foffset, uri)) {
delete program;
return CL_INVALID_BINARY;
}
}
devicePrograms_[&rootDev] = program;
deviceList_.insert(&device);
return CL_SUCCESS;
}
这里的逻辑比较复杂,不好静态分析。所幸 AMD 提供了一系列调试函数,我主要使用了 warning 函数当 print 用。利用 GLM-4 自动增加一系列标记 warning ,这种低级编程很适合 LLM 完成 :
int32_t Program::addDeviceProgram(Device& device, const void* image, size_t length,
bool make_copy, amd::option::Options* options,
const amd::Program* same_prog, amd::Os::FileDesc fdesc,
size_t foffset, std::string uri) {
warning("Entering addDeviceProgram");
if (image != NULL && !amd::Elf::isElfMagic((const char*)image)) {
warning("Invalid ELF magic number");
if (device.settings().useLightning_) {
warning("Device uses Lightning, returning CL_INVALID_BINARY");
return CL_INVALID_BINARY;
}
#if defined(WITH_COMPILER_LIB)
else if (!amd::Hsail::ValidateBinaryImage(
image, length, language_ == SPIRV ? BINARY_TYPE_SPIRV : BINARY_TYPE_ELF | BINARY_TYPE_LLVM)) {
warning("Binary image validation failed, returning CL_INVALID_BINARY");
return CL_INVALID_BINARY;
}
#endif // !defined(WITH_COMPILER_LIB)
}
// Check if the device is already associated with this program
if (deviceList_.find(&device) != deviceList_.end()) {
warning("Device already associated with this program, returning CL_INVALID_VALUE");
return CL_INVALID_VALUE;
}
Device& rootDev = device;
// if the rootDev is already associated with a program
if (devicePrograms_[&rootDev] != NULL) {
warning("Root device already associated with a program, returning CL_SUCCESS");
return CL_SUCCESS;
}
#if defined(WITH_COMPILER_LIB)
bool emptyOptions = (options == nullptr);
#endif
amd::option::Options emptyOpts;
if (options == NULL) {
warning("Options is NULL, using empty options");
options = &emptyOpts;
}
#if defined(WITH_COMPILER_LIB)
if (image != NULL && length != 0 && amd::Hsail::ValidateBinaryImage(image, length, BINARY_TYPE_ELF)) {
warning("Valid ELF binary image detected");
acl_error errorCode;
aclBinary* binary = amd::Hsail::ReadFromMem(image, length, &errorCode);
if (errorCode != ACL_SUCCESS) {
warning("Reading binary from memory failed, returning CL_INVALID_BINARY");
return CL_INVALID_BINARY;
}
const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
assert(symbol && "symbol not found");
std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
size_t symSize = 0;
const void* opts = amd::Hsail::ExtractSymbol(device.binCompiler(), binary, &symSize, aclCOMMENT,
symName.c_str(), &errorCode);
// if we have options from binary and input options was not specified
if (opts != NULL && emptyOptions) {
warning("Extracted options from binary, parsing them");
std::string sBinOptions = std::string((char*)opts, symSize);
if (!amd::option::parseAllOptions(sBinOptions, *options, false, false)) {
programLog_ = options->optionsLog();
LogError("Parsing compilation options from binary failed.");
warning("Parsing compilation options from binary failed, returning CL_INVALID_COMPILER_OPTIONS");
return CL_INVALID_COMPILER_OPTIONS;
}
}
options->oVariables->Legacy = !device.settings().useLightning_ ?
isAMDILTarget(*amd::aclutGetTargetInfo(binary)) :
isHSAILTarget(*amd::aclutGetTargetInfo(binary));
amd::Hsail::BinaryFini(binary);
}
#endif // defined(WITH_COMPILER_LIB)
options->oVariables->BinaryIsSpirv = language_ == SPIRV;
warning("Creating device program");
device::Program* program = rootDev.createProgram(*this, options);
if (program == NULL) {
warning("Failed to create device program, returning CL_OUT_OF_HOST_MEMORY");
return CL_OUT_OF_HOST_MEMORY;
}
if (image != NULL) {
warning("Setting binary for device program");
const uint8_t* memory = std::get<0>(binary(rootDev));
// clone 'binary' (it is owned by the host thread).
if (memory == NULL) {
if (make_copy) {
warning("Making a copy of the binary image");
auto *image_copy = new (std::nothrow) uint8_t[length];
if (image_copy == NULL) {
delete program;
warning("Failed to allocate memory for image copy, returning CL_OUT_OF_HOST_MEMORY");
return CL_OUT_OF_HOST_MEMORY;
}
::memcpy(image_copy, image, length);
memory = image_copy;
} else {
memory = static_cast<const uint8_t*>(image);
}
// Save the original image
binary_[&rootDev] = std::make_tuple(memory, length, make_copy);
}
const device::Program* same_dev_prog = nullptr;
if ((amd::IS_HIP) && (same_prog != nullptr)) {
const auto &same_dev_prog_map_ = same_prog->devicePrograms();
guarantee(same_dev_prog_map_.size() == 1, "For same_prog, devicePrograms size != 1");
same_dev_prog = same_dev_prog_map_.begin()->second;
}
bool ret;
ret = program->setBinary(reinterpret_cast<const char*>(memory), length, same_dev_prog,
fdesc, foffset, uri);
if (ret) {
warning("outside setBinary return true");
} else {
warning("outside setBinary return false");
}
if (!ret) {
warning("Failed to set binary for device program, returning CL_INVALID_BINARY");
delete program;
return CL_INVALID_BINARY;
}
}
devicePrograms_[&rootDev] = program;
deviceList_.insert(&device);
warning("Device program added successfully, returning CL_SUCCESS");
return CL_SUCCESS;
}
重新编译运行后,发现问题就在第一步 isElfMagic
函数,位于 rocclr/elf/elf.cpp :
bool Elf::isElfMagic(const char* p)
{
if (p == nullptr || strncmp(p, ELFMAG, SELFMAG) != 0) {
return false;
}
return true;
}
ELFMAG
这个宏的定义在文件头:
#if !defined(ELFMAG)
#define ELFMAG "\177ELF"
#define SELFMAG 4
#endif
就是常见的 “ELF” 字符串。而程序运行的时候 image
携带的内容是 __CLANG_OFFLOAD_BUNDLE__
,这是另外一个数据结构的 magic,位于 hipamd/src/hip_code_object.cpp:
// In uncompressed mode
constexpr char kOffloadBundleUncompressedMagicStr[] = "__CLANG_OFFLOAD_BUNDLE__";
static constexpr size_t kOffloadBundleUncompressedMagicStrSize =
sizeof(kOffloadBundleUncompressedMagicStr);
这里问题就浮现了,两个不同的 magic 怎么会碰到一起进行比较呢?于是转而搜索 image
的来源:
image
来自fbd_info->binary_image_
fbd_info->binary_image_
来自fatbin_dev_info_[device_id]
fatbin_dev_info_
是成员变量,定义在 hipamd/src/hip_fatbin.hpp
接下来寻找 fatbin_dev_info_
的赋值,发现在 hipamd/src/hip_fatbin.cpp :
hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devices) {
if (!HIP_USE_RUNTIME_UNBUNDLER) {
return ExtractFatBinaryUsingCOMGR(devices);
}
hipError_t hip_error = hipSuccess;
std::vector<std::pair<const void*, size_t>> code_objs;
// Copy device names for Extract Code object File
std::vector<std::string> device_names;
device_names.reserve(devices.size());
for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
device_names.push_back(devices[dev_idx]->devices()[0]->isa().isaName());
}
// We are given file name, get the file desc and file size
if (fname_.size() > 0) {
// Get File Handle & size of the file.
if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) {
return hipErrorFileNotFound;
}
if (fsize_ == 0) {
return hipErrorInvalidImage;
}
// Extract the code object from file
hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_, &image_,
device_names, code_objs);
} else if (image_ != nullptr) {
// We are directly given image pointer directly, try to extract file desc & file Size
hip_error = CodeObject::ExtractCodeObjectFromMemory(image_,
device_names, code_objs, uri_);
} else {
return hipErrorInvalidValue;
}
if (hip_error == hipErrorNoBinaryForGpu) {
if (fname_.size() > 0) {
LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for file: %s", fname_.c_str());
} else {
LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for ptr: 0x%x", image_);
}
// For the condition: unable to find code object for all devices,
// still extract available images to those devices owning them.
// This helps users to work with ROCm if there is any supported
// GFX on system.
for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
if (code_objs[dev_idx].first) {
// Calculate the offset wrt binary_image and the original image
size_t offset_l
= (reinterpret_cast<address>(const_cast<void*>(code_objs[dev_idx].first))
- reinterpret_cast<address>(const_cast<void*>(image_)));
fatbin_dev_info_[devices[dev_idx]->deviceId()]
= new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l);
fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
= new amd::Program(*devices[dev_idx]->asContext());
if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) {
break;
}
}
}
return hip_error;
}
if (hip_error == hipErrorInvalidKernelFile) {
for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
// the image type is no CLANG_OFFLOAD_BUNDLER, image for current device directly passed
fatbin_dev_info_[devices[dev_idx]->deviceId()]
= new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0);
}
} else if(hip_error == hipSuccess) {
for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
// Calculate the offset wrt binary_image and the original image
size_t offset_l
= (reinterpret_cast<address>(const_cast<void*>(code_objs[dev_idx].first))
- reinterpret_cast<address>(const_cast<void*>(image_)));
fatbin_dev_info_[devices[dev_idx]->deviceId()]
= new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l);
}
}
for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
= new amd::Program(*devices[dev_idx]->asContext());
if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) {
return hipErrorOutOfMemory;
}
}
return hipSuccess;
}
添加 warning 后编译运行,实际调用了 ExtractCodeObjectFromMemory
函数,位于 hipamd/src/hip_code_object.cpp :
// This will be moved to COMGR eventually
hipError_t CodeObject::ExtractCodeObjectFromMemory(
const void* data, const std::vector<std::string>& device_names,
std::vector<std::pair<const void*, size_t>>& code_objs, std::string& uri) {
// Get the URI from memory
if (!amd::Os::GetURIFromMemory(data, 0, uri)) {
return hipErrorInvalidValue;
}
return extractCodeObjectFromFatBinary(data, device_names, code_objs);
}
extractCodeObjectFromFatBinary
函数就在下面:
hipError_t CodeObject::extractCodeObjectFromFatBinary(
const void* data, const std::vector<std::string>& agent_triple_target_ids,
std::vector<std::pair<const void*, size_t>>& code_objs) {
std::string magic((const char*)data, kOffloadBundleUncompressedMagicStrSize);
if (magic.compare(kOffloadBundleUncompressedMagicStr)) {
return hipErrorInvalidKernelFile;
}
// Initialize Code objects
code_objs.reserve(agent_triple_target_ids.size());
for (size_t i = 0; i < agent_triple_target_ids.size(); i++) {
code_objs.push_back(std::make_pair(nullptr, 0));
}
const auto obheader = reinterpret_cast<const __ClangOffloadBundleUncompressedHeader*>(data);
const auto* desc = &obheader->desc[0];
size_t num_code_objs = code_objs.size();
for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i,
desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
desc->bundleEntryIdSize)) {
const void* image =
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);
const size_t image_size = desc->size;
if (num_code_objs == 0) break;
std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize};
std::string co_triple_target_id;
if (!getTripleTargetID(bundleEntryId, image, co_triple_target_id)) continue;
for (size_t dev = 0; dev < agent_triple_target_ids.size(); ++dev) {
if (code_objs[dev].first) continue;
if (isCodeObjectCompatibleWithDevice(co_triple_target_id, agent_triple_target_ids[dev])) {
code_objs[dev] = std::make_pair(image, image_size);
--num_code_objs;
}
}
}
if (num_code_objs == 0) {
return hipSuccess;
} else {
LogPrintfError("%s",
"hipErrorNoBinaryForGpu: Unable to find code object for all current devices!");
LogPrintfError("%s", " Devices:");
for (size_t i = 0; i < agent_triple_target_ids.size(); i++) {
LogPrintfError(" %s - [%s]", agent_triple_target_ids[i].c_str(),
((code_objs[i].first) ? "Found" : "Not Found"));
}
const auto obheader = reinterpret_cast<const __ClangOffloadBundleUncompressedHeader*>(data);
const auto* desc = &obheader->desc[0];
LogPrintfError("%s", " Bundled Code Objects:");
for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i,
desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
desc->bundleEntryIdSize)) {
std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize};
const void* image =
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);
std::string co_triple_target_id;
bool valid_co = getTripleTargetID(bundleEntryId, image, co_triple_target_id);
if (valid_co) {
LogPrintfError(" %s - [Code object targetID is %s]", bundleEntryId.c_str(),
co_triple_target_id.c_str());
} else {
LogPrintfError(" %s - [Unsupported]", bundleEntryId.c_str());
}
}
return hipErrorNoBinaryForGpu;
}
}
再次添加 warning 打印 data
,编译运行,惊奇地发现,data
的内容是 __CLANG_OFFLOAD_BUNDLE__
,理应 magic 匹配,但是分支却进入了不匹配的逻辑。一番搜索后,找到了 issue ,原来问题在于结尾的 \0
导致匹配失败了。在最新分支上已经修复了这个问题,这里我们手工打一个 patch,修改成下面这样:
std::string magic((const char*)data, kOffloadBundleUncompressedMagicStrSize-1);
if (magic.compare(kOffloadBundleUncompressedMagicStr)) {
return hipErrorInvalidKernelFile;
}
再次编译运行,终于可以正常运行了!
AMD_LOG_LEVEL=5 ./vectoradd
:3:rocdevice.cpp :468 : 89974442900 us: [pid:1092331 tid:0x78ad14e4d1c0] Initializing HSA stack.
:3:rocdevice.cpp :554 : 89974456729 us: [pid:1092331 tid:0x78ad14e4d1c0] Enumerated GPU agents = 1
:3:rocdevice.cpp :234 : 89974456763 us: [pid:1092331 tid:0x78ad14e4d1c0] Numa selects cpu agent[0]=0x5911ddce4ac0(fine=0x5911ddce7210,coarse=0x5911ddce2a80) for gpu agent=0x5911ddcef8c0 CPU<->GPU XGMI=0
:3:rocsettings.cpp :290 : 89974456770 us: [pid:1092331 tid:0x78ad14e4d1c0] Using dev kernel arg wa = 0
:3:comgrctx.cpp :33 : 89974456774 us: [pid:1092331 tid:0x78ad14e4d1c0] Loading COMGR library.
:3:comgrctx.cpp :126 : 89974456802 us: [pid:1092331 tid:0x78ad14e4d1c0] Loaded COMGR library version 2.8.
:3:rocdevice.cpp :1810: 89974456957 us: [pid:1092331 tid:0x78ad14e4d1c0] Gfx Major/Minor/Stepping: 10/3/1
:3:rocdevice.cpp :1812: 89974456961 us: [pid:1092331 tid:0x78ad14e4d1c0] HMM support: 1, XNACK: 0, Direct host access: 0
:3:rocdevice.cpp :1814: 89974456967 us: [pid:1092331 tid:0x78ad14e4d1c0] Max SDMA Read Mask: 0x3, Max SDMA Write Mask: 0x3
:4:rocdevice.cpp :2222: 89974457392 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ac12900000, size 0x101000, numa_node = 0
:4:rocdevice.cpp :2222: 89974458254 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ac12700000, size 0x101000, numa_node = 0
:4:rocdevice.cpp :2222: 89974459354 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ac12200000, size 0x400000, numa_node = 0
:4:rocdevice.cpp :2222: 89974459733 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ad14e3a000, size 0x38, numa_node = 0
:4:runtime.cpp :85 : 89974459820 us: [pid:1092331 tid:0x78ad14e4d1c0] init
:3:hip_context.cpp :49 : 89974459825 us: [pid:1092331 tid:0x78ad14e4d1c0] Direct Dispatch: 1
:3:hip_device_runtime.cpp :651 : 89974459859 us: [pid:1092331 tid:0x78ad14e4d1c0] hipGetDeviceCount ( 0x7ffc65b1ea44 )
:3:hip_device_runtime.cpp :653 : 89974459865 us: [pid:1092331 tid:0x78ad14e4d1c0] hipGetDeviceCount: Returned hipSuccess :
:3:hip_device_runtime.cpp :666 : 89974459871 us: [pid:1092331 tid:0x78ad14e4d1c0] hipSetDevice ( 0 )
:3:hip_device_runtime.cpp :670 : 89974459875 us: [pid:1092331 tid:0x78ad14e4d1c0] hipSetDevice: Returned hipSuccess :
Using device 0.
:3:hip_memory.cpp :615 : 89974460147 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMalloc ( 0x7ffc65b1ea38, 4096 )
:4:rocdevice.cpp :2379: 89974460570 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa device memory 0x78ac11e00000, size 0x1000
:3:rocdevice.cpp :2418: 89974460577 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2fefff000
:3:hip_memory.cpp :617 : 89974460584 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMalloc: Returned hipSuccess : 0x78ac11e00000: duration: 437 us
:3:hip_memory.cpp :615 : 89974460589 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMalloc ( 0x7ffc65b1ea30, 4096 )
:4:rocdevice.cpp :2379: 89974460595 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa device memory 0x78ac11e01000, size 0x1000
:3:rocdevice.cpp :2418: 89974460599 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2feffe000
:3:hip_memory.cpp :617 : 89974460604 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMalloc: Returned hipSuccess : 0x78ac11e01000: duration: 15 us
:3:hip_memory.cpp :615 : 89974460608 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMalloc ( 0x7ffc65b1ea28, 4096 )
:4:rocdevice.cpp :2379: 89974460613 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa device memory 0x78ac11e02000, size 0x1000
:3:rocdevice.cpp :2418: 89974460616 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2feffd000
:3:hip_memory.cpp :617 : 89974460622 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMalloc: Returned hipSuccess : 0x78ac11e02000: duration: 14 us
:3:hip_memory.cpp :690 : 89974460833 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMemcpy ( 0x78ac11e00000, 0x5911ddd355d0, 4096, hipMemcpyHostToDevice )
:3:rocdevice.cpp :3030: 89974460844 us: [pid:1092331 tid:0x78ad14e4d1c0] Number of allocated hardware queues with low priority: 0, with normal priority: 0, with high priority: 0, maximum per priority is: 4
:3:rocdevice.cpp :3106: 89974470492 us: [pid:1092331 tid:0x78ad14e4d1c0] Created SWq=0x78ad149f8000 to map on HWq=0x78ac0bd00000 with size 16384 with priority 1, cooperative: 0
:3:rocdevice.cpp :3198: 89974470514 us: [pid:1092331 tid:0x78ad14e4d1c0] acquireQueue refCount: 0x78ac0bd00000 (1)
:4:rocdevice.cpp :2222: 89974470907 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ac0a900000, size 0x100000, numa_node = 0
:3:devprogram.cpp :2648: 89974691965 us: [pid:1092331 tid:0x78ad14e4d1c0] Using Code Object V5.
:4:command.cpp :348 : 89974695179 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (CopyHostToDevice) enqueued: 0x5911ddf946a0
:4:rocblit.cpp :834 : 89974695970 us: [pid:1092331 tid:0x78ad14e4d1c0] HSA Async Copy staged H2D dst=0x78ac11e00000, src=0x78ac12900000, size=4096, completion_signal=0x78ad143ff700
:4:rocvirtual.cpp :572 : 89974695978 us: [pid:1092331 tid:0x78ad14e4d1c0] Host wait on completion_signal=0x78ad143ff700
:3:rocvirtual.hpp :67 : 89974695983 us: [pid:1092331 tid:0x78ad14e4d1c0] Host active wait for Signal = (0x78ad143ff700) for -1 ns
:4:command.cpp :287 : 89974695992 us: [pid:1092331 tid:0x78ad14e4d1c0] Queue marker to command queue: 0x5911ddbc9110
:4:command.cpp :348 : 89974695996 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (InternalMarker) enqueued: 0x5911ddf663c0
:4:command.cpp :177 : 89974696012 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf946a0 complete
:4:command.cpp :175 : 89974696015 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf663c0 complete (Wall: 89974696014, CPU: 0, GPU: 0 us)
:4:command.cpp :252 : 89974696019 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddf946a0 to complete, current status 0
:4:command.cpp :266 : 89974696023 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddf946a0 wait completed
:3:hip_memory.cpp :691 : 89974696027 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMemcpy: Returned hipSuccess : : duration: 235194 us
:3:hip_memory.cpp :690 : 89974696037 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMemcpy ( 0x78ac11e01000, 0x5911ddd365e0, 4096, hipMemcpyHostToDevice )
:4:command.cpp :348 : 89974696044 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (CopyHostToDevice) enqueued: 0x5911ddf946a0
:4:rocblit.cpp :834 : 89974696050 us: [pid:1092331 tid:0x78ad14e4d1c0] HSA Async Copy staged H2D dst=0x78ac11e01000, src=0x78ac12900000, size=4096, completion_signal=0x78ad143ff680
:4:rocvirtual.cpp :572 : 89974696054 us: [pid:1092331 tid:0x78ad14e4d1c0] Host wait on completion_signal=0x78ad143ff680
:3:rocvirtual.hpp :67 : 89974696058 us: [pid:1092331 tid:0x78ad14e4d1c0] Host active wait for Signal = (0x78ad143ff680) for -1 ns
:4:command.cpp :287 : 89974696065 us: [pid:1092331 tid:0x78ad14e4d1c0] Queue marker to command queue: 0x5911ddbc9110
:4:command.cpp :348 : 89974696068 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (InternalMarker) enqueued: 0x5911ddf663c0
:4:command.cpp :177 : 89974696072 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf946a0 complete
:4:command.cpp :175 : 89974696075 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf663c0 complete (Wall: 89974696075, CPU: 0, GPU: 0 us)
:4:command.cpp :252 : 89974696081 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddf946a0 to complete, current status 0
:4:command.cpp :266 : 89974696085 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddf946a0 wait completed
:3:hip_memory.cpp :691 : 89974696088 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMemcpy: Returned hipSuccess : : duration: 51 us
:3:hip_platform.cpp :225 : 89974696095 us: [pid:1092331 tid:0x78ad14e4d1c0] __hipPushCallConfiguration ( {4,1,1}, {256,1,1}, 0, stream:<null> )
:3:hip_platform.cpp :229 : 89974696099 us: [pid:1092331 tid:0x78ad14e4d1c0] __hipPushCallConfiguration: Returned hipSuccess :
:3:hip_platform.cpp :234 : 89974696106 us: [pid:1092331 tid:0x78ad14e4d1c0] __hipPopCallConfiguration ( {3721618912,22801,0}, {4294966920,4294967295,1706159000}, 0x7ffc65b1e9b0, 0x7ffc65b1e9a8 )
:3:hip_platform.cpp :243 : 89974696111 us: [pid:1092331 tid:0x78ad14e4d1c0] __hipPopCallConfiguration: Returned hipSuccess :
:3:hip_module.cpp :677 : 89974696124 us: [pid:1092331 tid:0x78ad14e4d1c0] hipLaunchKernel ( 0x5911d8f70840, {4,1,1}, {256,1,1}, 0x7ffc65b1e9f0, 0, stream:<null> )
:3:devprogram.cpp :2648: 89974696178 us: [pid:1092331 tid:0x78ad14e4d1c0] Using Code Object V5.
:4:command.cpp :348 : 89974703739 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (KernelExecution) enqueued: 0x5911ddde4ba0
:3:rocvirtual.cpp :727 : 89974703752 us: [pid:1092331 tid:0x78ad14e4d1c0] Arg0: = ptr:0x78ac11e00000 obj:[0x78ac11e00000-0x78ac11e01000]
:3:rocvirtual.cpp :727 : 89974703757 us: [pid:1092331 tid:0x78ad14e4d1c0] Arg1: = ptr:0x78ac11e01000 obj:[0x78ac11e01000-0x78ac11e02000]
:3:rocvirtual.cpp :727 : 89974703762 us: [pid:1092331 tid:0x78ad14e4d1c0] Arg2: = ptr:0x78ac11e02000 obj:[0x78ac11e02000-0x78ac11e03000]
:3:rocvirtual.cpp :803 : 89974703767 us: [pid:1092331 tid:0x78ad14e4d1c0] Arg3: = val:1024
:3:rocvirtual.cpp :3028: 89974703772 us: [pid:1092331 tid:0x78ad14e4d1c0] ShaderName : _Z10vector_addPiS_S_i
:4:rocvirtual.cpp :926 : 89974703781 us: [pid:1092331 tid:0x78ad14e4d1c0] SWq=0x78ad149f8000, HWq=0x78ac0bd00000, id=1, Dispatch Header = 0x1502 (type=2, barrier=1, acquire=2, release=2), setup=3, grid=[1024, 1, 1], workgroup=[256, 1, 1], private_seg_size=0, group_seg_size=0, kernel_obj=0x78ad149c88c0, kernarg_address=0x78ac0a900000, completion_signal=0x0, correlation_id=0, rptr=0, wptr=0
:3:hip_module.cpp :678 : 89974703790 us: [pid:1092331 tid:0x78ad14e4d1c0] hipLaunchKernel: Returned hipSuccess :
:3:hip_error.cpp :36 : 89974703795 us: [pid:1092331 tid:0x78ad14e4d1c0] hipGetLastError ( )
:3:hip_device_runtime.cpp :620 : 89974703800 us: [pid:1092331 tid:0x78ad14e4d1c0] hipDeviceSynchronize ( )
:4:commandqueue.cpp :147 : 89974703805 us: [pid:1092331 tid:0x78ad14e4d1c0] HW Event not ready, awaiting completion instead
:4:command.cpp :287 : 89974703811 us: [pid:1092331 tid:0x78ad14e4d1c0] Queue marker to command queue: 0x5911ddbc9110
:4:command.cpp :348 : 89974703815 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (InternalMarker) enqueued: 0x5911ddf663c0
:4:rocvirtual.cpp :1091: 89974703821 us: [pid:1092331 tid:0x78ad14e4d1c0] SWq=0x78ad149f8000, HWq=0x78ac0bd00000, id=1, BarrierAND Header = 0x1503 (type=3, barrier=1, acquire=2, release=2), dep_signal=[0x0, 0x0, 0x0, 0x0, 0x0], completion_signal=0x78ad143ff600
:4:rocvirtual.cpp :572 : 89974703826 us: [pid:1092331 tid:0x78ad14e4d1c0] Host wait on completion_signal=0x78ad143ff600
:3:rocvirtual.hpp :67 : 89974703830 us: [pid:1092331 tid:0x78ad14e4d1c0] Host active wait for Signal = (0x78ad143ff600) for -1 ns
:4:command.cpp :177 : 89974703837 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddde4ba0 complete
:4:command.cpp :175 : 89974703840 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf663c0 complete (Wall: 89974703840, CPU: 0, GPU: 0 us)
:4:command.cpp :252 : 89974703845 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddde4ba0 to complete, current status 0
:4:command.cpp :266 : 89974703853 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddde4ba0 wait completed
:4:commandqueue.cpp :163 : 89974703856 us: [pid:1092331 tid:0x78ad14e4d1c0] All commands finished
:3:hip_device_runtime.cpp :624 : 89974703860 us: [pid:1092331 tid:0x78ad14e4d1c0] hipDeviceSynchronize: Returned hipSuccess :
:3:hip_memory.cpp :690 : 89974703867 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMemcpy ( 0x5911ddd375f0, 0x78ac11e02000, 4096, hipMemcpyDeviceToHost )
:4:command.cpp :348 : 89974703876 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (CopyDeviceToHost) enqueued: 0x5911ddf946a0
:4:rocblit.cpp :868 : 89974705215 us: [pid:1092331 tid:0x78ad14e4d1c0] HSA Async Copy staged D2H dst=0x78ac12700000, src=0x78ac11e02000, size=4096, completion_signal=0x78ad143ff580
:4:rocvirtual.cpp :572 : 89974705223 us: [pid:1092331 tid:0x78ad14e4d1c0] Host wait on completion_signal=0x78ad143ff580
:3:rocvirtual.hpp :67 : 89974705227 us: [pid:1092331 tid:0x78ad14e4d1c0] Host active wait for Signal = (0x78ad143ff580) for -1 ns
:4:command.cpp :287 : 89974705234 us: [pid:1092331 tid:0x78ad14e4d1c0] Queue marker to command queue: 0x5911ddbc9110
:4:command.cpp :348 : 89974705238 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (InternalMarker) enqueued: 0x5911ddf663c0
:4:command.cpp :177 : 89974705242 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf946a0 complete
:4:command.cpp :175 : 89974705247 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf663c0 complete (Wall: 89974705247, CPU: 0, GPU: 0 us)
:4:command.cpp :252 : 89974705252 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddf946a0 to complete, current status 0
:4:command.cpp :266 : 89974705255 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddf946a0 wait completed
:3:hip_memory.cpp :691 : 89974705258 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMemcpy: Returned hipSuccess : : duration: 1391 us
c[0] = 0
c[1] = 3
c[2] = 6
c[3] = 9
c[4] = 12
c[5] = 15
c[6] = 18
c[7] = 21
c[8] = 24
c[9] = 27
:3:hip_memory.cpp :667 : 89974705286 us: [pid:1092331 tid:0x78ad14e4d1c0] hipFree ( 0x78ac11e00000 )
:4:commandqueue.cpp :147 : 89974705290 us: [pid:1092331 tid:0x78ad14e4d1c0] HW Event not ready, awaiting completion instead
:4:commandqueue.cpp :163 : 89974705294 us: [pid:1092331 tid:0x78ad14e4d1c0] All commands finished
:4:rocdevice.cpp :2395: 89974705304 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac11e00000
:3:rocdevice.cpp :2418: 89974705308 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2feffe000
:3:hip_memory.cpp :669 : 89974705312 us: [pid:1092331 tid:0x78ad14e4d1c0] hipFree: Returned hipSuccess :
:3:hip_memory.cpp :667 : 89974705316 us: [pid:1092331 tid:0x78ad14e4d1c0] hipFree ( 0x78ac11e01000 )
:4:rocdevice.cpp :2395: 89974705320 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac11e01000
:3:rocdevice.cpp :2418: 89974705322 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2fefff000
:3:hip_memory.cpp :669 : 89974705327 us: [pid:1092331 tid:0x78ad14e4d1c0] hipFree: Returned hipSuccess :
:3:hip_memory.cpp :667 : 89974705330 us: [pid:1092331 tid:0x78ad14e4d1c0] hipFree ( 0x78ac11e02000 )
:4:rocdevice.cpp :2395: 89974705334 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac11e02000
:3:rocdevice.cpp :2418: 89974705338 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2ff000000
:3:hip_memory.cpp :669 : 89974705342 us: [pid:1092331 tid:0x78ad14e4d1c0] hipFree: Returned hipSuccess :
:3:hip_device_runtime.cpp :620 : 89974705380 us: [pid:1092331 tid:0x78ad14e4d1c0] hipDeviceSynchronize ( )
:3:hip_device_runtime.cpp :624 : 89974705384 us: [pid:1092331 tid:0x78ad14e4d1c0] hipDeviceSynchronize: Returned hipSuccess :
:4:command.cpp :348 : 89974705686 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (Marker) enqueued: 0x5911ddf663c0
:3:rocvirtual.cpp :476 : 89974705695 us: [pid:1092331 tid:0x78ad14e4d1c0] Set Handler: handle(0x78ad143ff500), timestamp(0x5911de4fdcc0)
:4:rocvirtual.cpp :1091: 89974705701 us: [pid:1092331 tid:0x78ad14e4d1c0] SWq=0x78ad149f8000, HWq=0x78ac0bd00000, id=1, BarrierAND Header = 0x1503 (type=3, barrier=1, acquire=2, release=2), dep_signal=[0x0, 0x0, 0x0, 0x0, 0x0], completion_signal=0x78ad143ff500
:4:command.cpp :252 : 89974705708 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddf663c0 to complete, current status 2
:3:rocvirtual.cpp :223 : 89974705734 us: [pid:1092331 tid:0x78ac10c006c0] Handler: value(0), timestamp(0x5911de34ff90), handle(0x78ad143ff500)
:4:command.cpp :175 : 89974705750 us: [pid:1092331 tid:0x78ac10c006c0] Command 0x5911ddf663c0 complete (Wall: 89974705749, CPU: 0, GPU: 58 us)
:4:command.cpp :266 : 89974705750 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddf663c0 wait completed
:4:rocdevice.cpp :2395: 89974705972 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac0a900000
:4:rocdevice.cpp :2395: 89974705977 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory (nil)
:3:rocdevice.cpp :3210: 89974705980 us: [pid:1092331 tid:0x78ad14e4d1c0] releaseQueue refCount:0x78ac0bd00000 (0)
:4:runtime.cpp :93 : 89974706035 us: [pid:1092331 tid:0x78ad14e4d1c0] tearDown
:4:rocdevice.cpp :2395: 89974706322 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac12200000
:4:rocdevice.cpp :2395: 89974706405 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ad14e3a000
:3:rocdevice.cpp :286 : 89974706410 us: [pid:1092331 tid:0x78ad14e4d1c0] Deleting hardware queue 0x78ac0bd00000 with refCount 0
:4:rocdevice.cpp :2395: 89974712902 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac12700000
:4:rocdevice.cpp :2395: 89974713096 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac12900000
总结
笔者刚开始遇到这个问题的时候,一直怀疑是不是自己配的环境有问题,尝试了各种配置,没想到最后是官方的运行库有 bug 。不过这个问题也促使笔者探究了 HIP 运行时的细节,对于计算库有了更深的理解。