Jachin Shen
Jachin Shen

Categories

Tags

背景

在上一篇中,笔者配好了 ROCm 的基础 LLVM 编译环境,并成功编译了 HIP 程序,然而实际运行时会出现 invalidDeviceFunction 错误。本文详细介绍该错误的排查过程。

问题详述

笔者询问 GLM-4 得到了一个基础的 HIP 程序:

#include <hip/hip_runtime.h>
#include <iostream>

// 定义kernel
__global__ void vector_add(int *a, int *b, int *c, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int n = 1024; // 数组大小
    int *a, *b, *c;

    // 获取设备数量
    int deviceCount;
    hipError_t err = hipGetDeviceCount(&deviceCount);
    if (err != hipSuccess) {
        std::cerr << "hipGetDeviceCount failed: " << hipGetErrorString(err) << std::endl;
        return -1;
    }

    // 如果有设备,选择第一个设备
    if (deviceCount > 0) {
        err = hipSetDevice(0); // 选择第一个设备
        if (err != hipSuccess) {
            std::cerr << "hipSetDevice failed: " << hipGetErrorString(err) << std::endl;
            return -1;
        } else {
            std::cout << "Using device 0." << std::endl;
        }
    } else {
        std::cerr << "No HIP capable devices found." << std::endl;
        return -1;
    }

    // 分配内存
    err = hipMalloc((void**)&a, n * sizeof(int));
    if (err != hipSuccess) {
        std::cerr << "hipMalloc failed for a: " << hipGetErrorString(err) << std::endl;
        return -1;
    }
    err = hipMalloc((void**)&b, n * sizeof(int));
    if (err != hipSuccess) {
        std::cerr << "hipMalloc failed for b: " << hipGetErrorString(err) << std::endl;
        return -1;
    }
    err = hipMalloc((void**)&c, n * sizeof(int));
    if (err != hipSuccess) {
        std::cerr << "hipMalloc failed for c: " << hipGetErrorString(err) << std::endl;
        return -1;
    }

    // 初始化数组
    int *host_a = new int[n];
    int *host_b = new int[n];
    int *host_c = new int[n];
    for (int i = 0; i < n; i++) {
        host_a[i] = i;
        host_b[i] = i * 2;
        host_c[i] = 0;
    }

    // 将数据复制到设备内存
    err = hipMemcpy(a, host_a, n * sizeof(int), hipMemcpyHostToDevice);
    if (err != hipSuccess) {
        std::cerr << "hipMemcpy failed for a: " << hipGetErrorString(err) << std::endl;
        return -1;
    }
    err = hipMemcpy(b, host_b, n * sizeof(int), hipMemcpyHostToDevice);
    if (err != hipSuccess) {
        std::cerr << "hipMemcpy failed for b: " << hipGetErrorString(err) << std::endl;
        return -1;
    }

    // 启动kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    vector_add<<<blocksPerGrid, threadsPerBlock>>>(a, b, c, n);
    err = hipGetLastError();
    if (err != hipSuccess) {
        std::cerr << "hipLaunchKernel failed: " << hipGetErrorString(err) << std::endl;
        return -1;
    }

    // 同步以等待kernel完成
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
        std::cerr << "hipDeviceSynchronize failed: " << hipGetErrorString(err) << std::endl;
        return -1;
    }

    // 从设备内存复制结果回主机
    err = hipMemcpy(host_c, c, n * sizeof(int), hipMemcpyDeviceToHost);
    if (err != hipSuccess) {
        std::cerr << "hipMemcpy failed for c: " << hipGetErrorString(err) << std::endl;
        return -1;
    }

    // 输出结果
    for (int i = 0; i < 10; i++) {
        std::cout << "c[" << i << "] = " << host_c[i] << std::endl;
    }

    // 释放内存
    err = hipFree(a);
    if (err != hipSuccess) {
        std::cerr << "hipFree failed for a: " << hipGetErrorString(err) << std::endl;
    }
    err = hipFree(b);
    if (err != hipSuccess) {
        std::cerr << "hipFree failed for b: " << hipGetErrorString(err) << std::endl;
    }
    err = hipFree(c);
    if (err != hipSuccess) {
        std::cerr << "hipFree failed for c: " << hipGetErrorString(err) << std::endl;
    }

    delete[] host_a;
    delete[] host_b;
    delete[] host_c;

    return 0;
}

使用 hipcc 编译:hipcc -o vectoradd vectoradd.cpp。可以正常编译,但是运行时,会在 hipLaunchKernel 处报错,这就很奇怪了。

  1. 如果是编译环境有问题,应该是编译不过,而不是运行时出错。
  2. 如果运行库有问题,应该一开始运行就出错,而不是 hipMalloc hipMemcpy 这些函数调用正常,hipLaunchKernel 调用出错。
  3. 使用旧版 ROCm 5.7 的 hipcc 编译可以正常运行,所以硬件没有问题。

参考教程,出现 invalid device function 的常见错误是编译的架构和运行设备的架构不匹配,但是笔者已经指定了架构为 gfx1031,所以应该不是这个问题。教程进一步提到,可以使用 --save-temps 参数保存中间结果。运行后产生了 host 和 device 的中间文件,检查汇编文件也没有发现问题。

寻找了一番资料,找到了官方的调试指南,可以使用 AMD_LOG_LEVEL 环境变量来设置日志级别,于是尝试设置 AMD_LOG_LEVEL=5,再次运行,日志如下:

$ AMD_LOG_LEVEL=5 ./vectoradd 
:3:rocdevice.cpp            :468 : 89870210473 us: [pid:1089444 tid:0x76cc16cb11c0] Initializing HSA stack.
:3:rocdevice.cpp            :554 : 89870224328 us: [pid:1089444 tid:0x76cc16cb11c0] Enumerated GPU agents = 1
:3:rocdevice.cpp            :234 : 89870224364 us: [pid:1089444 tid:0x76cc16cb11c0] Numa selects cpu agent[0]=0x5c301dce7ac0(fine=0x5c301dcea210,coarse=0x5c301dce5a80) for gpu agent=0x5c301dcf28c0 CPU<->GPU XGMI=0
:3:rocsettings.cpp          :290 : 89870224371 us: [pid:1089444 tid:0x76cc16cb11c0] Using dev kernel arg wa = 0
:3:comgrctx.cpp             :33  : 89870224375 us: [pid:1089444 tid:0x76cc16cb11c0] Loading COMGR library.
:3:comgrctx.cpp             :126 : 89870224406 us: [pid:1089444 tid:0x76cc16cb11c0] Loaded COMGR library version 2.8.
:3:rocdevice.cpp            :1810: 89870224918 us: [pid:1089444 tid:0x76cc16cb11c0] Gfx Major/Minor/Stepping: 10/3/1
:3:rocdevice.cpp            :1812: 89870224922 us: [pid:1089444 tid:0x76cc16cb11c0] HMM support: 1, XNACK: 0, Direct host access: 0
:3:rocdevice.cpp            :1814: 89870224926 us: [pid:1089444 tid:0x76cc16cb11c0] Max SDMA Read Mask: 0x3, Max SDMA Write Mask: 0x3
:4:rocdevice.cpp            :2222: 89870225341 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc14900000, size 0x101000, numa_node = 0
:4:rocdevice.cpp            :2222: 89870226242 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc14700000, size 0x101000, numa_node = 0
:4:rocdevice.cpp            :2222: 89870227212 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc0fa00000, size 0x400000, numa_node = 0
:4:rocdevice.cpp            :2222: 89870227568 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc16ca3000, size 0x38, numa_node = 0
:4:runtime.cpp              :85  : 89870227639 us: [pid:1089444 tid:0x76cc16cb11c0] init
:3:hip_context.cpp          :49  : 89870227642 us: [pid:1089444 tid:0x76cc16cb11c0] Direct Dispatch: 1
:3:hip_device_runtime.cpp   :651 : 89870227658 us: [pid:1089444 tid:0x76cc16cb11c0]  hipGetDeviceCount ( 0x7ffc16ba1714 ) 
:3:hip_device_runtime.cpp   :653 : 89870227663 us: [pid:1089444 tid:0x76cc16cb11c0] hipGetDeviceCount: Returned hipSuccess : 
:3:hip_device_runtime.cpp   :666 : 89870227668 us: [pid:1089444 tid:0x76cc16cb11c0]  hipSetDevice ( 0 ) 
:3:hip_device_runtime.cpp   :670 : 89870227671 us: [pid:1089444 tid:0x76cc16cb11c0] hipSetDevice: Returned hipSuccess : 
Using device 0.
:3:hip_memory.cpp           :615 : 89870227932 us: [pid:1089444 tid:0x76cc16cb11c0]  hipMalloc ( 0x7ffc16ba1708, 4096 ) 
:4:rocdevice.cpp            :2379: 89870228289 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa device memory 0x76cc14400000, size 0x1000
:3:rocdevice.cpp            :2418: 89870228294 us: [pid:1089444 tid:0x76cc16cb11c0] Device=0x5c301dd2a920, freeMem_ = 0x2fefff000
:3:hip_memory.cpp           :617 : 89870228299 us: [pid:1089444 tid:0x76cc16cb11c0] hipMalloc: Returned hipSuccess : 0x76cc14400000: duration: 367 us
:3:hip_memory.cpp           :615 : 89870228303 us: [pid:1089444 tid:0x76cc16cb11c0]  hipMalloc ( 0x7ffc16ba1700, 4096 ) 
:4:rocdevice.cpp            :2379: 89870228307 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa device memory 0x76cc14401000, size 0x1000
:3:rocdevice.cpp            :2418: 89870228312 us: [pid:1089444 tid:0x76cc16cb11c0] Device=0x5c301dd2a920, freeMem_ = 0x2feffe000
:3:hip_memory.cpp           :617 : 89870228318 us: [pid:1089444 tid:0x76cc16cb11c0] hipMalloc: Returned hipSuccess : 0x76cc14401000: duration: 15 us
:3:hip_memory.cpp           :615 : 89870228323 us: [pid:1089444 tid:0x76cc16cb11c0]  hipMalloc ( 0x7ffc16ba16f8, 4096 ) 
:4:rocdevice.cpp            :2379: 89870228328 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa device memory 0x76cc14402000, size 0x1000
:3:rocdevice.cpp            :2418: 89870228332 us: [pid:1089444 tid:0x76cc16cb11c0] Device=0x5c301dd2a920, freeMem_ = 0x2feffd000
:3:hip_memory.cpp           :617 : 89870228336 us: [pid:1089444 tid:0x76cc16cb11c0] hipMalloc: Returned hipSuccess : 0x76cc14402000: duration: 13 us
:3:hip_memory.cpp           :690 : 89870228511 us: [pid:1089444 tid:0x76cc16cb11c0]  hipMemcpy ( 0x76cc14400000, 0x5c301dd38510, 4096, hipMemcpyHostToDevice ) 
:3:rocdevice.cpp            :3030: 89870228522 us: [pid:1089444 tid:0x76cc16cb11c0] Number of allocated hardware queues with low priority: 0, with normal priority: 0, with high priority: 0, maximum per priority is: 4
:3:rocdevice.cpp            :3106: 89870239538 us: [pid:1089444 tid:0x76cc16cb11c0] Created SWq=0x76cc16c6a000 to map on HWq=0x76cc0dd00000 with size 16384 with priority 1, cooperative: 0
:3:rocdevice.cpp            :3198: 89870239560 us: [pid:1089444 tid:0x76cc16cb11c0] acquireQueue refCount: 0x76cc0dd00000 (1)
:4:rocdevice.cpp            :2222: 89870239969 us: [pid:1089444 tid:0x76cc16cb11c0] Allocate hsa host memory 0x76cc0db00000, size 0x100000, numa_node = 0
:3:devprogram.cpp           :2648: 89870467601 us: [pid:1089444 tid:0x76cc16cb11c0] Using Code Object V5.
:4:command.cpp              :348 : 89870473159 us: [pid:1089444 tid:0x76cc16cb11c0] Command (CopyHostToDevice) enqueued: 0x5c301e04e270
:4:rocblit.cpp              :834 : 89870474035 us: [pid:1089444 tid:0x76cc16cb11c0] HSA Async Copy staged H2D dst=0x76cc14400000, src=0x76cc14900000, size=4096, completion_signal=0x76cc161ff700
:4:rocvirtual.cpp           :572 : 89870474042 us: [pid:1089444 tid:0x76cc16cb11c0] Host wait on completion_signal=0x76cc161ff700
:3:rocvirtual.hpp           :67  : 89870474048 us: [pid:1089444 tid:0x76cc16cb11c0] Host active wait for Signal = (0x76cc161ff700) for -1 ns
:4:command.cpp              :287 : 89870474055 us: [pid:1089444 tid:0x76cc16cb11c0] Queue marker to command queue: 0x5c301dbcc110
:4:command.cpp              :348 : 89870474059 us: [pid:1089444 tid:0x76cc16cb11c0] Command (InternalMarker) enqueued: 0x5c301df8c530
:4:command.cpp              :177 : 89870474066 us: [pid:1089444 tid:0x76cc16cb11c0] Command 0x5c301e04e270 complete
:4:command.cpp              :175 : 89870474070 us: [pid:1089444 tid:0x76cc16cb11c0] Command 0x5c301df8c530 complete (Wall: 89870474069, CPU: 0, GPU: 0 us)
:4:command.cpp              :252 : 89870474075 us: [pid:1089444 tid:0x76cc16cb11c0] Waiting for event 0x5c301e04e270 to complete, current status 0
:4:command.cpp              :266 : 89870474078 us: [pid:1089444 tid:0x76cc16cb11c0] Event 0x5c301e04e270 wait completed
:3:hip_memory.cpp           :691 : 89870474082 us: [pid:1089444 tid:0x76cc16cb11c0] hipMemcpy: Returned hipSuccess : : duration: 245571 us
:3:hip_memory.cpp           :690 : 89870474091 us: [pid:1089444 tid:0x76cc16cb11c0]  hipMemcpy ( 0x76cc14401000, 0x5c301dd39520, 4096, hipMemcpyHostToDevice ) 
:4:command.cpp              :348 : 89870474098 us: [pid:1089444 tid:0x76cc16cb11c0] Command (CopyHostToDevice) enqueued: 0x5c301e04e270
:4:rocblit.cpp              :834 : 89870474105 us: [pid:1089444 tid:0x76cc16cb11c0] HSA Async Copy staged H2D dst=0x76cc14401000, src=0x76cc14900000, size=4096, completion_signal=0x76cc161ff680
:4:rocvirtual.cpp           :572 : 89870474108 us: [pid:1089444 tid:0x76cc16cb11c0] Host wait on completion_signal=0x76cc161ff680
:3:rocvirtual.hpp           :67  : 89870474113 us: [pid:1089444 tid:0x76cc16cb11c0] Host active wait for Signal = (0x76cc161ff680) for -1 ns
:4:command.cpp              :287 : 89870474117 us: [pid:1089444 tid:0x76cc16cb11c0] Queue marker to command queue: 0x5c301dbcc110
:4:command.cpp              :348 : 89870474121 us: [pid:1089444 tid:0x76cc16cb11c0] Command (InternalMarker) enqueued: 0x5c301df8c530
:4:command.cpp              :177 : 89870474125 us: [pid:1089444 tid:0x76cc16cb11c0] Command 0x5c301e04e270 complete
:4:command.cpp              :175 : 89870474129 us: [pid:1089444 tid:0x76cc16cb11c0] Command 0x5c301df8c530 complete (Wall: 89870474128, CPU: 0, GPU: 0 us)
:4:command.cpp              :252 : 89870474133 us: [pid:1089444 tid:0x76cc16cb11c0] Waiting for event 0x5c301e04e270 to complete, current status 0
:4:command.cpp              :266 : 89870474138 us: [pid:1089444 tid:0x76cc16cb11c0] Event 0x5c301e04e270 wait completed
:3:hip_memory.cpp           :691 : 89870474140 us: [pid:1089444 tid:0x76cc16cb11c0] hipMemcpy: Returned hipSuccess : : duration: 49 us
:3:hip_platform.cpp         :225 : 89870474148 us: [pid:1089444 tid:0x76cc16cb11c0]  __hipPushCallConfiguration ( {4,1,1}, {256,1,1}, 0, stream:<null> ) 
:3:hip_platform.cpp         :229 : 89870474152 us: [pid:1089444 tid:0x76cc16cb11c0] __hipPushCallConfiguration: Returned hipSuccess : 
:3:hip_platform.cpp         :234 : 89870474159 us: [pid:1089444 tid:0x76cc16cb11c0]  __hipPopCallConfiguration ( {500405536,23600,0}, {4294966920,4294967295,381294696}, 0x7ffc16ba1680, 0x7ffc16ba1678 ) 
:3:hip_platform.cpp         :243 : 89870474163 us: [pid:1089444 tid:0x76cc16cb11c0] __hipPopCallConfiguration: Returned hipSuccess : 
:3:hip_module.cpp           :677 : 89870474178 us: [pid:1089444 tid:0x76cc16cb11c0]  hipLaunchKernel ( 0x5c2ff28dd840, {4,1,1}, {256,1,1}, 0x7ffc16ba16c0, 0, stream:<null> ) 
:3:hip_module.cpp           :678 : 89870474184 us: [pid:1089444 tid:0x76cc16cb11c0] hipLaunchKernel: Returned hipErrorInvalidDeviceFunction : 
:3:hip_error.cpp            :36  : 89870474189 us: [pid:1089444 tid:0x76cc16cb11c0]  hipGetLastError (  ) 
hipLaunchKernelGGL failed: invalid device function
:3:hip_device_runtime.cpp   :620 : 89870474224 us: [pid:1089444 tid:0x76cc16cb11c0]  hipDeviceSynchronize (  ) 
:4:commandqueue.cpp         :147 : 89870474228 us: [pid:1089444 tid:0x76cc16cb11c0] HW Event not ready, awaiting completion instead
:4:commandqueue.cpp         :163 : 89870474232 us: [pid:1089444 tid:0x76cc16cb11c0] All commands finished
:3:hip_device_runtime.cpp   :624 : 89870474237 us: [pid:1089444 tid:0x76cc16cb11c0] hipDeviceSynchronize: Returned hipSuccess : 
:4:command.cpp              :348 : 89870474379 us: [pid:1089444 tid:0x76cc16cb11c0] Command (Marker) enqueued: 0x5c301df8c530
:3:rocvirtual.cpp           :476 : 89870474391 us: [pid:1089444 tid:0x76cc16cb11c0] Set Handler: handle(0x76cc161ff600), timestamp(0x5c301e5076f0)
:4:rocvirtual.cpp           :1091: 89870474397 us: [pid:1089444 tid:0x76cc16cb11c0] SWq=0x76cc16c6a000, HWq=0x76cc0dd00000, id=1, BarrierAND Header = 0x1503 (type=3, barrier=1, acquire=2, release=2), dep_signal=[0x0, 0x0, 0x0, 0x0, 0x0], completion_signal=0x76cc161ff600
:4:command.cpp              :252 : 89870474403 us: [pid:1089444 tid:0x76cc16cb11c0] Waiting for event 0x5c301df8c530 to complete, current status 2
:3:rocvirtual.cpp           :223 : 89870474533 us: [pid:1089444 tid:0x76cc0e8006c0] Handler: value(0), timestamp(0x5c301e1a33e0), handle(0x76cc161ff600)
:4:command.cpp              :266 : 89870474549 us: [pid:1089444 tid:0x76cc16cb11c0] Event 0x5c301df8c530 wait completed
:4:command.cpp              :175 : 89870474549 us: [pid:1089444 tid:0x76cc0e8006c0] Command 0x5c301df8c530 complete (Wall: 89870474548, CPU: 0, GPU: 162 us)
:4:rocdevice.cpp            :2395: 89870474788 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc0db00000
:4:rocdevice.cpp            :2395: 89870474793 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory (nil)
:3:rocdevice.cpp            :3210: 89870474797 us: [pid:1089444 tid:0x76cc16cb11c0] releaseQueue refCount:0x76cc0dd00000 (0)
:4:runtime.cpp              :93  : 89870474810 us: [pid:1089444 tid:0x76cc16cb11c0] tearDown
:4:rocdevice.cpp            :2395: 89870474957 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc0fa00000
:4:rocdevice.cpp            :2395: 89870475057 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc16ca3000
:3:rocdevice.cpp            :286 : 89870475063 us: [pid:1089444 tid:0x76cc16cb11c0] Deleting hardware queue 0x76cc0dd00000 with refCount 0
:4:rocdevice.cpp            :2395: 89870482134 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc14700000
:4:rocdevice.cpp            :2395: 89870482356 us: [pid:1089444 tid:0x76cc16cb11c0] Free hsa memory 0x76cc14900000

可惜的是,这个日志最多只记录到了 hipLaunchKernel 返回错误代码,并没有更详细的错误信息。目前来看,唯一的办法就是检查 hipLaunchKernel 的源码了。

源码溯源

hipLaunchKernel 位于 hipamd/src/hip_module.cpp ,内容很简单:

hipError_t hipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim,
                                      void** args, size_t sharedMemBytes, hipStream_t stream) {
  HIP_INIT_API(hipLaunchKernel, hostFunction, gridDim, blockDim, args, sharedMemBytes, stream);
  HIP_RETURN(hipLaunchKernel_common(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream));
}

hipLaunchKernel_common 就在上头:

hipError_t hipLaunchKernel_common(const void* hostFunction, dim3 gridDim, dim3 blockDim,
                                             void** args, size_t sharedMemBytes,
                                             hipStream_t stream) {
  STREAM_CAPTURE(hipLaunchKernel, stream, hostFunction, gridDim, blockDim, args, sharedMemBytes);
  return ihipLaunchKernel(hostFunction, gridDim, blockDim, args, sharedMemBytes, stream, nullptr,
                          nullptr, 0);
}

溯源到 ihipLaunchKernel ,位于 hipamd/src/hip_platform.cpp ,这里实现了主要逻辑:

hipError_t ihipLaunchKernel(const void* hostFunction, dim3 gridDim, dim3 blockDim, void** args,
                            size_t sharedMemBytes, hipStream_t stream, hipEvent_t startEvent,
                            hipEvent_t stopEvent, int flags) {
  hipFunction_t func = nullptr;
  int deviceId = hip::Stream::DeviceId(stream);
  hipError_t hip_error = PlatformState::instance().getStatFunc(&func, hostFunction, deviceId);
  if ((hip_error != hipSuccess) || (func == nullptr)) {
    if (hip_error == hipErrorNoBinaryForGpu) {
      return hip_error;
    } else {
      return hipErrorInvalidDeviceFunction;
    }
  }
  size_t globalWorkSizeX = static_cast<size_t>(gridDim.x) * blockDim.x;
  size_t globalWorkSizeY = static_cast<size_t>(gridDim.y) * blockDim.y;
  size_t globalWorkSizeZ = static_cast<size_t>(gridDim.z) * blockDim.z;
  if (globalWorkSizeX > std::numeric_limits<uint32_t>::max() ||
      globalWorkSizeY > std::numeric_limits<uint32_t>::max() ||
      globalWorkSizeZ > std::numeric_limits<uint32_t>::max()) {
    return hipErrorInvalidConfiguration;
  }
  return ihipModuleLaunchKernel(
      func, static_cast<uint32_t>(globalWorkSizeX), static_cast<uint32_t>(globalWorkSizeY),
      static_cast<uint32_t>(globalWorkSizeZ), blockDim.x, blockDim.y, blockDim.z, sharedMemBytes,
      stream, args, nullptr, startEvent, stopEvent, flags);
}

可能抛出 invalid device function 的只有 PlatformState::instance().getStatFunc 这个函数,位于 hipamd/src/hip_global.cpp

hipError_t Function::getStatFunc(hipFunction_t* hfunc, int deviceId) {
  guarantee(modules_ != nullptr, "Module not initialized");

  if (dFunc_.size() != g_devices.size()) {
    return hipErrorNoBinaryForGpu;
  }

  hipModule_t hmod = nullptr;
  IHIP_RETURN_ONFAIL((*modules_)->BuildProgram(deviceId));
  IHIP_RETURN_ONFAIL((*modules_)->GetModule(deviceId, &hmod));

  if (dFunc_[deviceId] == nullptr) {
    dFunc_[deviceId] = new DeviceFunc(name_, hmod);
  }
  *hfunc = dFunc_[deviceId]->asHipFunction();

  return hipSuccess;
}

这里可能出问题的在与 BuildProgramGetModule 函数。

BuildProgram 位于 hipamd/src/hip_fatbin.cpp

hipError_t FatBinaryInfo::BuildProgram(const int device_id) {

  // Device Id Check and Add DeviceProgram if not added so far
  DeviceIdCheck(device_id);
  IHIP_RETURN_ONFAIL(AddDevProgram(device_id));

  // If Program was already built skip this step and return success
  FatBinaryDeviceInfo* fbd_info = fatbin_dev_info_[device_id];
  if (fbd_info->prog_built_ == false) {
    if(CL_SUCCESS != fbd_info->program_->build(g_devices[device_id]->devices(),
                                               nullptr, nullptr, nullptr,
                                               kOptionChangeable, kNewDevProg)) {
      return hipErrorNoBinaryForGpu;
    }
    fbd_info->prog_built_ = true;
  }

  if (!fbd_info->program_->load()) {
    return hipErrorNoBinaryForGpu;
  }
  return hipSuccess;
}

这里有可能抛出 invalid device function 错误的地方只有 AddDevProgram 函数,就在 BuildProgram 上方代码中:

hipError_t FatBinaryInfo::AddDevProgram(const int device_id) {
  // Device Id bounds Check
  DeviceIdCheck(device_id);

  FatBinaryDeviceInfo* fbd_info = fatbin_dev_info_[device_id];
  if (fbd_info == nullptr) {
    return hipErrorInvalidKernelFile;
  }

  // If fat binary was already added, skip this step and return success
  if (fbd_info->add_dev_prog_ == false) {
    amd::Context* ctx = g_devices[device_id]->asContext();
    if (CL_SUCCESS != fbd_info->program_->addDeviceProgram(*ctx->devices()[0],
                                          fbd_info->binary_image_,
                                          fbd_info->binary_size_, false,
                                          nullptr, nullptr, fdesc_,
                                          fbd_info->binary_offset_, uri_)) {
      return hipErrorInvalidKernelFile;
    }
    fbd_info->add_dev_prog_ = true;
  }
  return hipSuccess;
}

这里有可能出问题的地方在与 addDeviceProgram 函数,位于 rocclr/platform/program.cpp

int32_t Program::addDeviceProgram(Device& device, const void* image, size_t length,
                                  bool make_copy, amd::option::Options* options,
                                  const amd::Program* same_prog, amd::Os::FileDesc fdesc,
                                  size_t foffset, std::string uri) {
  if (image != NULL &&  !amd::Elf::isElfMagic((const char*)image)) {
    if (device.settings().useLightning_) {
      return CL_INVALID_BINARY;
    }
#if defined(WITH_COMPILER_LIB)
    else if (!amd::Hsail::ValidateBinaryImage(
          image, length, language_ == SPIRV ? BINARY_TYPE_SPIRV : BINARY_TYPE_ELF | BINARY_TYPE_LLVM)) {
      return CL_INVALID_BINARY;
    }
#endif // !defined(WITH_COMPILER_LIB)
  }

  // Check if the device is already associated with this program
  if (deviceList_.find(&device) != deviceList_.end()) {
    return CL_INVALID_VALUE;
  }

  Device& rootDev = device;

  // if the rootDev is already associated with a program
  if (devicePrograms_[&rootDev] != NULL) {
    return CL_SUCCESS;
  }

#if defined(WITH_COMPILER_LIB)
  bool emptyOptions = (options == nullptr);
#endif
  amd::option::Options emptyOpts;
  if (options == NULL) {
    options = &emptyOpts;
  }

#if defined(WITH_COMPILER_LIB)
  if (image != NULL && length != 0 && amd::Hsail::ValidateBinaryImage(image, length, BINARY_TYPE_ELF)) {
    acl_error errorCode;
    aclBinary* binary = amd::Hsail::ReadFromMem(image, length, &errorCode);
    if (errorCode != ACL_SUCCESS) {
      return CL_INVALID_BINARY;
    }
    const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
    assert(symbol && "symbol not found");
    std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
    size_t symSize = 0;
    const void* opts = amd::Hsail::ExtractSymbol(device.binCompiler(), binary, &symSize, aclCOMMENT,
                                                 symName.c_str(), &errorCode);
    // if we have options from binary and input options was not specified
    if (opts != NULL && emptyOptions) {
      std::string sBinOptions = std::string((char*)opts, symSize);
      if (!amd::option::parseAllOptions(sBinOptions, *options, false, false)) {
        programLog_ = options->optionsLog();
        LogError("Parsing compilation options from binary failed.");
        return CL_INVALID_COMPILER_OPTIONS;
      }
    }
    options->oVariables->Legacy = !device.settings().useLightning_ ?
                                     isAMDILTarget(*amd::aclutGetTargetInfo(binary)) :
                                     isHSAILTarget(*amd::aclutGetTargetInfo(binary));
    amd::Hsail::BinaryFini(binary);
  }
#endif // defined(WITH_COMPILER_LIB)
  options->oVariables->BinaryIsSpirv = language_ == SPIRV;
  device::Program* program = rootDev.createProgram(*this, options);
  if (program == NULL) {
    return CL_OUT_OF_HOST_MEMORY;
  }

  if (image != NULL) {
    const uint8_t* memory = std::get<0>(binary(rootDev));
    // clone 'binary' (it is owned by the host thread).
    if (memory == NULL) {
      if (make_copy) {
        auto *image_copy = new (std::nothrow) uint8_t[length];
        if (image_copy == NULL) {
          delete program;
          return CL_OUT_OF_HOST_MEMORY;
        }

        ::memcpy(image_copy, image, length);
        memory = image_copy;
      } else {
        memory = static_cast<const uint8_t*>(image);
      }

      // Save the original image
      binary_[&rootDev] = std::make_tuple(memory, length, make_copy);
    }

    const device::Program* same_dev_prog = nullptr;
    if ((amd::IS_HIP) && (same_prog != nullptr)) {
      const auto &same_dev_prog_map_ = same_prog->devicePrograms();
      guarantee(same_dev_prog_map_.size() == 1, "For same_prog, devicePrograms size != 1");
      same_dev_prog = same_dev_prog_map_.begin()->second;
    }

    if (!program->setBinary(reinterpret_cast<const char*>(memory), length, same_dev_prog,
                            fdesc, foffset, uri)) {
      delete program;
      return CL_INVALID_BINARY;
    }
  }

  devicePrograms_[&rootDev] = program;

  deviceList_.insert(&device);
  return CL_SUCCESS;
}

这里的逻辑比较复杂,不好静态分析。所幸 AMD 提供了一系列调试函数,我主要使用了 warning 函数当 print 用。利用 GLM-4 自动增加一系列标记 warning ,这种低级编程很适合 LLM 完成 :

int32_t Program::addDeviceProgram(Device& device, const void* image, size_t length,
                                 bool make_copy, amd::option::Options* options,
                                 const amd::Program* same_prog, amd::Os::FileDesc fdesc,
                                 size_t foffset, std::string uri) {
 warning("Entering addDeviceProgram");

 if (image != NULL &&  !amd::Elf::isElfMagic((const char*)image)) {
   warning("Invalid ELF magic number");
   if (device.settings().useLightning_) {
     warning("Device uses Lightning, returning CL_INVALID_BINARY");
     return CL_INVALID_BINARY;
   }
#if defined(WITH_COMPILER_LIB)
   else if (!amd::Hsail::ValidateBinaryImage(
         image, length, language_ == SPIRV ? BINARY_TYPE_SPIRV : BINARY_TYPE_ELF | BINARY_TYPE_LLVM)) {
     warning("Binary image validation failed, returning CL_INVALID_BINARY");
     return CL_INVALID_BINARY;
   }
#endif // !defined(WITH_COMPILER_LIB)
 }

 // Check if the device is already associated with this program
 if (deviceList_.find(&device) != deviceList_.end()) {
   warning("Device already associated with this program, returning CL_INVALID_VALUE");
   return CL_INVALID_VALUE;
 }

 Device& rootDev = device;

 // if the rootDev is already associated with a program
 if (devicePrograms_[&rootDev] != NULL) {
   warning("Root device already associated with a program, returning CL_SUCCESS");
   return CL_SUCCESS;
 }

#if defined(WITH_COMPILER_LIB)
 bool emptyOptions = (options == nullptr);
#endif
 amd::option::Options emptyOpts;
 if (options == NULL) {
   warning("Options is NULL, using empty options");
   options = &emptyOpts;
 }

#if defined(WITH_COMPILER_LIB)
 if (image != NULL && length != 0 && amd::Hsail::ValidateBinaryImage(image, length, BINARY_TYPE_ELF)) {
   warning("Valid ELF binary image detected");
   acl_error errorCode;
   aclBinary* binary = amd::Hsail::ReadFromMem(image, length, &errorCode);
   if (errorCode != ACL_SUCCESS) {
     warning("Reading binary from memory failed, returning CL_INVALID_BINARY");
     return CL_INVALID_BINARY;
   }
   const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
   assert(symbol && "symbol not found");
   std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
   size_t symSize = 0;
   const void* opts = amd::Hsail::ExtractSymbol(device.binCompiler(), binary, &symSize, aclCOMMENT,
                                                symName.c_str(), &errorCode);
   // if we have options from binary and input options was not specified
   if (opts != NULL && emptyOptions) {
     warning("Extracted options from binary, parsing them");
     std::string sBinOptions = std::string((char*)opts, symSize);
     if (!amd::option::parseAllOptions(sBinOptions, *options, false, false)) {
       programLog_ = options->optionsLog();
       LogError("Parsing compilation options from binary failed.");
       warning("Parsing compilation options from binary failed, returning CL_INVALID_COMPILER_OPTIONS");
       return CL_INVALID_COMPILER_OPTIONS;
     }
   }
   options->oVariables->Legacy = !device.settings().useLightning_ ?
                                    isAMDILTarget(*amd::aclutGetTargetInfo(binary)) :
                                    isHSAILTarget(*amd::aclutGetTargetInfo(binary));
   amd::Hsail::BinaryFini(binary);
 }
#endif // defined(WITH_COMPILER_LIB)
 options->oVariables->BinaryIsSpirv = language_ == SPIRV;
 warning("Creating device program");
 device::Program* program = rootDev.createProgram(*this, options);
 if (program == NULL) {
   warning("Failed to create device program, returning CL_OUT_OF_HOST_MEMORY");
   return CL_OUT_OF_HOST_MEMORY;
 }

 if (image != NULL) {
   warning("Setting binary for device program");
   const uint8_t* memory = std::get<0>(binary(rootDev));
   // clone 'binary' (it is owned by the host thread).
   if (memory == NULL) {
     if (make_copy) {
       warning("Making a copy of the binary image");
       auto *image_copy = new (std::nothrow) uint8_t[length];
       if (image_copy == NULL) {
         delete program;
         warning("Failed to allocate memory for image copy, returning CL_OUT_OF_HOST_MEMORY");
         return CL_OUT_OF_HOST_MEMORY;
       }

       ::memcpy(image_copy, image, length);
       memory = image_copy;
     } else {
       memory = static_cast<const uint8_t*>(image);
     }

     // Save the original image
     binary_[&rootDev] = std::make_tuple(memory, length, make_copy);
   }

   const device::Program* same_dev_prog = nullptr;
   if ((amd::IS_HIP) && (same_prog != nullptr)) {
     const auto &same_dev_prog_map_ = same_prog->devicePrograms();
     guarantee(same_dev_prog_map_.size() == 1, "For same_prog, devicePrograms size != 1");
     same_dev_prog = same_dev_prog_map_.begin()->second;
   }

    bool ret;
    ret = program->setBinary(reinterpret_cast<const char*>(memory), length, same_dev_prog,
                           fdesc, foffset, uri);

    if (ret) {
      warning("outside setBinary return true");
    } else {
      warning("outside setBinary return false");
    }
    if (!ret) {
     warning("Failed to set binary for device program, returning CL_INVALID_BINARY");
     delete program;
     return CL_INVALID_BINARY;
   }
 }

 devicePrograms_[&rootDev] = program;
 deviceList_.insert(&device);
 warning("Device program added successfully, returning CL_SUCCESS");
 return CL_SUCCESS;
}

重新编译运行后,发现问题就在第一步 isElfMagic 函数,位于 rocclr/elf/elf.cpp

bool Elf::isElfMagic(const char* p)
{
  if (p == nullptr || strncmp(p, ELFMAG, SELFMAG) != 0) {
    return false;
  }
  return true;
}

ELFMAG 这个宏的定义在文件头:

#if !defined(ELFMAG)
#define ELFMAG  "\177ELF"
#define SELFMAG 4
#endif

就是常见的 “ELF” 字符串。而程序运行的时候 image 携带的内容是 __CLANG_OFFLOAD_BUNDLE__,这是另外一个数据结构的 magic,位于 hipamd/src/hip_code_object.cpp

// In uncompressed mode
constexpr char kOffloadBundleUncompressedMagicStr[] = "__CLANG_OFFLOAD_BUNDLE__";
static constexpr size_t kOffloadBundleUncompressedMagicStrSize =
    sizeof(kOffloadBundleUncompressedMagicStr);

这里问题就浮现了,两个不同的 magic 怎么会碰到一起进行比较呢?于是转而搜索 image 的来源:

  1. image 来自 fbd_info->binary_image_
  2. fbd_info->binary_image_ 来自 fatbin_dev_info_[device_id]
  3. fatbin_dev_info_ 是成员变量,定义在 hipamd/src/hip_fatbin.hpp

接下来寻找 fatbin_dev_info_ 的赋值,发现在 hipamd/src/hip_fatbin.cpp

hipError_t FatBinaryInfo::ExtractFatBinary(const std::vector<hip::Device*>& devices) {
  if (!HIP_USE_RUNTIME_UNBUNDLER) {
    return ExtractFatBinaryUsingCOMGR(devices);
  }

  hipError_t hip_error = hipSuccess;
  std::vector<std::pair<const void*, size_t>> code_objs;

  // Copy device names for Extract Code object File
  std::vector<std::string> device_names;
  device_names.reserve(devices.size());
  for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
    device_names.push_back(devices[dev_idx]->devices()[0]->isa().isaName());
  }

  // We are given file name, get the file desc and file size
  if (fname_.size() > 0) {
    // Get File Handle & size of the file.
    if (!amd::Os::GetFileHandle(fname_.c_str(), &fdesc_, &fsize_)) {
      return hipErrorFileNotFound;
    }
    if (fsize_ == 0) {
      return hipErrorInvalidImage;
    }

    // Extract the code object from file
    hip_error = CodeObject::ExtractCodeObjectFromFile(fdesc_, fsize_, &image_,
                device_names, code_objs);

  } else if (image_ != nullptr) {
    // We are directly given image pointer directly, try to extract file desc & file Size
    hip_error = CodeObject::ExtractCodeObjectFromMemory(image_,
                device_names, code_objs, uri_);
  } else {
    return hipErrorInvalidValue;
  }

  if (hip_error == hipErrorNoBinaryForGpu) {
    if (fname_.size() > 0) {
      LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for file: %s", fname_.c_str());
    } else {
      LogPrintfError("hipErrorNoBinaryForGpu: Couldn't find binary for ptr: 0x%x", image_);
    }

    // For the condition: unable to find code object for all devices,
    // still extract available images to those devices owning them.
    // This helps users to work with ROCm if there is any supported
    // GFX on system.
    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
      if (code_objs[dev_idx].first) {
        // Calculate the offset wrt binary_image and the original image
        size_t offset_l
          = (reinterpret_cast<address>(const_cast<void*>(code_objs[dev_idx].first))
              - reinterpret_cast<address>(const_cast<void*>(image_)));

        fatbin_dev_info_[devices[dev_idx]->deviceId()]
          = new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l);

        fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
          = new amd::Program(*devices[dev_idx]->asContext());
        if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) {
          break;
        }
      }
    }

    return hip_error;
  }

  if (hip_error == hipErrorInvalidKernelFile) {
    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
      // the image type is no CLANG_OFFLOAD_BUNDLER, image for current device directly passed
      fatbin_dev_info_[devices[dev_idx]->deviceId()]
        = new FatBinaryDeviceInfo(image_, CodeObject::ElfSize(image_), 0);
    }
  } else if(hip_error == hipSuccess) {
    for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
      // Calculate the offset wrt binary_image and the original image
      size_t offset_l
        = (reinterpret_cast<address>(const_cast<void*>(code_objs[dev_idx].first))
            - reinterpret_cast<address>(const_cast<void*>(image_)));

      fatbin_dev_info_[devices[dev_idx]->deviceId()]
        = new FatBinaryDeviceInfo(code_objs[dev_idx].first, code_objs[dev_idx].second, offset_l);
    }
  }

  for (size_t dev_idx = 0; dev_idx < devices.size(); ++dev_idx) {
    fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_
       = new amd::Program(*devices[dev_idx]->asContext());
    if (fatbin_dev_info_[devices[dev_idx]->deviceId()]->program_ == NULL) {
      return hipErrorOutOfMemory;
    }
  }

  return hipSuccess;
}

添加 warning 后编译运行,实际调用了 ExtractCodeObjectFromMemory 函数,位于 hipamd/src/hip_code_object.cpp

// This will be moved to COMGR eventually
hipError_t CodeObject::ExtractCodeObjectFromMemory(
    const void* data, const std::vector<std::string>& device_names,
    std::vector<std::pair<const void*, size_t>>& code_objs, std::string& uri) {
  // Get the URI from memory
  if (!amd::Os::GetURIFromMemory(data, 0, uri)) {
    return hipErrorInvalidValue;
  }

  return extractCodeObjectFromFatBinary(data, device_names, code_objs);
}

extractCodeObjectFromFatBinary 函数就在下面:

hipError_t CodeObject::extractCodeObjectFromFatBinary(
    const void* data, const std::vector<std::string>& agent_triple_target_ids,
    std::vector<std::pair<const void*, size_t>>& code_objs) {
  std::string magic((const char*)data, kOffloadBundleUncompressedMagicStrSize);
  if (magic.compare(kOffloadBundleUncompressedMagicStr)) {
    return hipErrorInvalidKernelFile;
  }

  // Initialize Code objects
  code_objs.reserve(agent_triple_target_ids.size());
  for (size_t i = 0; i < agent_triple_target_ids.size(); i++) {
    code_objs.push_back(std::make_pair(nullptr, 0));
  }

  const auto obheader = reinterpret_cast<const __ClangOffloadBundleUncompressedHeader*>(data);
  const auto* desc = &obheader->desc[0];
  size_t num_code_objs = code_objs.size();
  for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i,
                desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
                    reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
                    desc->bundleEntryIdSize)) {
    const void* image =
        reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);
    const size_t image_size = desc->size;

    if (num_code_objs == 0) break;
    std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize};

    std::string co_triple_target_id;
    if (!getTripleTargetID(bundleEntryId, image, co_triple_target_id)) continue;

    for (size_t dev = 0; dev < agent_triple_target_ids.size(); ++dev) {
      if (code_objs[dev].first) continue;
      if (isCodeObjectCompatibleWithDevice(co_triple_target_id, agent_triple_target_ids[dev])) {
        code_objs[dev] = std::make_pair(image, image_size);
        --num_code_objs;
      }
    }
  }
  if (num_code_objs == 0) {
    return hipSuccess;
  } else {
    LogPrintfError("%s",
                   "hipErrorNoBinaryForGpu: Unable to find code object for all current devices!");
    LogPrintfError("%s", "  Devices:");
    for (size_t i = 0; i < agent_triple_target_ids.size(); i++) {
      LogPrintfError("    %s - [%s]", agent_triple_target_ids[i].c_str(),
                     ((code_objs[i].first) ? "Found" : "Not Found"));
    }
    const auto obheader = reinterpret_cast<const __ClangOffloadBundleUncompressedHeader*>(data);
    const auto* desc = &obheader->desc[0];
    LogPrintfError("%s", "  Bundled Code Objects:");
    for (uint64_t i = 0; i < obheader->numOfCodeObjects; ++i,
                  desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
                      reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
                      desc->bundleEntryIdSize)) {
      std::string bundleEntryId{desc->bundleEntryId, desc->bundleEntryIdSize};
      const void* image =
          reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);

      std::string co_triple_target_id;
      bool valid_co = getTripleTargetID(bundleEntryId, image, co_triple_target_id);

      if (valid_co) {
        LogPrintfError("    %s - [Code object targetID is %s]", bundleEntryId.c_str(),
                       co_triple_target_id.c_str());
      } else {
        LogPrintfError("    %s - [Unsupported]", bundleEntryId.c_str());
      }
    }
    return hipErrorNoBinaryForGpu;
  }
}

再次添加 warning 打印 data,编译运行,惊奇地发现,data 的内容是 __CLANG_OFFLOAD_BUNDLE__,理应 magic 匹配,但是分支却进入了不匹配的逻辑。一番搜索后,找到了 issue ,原来问题在于结尾的 \0 导致匹配失败了。在最新分支上已经修复了这个问题,这里我们手工打一个 patch,修改成下面这样:

std::string magic((const char*)data, kOffloadBundleUncompressedMagicStrSize-1);
  if (magic.compare(kOffloadBundleUncompressedMagicStr)) {
    return hipErrorInvalidKernelFile;
  }

再次编译运行,终于可以正常运行了!

AMD_LOG_LEVEL=5 ./vectoradd 
:3:rocdevice.cpp            :468 : 89974442900 us: [pid:1092331 tid:0x78ad14e4d1c0] Initializing HSA stack.
:3:rocdevice.cpp            :554 : 89974456729 us: [pid:1092331 tid:0x78ad14e4d1c0] Enumerated GPU agents = 1
:3:rocdevice.cpp            :234 : 89974456763 us: [pid:1092331 tid:0x78ad14e4d1c0] Numa selects cpu agent[0]=0x5911ddce4ac0(fine=0x5911ddce7210,coarse=0x5911ddce2a80) for gpu agent=0x5911ddcef8c0 CPU<->GPU XGMI=0
:3:rocsettings.cpp          :290 : 89974456770 us: [pid:1092331 tid:0x78ad14e4d1c0] Using dev kernel arg wa = 0
:3:comgrctx.cpp             :33  : 89974456774 us: [pid:1092331 tid:0x78ad14e4d1c0] Loading COMGR library.
:3:comgrctx.cpp             :126 : 89974456802 us: [pid:1092331 tid:0x78ad14e4d1c0] Loaded COMGR library version 2.8.
:3:rocdevice.cpp            :1810: 89974456957 us: [pid:1092331 tid:0x78ad14e4d1c0] Gfx Major/Minor/Stepping: 10/3/1
:3:rocdevice.cpp            :1812: 89974456961 us: [pid:1092331 tid:0x78ad14e4d1c0] HMM support: 1, XNACK: 0, Direct host access: 0
:3:rocdevice.cpp            :1814: 89974456967 us: [pid:1092331 tid:0x78ad14e4d1c0] Max SDMA Read Mask: 0x3, Max SDMA Write Mask: 0x3
:4:rocdevice.cpp            :2222: 89974457392 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ac12900000, size 0x101000, numa_node = 0
:4:rocdevice.cpp            :2222: 89974458254 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ac12700000, size 0x101000, numa_node = 0
:4:rocdevice.cpp            :2222: 89974459354 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ac12200000, size 0x400000, numa_node = 0
:4:rocdevice.cpp            :2222: 89974459733 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ad14e3a000, size 0x38, numa_node = 0
:4:runtime.cpp              :85  : 89974459820 us: [pid:1092331 tid:0x78ad14e4d1c0] init
:3:hip_context.cpp          :49  : 89974459825 us: [pid:1092331 tid:0x78ad14e4d1c0] Direct Dispatch: 1
:3:hip_device_runtime.cpp   :651 : 89974459859 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipGetDeviceCount ( 0x7ffc65b1ea44 ) 
:3:hip_device_runtime.cpp   :653 : 89974459865 us: [pid:1092331 tid:0x78ad14e4d1c0] hipGetDeviceCount: Returned hipSuccess : 
:3:hip_device_runtime.cpp   :666 : 89974459871 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipSetDevice ( 0 ) 
:3:hip_device_runtime.cpp   :670 : 89974459875 us: [pid:1092331 tid:0x78ad14e4d1c0] hipSetDevice: Returned hipSuccess : 
Using device 0.
:3:hip_memory.cpp           :615 : 89974460147 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipMalloc ( 0x7ffc65b1ea38, 4096 ) 
:4:rocdevice.cpp            :2379: 89974460570 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa device memory 0x78ac11e00000, size 0x1000
:3:rocdevice.cpp            :2418: 89974460577 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2fefff000
:3:hip_memory.cpp           :617 : 89974460584 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMalloc: Returned hipSuccess : 0x78ac11e00000: duration: 437 us
:3:hip_memory.cpp           :615 : 89974460589 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipMalloc ( 0x7ffc65b1ea30, 4096 ) 
:4:rocdevice.cpp            :2379: 89974460595 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa device memory 0x78ac11e01000, size 0x1000
:3:rocdevice.cpp            :2418: 89974460599 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2feffe000
:3:hip_memory.cpp           :617 : 89974460604 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMalloc: Returned hipSuccess : 0x78ac11e01000: duration: 15 us
:3:hip_memory.cpp           :615 : 89974460608 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipMalloc ( 0x7ffc65b1ea28, 4096 ) 
:4:rocdevice.cpp            :2379: 89974460613 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa device memory 0x78ac11e02000, size 0x1000
:3:rocdevice.cpp            :2418: 89974460616 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2feffd000
:3:hip_memory.cpp           :617 : 89974460622 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMalloc: Returned hipSuccess : 0x78ac11e02000: duration: 14 us
:3:hip_memory.cpp           :690 : 89974460833 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipMemcpy ( 0x78ac11e00000, 0x5911ddd355d0, 4096, hipMemcpyHostToDevice ) 
:3:rocdevice.cpp            :3030: 89974460844 us: [pid:1092331 tid:0x78ad14e4d1c0] Number of allocated hardware queues with low priority: 0, with normal priority: 0, with high priority: 0, maximum per priority is: 4
:3:rocdevice.cpp            :3106: 89974470492 us: [pid:1092331 tid:0x78ad14e4d1c0] Created SWq=0x78ad149f8000 to map on HWq=0x78ac0bd00000 with size 16384 with priority 1, cooperative: 0
:3:rocdevice.cpp            :3198: 89974470514 us: [pid:1092331 tid:0x78ad14e4d1c0] acquireQueue refCount: 0x78ac0bd00000 (1)
:4:rocdevice.cpp            :2222: 89974470907 us: [pid:1092331 tid:0x78ad14e4d1c0] Allocate hsa host memory 0x78ac0a900000, size 0x100000, numa_node = 0
:3:devprogram.cpp           :2648: 89974691965 us: [pid:1092331 tid:0x78ad14e4d1c0] Using Code Object V5.
:4:command.cpp              :348 : 89974695179 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (CopyHostToDevice) enqueued: 0x5911ddf946a0
:4:rocblit.cpp              :834 : 89974695970 us: [pid:1092331 tid:0x78ad14e4d1c0] HSA Async Copy staged H2D dst=0x78ac11e00000, src=0x78ac12900000, size=4096, completion_signal=0x78ad143ff700
:4:rocvirtual.cpp           :572 : 89974695978 us: [pid:1092331 tid:0x78ad14e4d1c0] Host wait on completion_signal=0x78ad143ff700
:3:rocvirtual.hpp           :67  : 89974695983 us: [pid:1092331 tid:0x78ad14e4d1c0] Host active wait for Signal = (0x78ad143ff700) for -1 ns
:4:command.cpp              :287 : 89974695992 us: [pid:1092331 tid:0x78ad14e4d1c0] Queue marker to command queue: 0x5911ddbc9110
:4:command.cpp              :348 : 89974695996 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (InternalMarker) enqueued: 0x5911ddf663c0
:4:command.cpp              :177 : 89974696012 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf946a0 complete
:4:command.cpp              :175 : 89974696015 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf663c0 complete (Wall: 89974696014, CPU: 0, GPU: 0 us)
:4:command.cpp              :252 : 89974696019 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddf946a0 to complete, current status 0
:4:command.cpp              :266 : 89974696023 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddf946a0 wait completed
:3:hip_memory.cpp           :691 : 89974696027 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMemcpy: Returned hipSuccess : : duration: 235194 us
:3:hip_memory.cpp           :690 : 89974696037 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipMemcpy ( 0x78ac11e01000, 0x5911ddd365e0, 4096, hipMemcpyHostToDevice ) 
:4:command.cpp              :348 : 89974696044 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (CopyHostToDevice) enqueued: 0x5911ddf946a0
:4:rocblit.cpp              :834 : 89974696050 us: [pid:1092331 tid:0x78ad14e4d1c0] HSA Async Copy staged H2D dst=0x78ac11e01000, src=0x78ac12900000, size=4096, completion_signal=0x78ad143ff680
:4:rocvirtual.cpp           :572 : 89974696054 us: [pid:1092331 tid:0x78ad14e4d1c0] Host wait on completion_signal=0x78ad143ff680
:3:rocvirtual.hpp           :67  : 89974696058 us: [pid:1092331 tid:0x78ad14e4d1c0] Host active wait for Signal = (0x78ad143ff680) for -1 ns
:4:command.cpp              :287 : 89974696065 us: [pid:1092331 tid:0x78ad14e4d1c0] Queue marker to command queue: 0x5911ddbc9110
:4:command.cpp              :348 : 89974696068 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (InternalMarker) enqueued: 0x5911ddf663c0
:4:command.cpp              :177 : 89974696072 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf946a0 complete
:4:command.cpp              :175 : 89974696075 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf663c0 complete (Wall: 89974696075, CPU: 0, GPU: 0 us)
:4:command.cpp              :252 : 89974696081 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddf946a0 to complete, current status 0
:4:command.cpp              :266 : 89974696085 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddf946a0 wait completed
:3:hip_memory.cpp           :691 : 89974696088 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMemcpy: Returned hipSuccess : : duration: 51 us
:3:hip_platform.cpp         :225 : 89974696095 us: [pid:1092331 tid:0x78ad14e4d1c0]  __hipPushCallConfiguration ( {4,1,1}, {256,1,1}, 0, stream:<null> ) 
:3:hip_platform.cpp         :229 : 89974696099 us: [pid:1092331 tid:0x78ad14e4d1c0] __hipPushCallConfiguration: Returned hipSuccess : 
:3:hip_platform.cpp         :234 : 89974696106 us: [pid:1092331 tid:0x78ad14e4d1c0]  __hipPopCallConfiguration ( {3721618912,22801,0}, {4294966920,4294967295,1706159000}, 0x7ffc65b1e9b0, 0x7ffc65b1e9a8 ) 
:3:hip_platform.cpp         :243 : 89974696111 us: [pid:1092331 tid:0x78ad14e4d1c0] __hipPopCallConfiguration: Returned hipSuccess : 
:3:hip_module.cpp           :677 : 89974696124 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipLaunchKernel ( 0x5911d8f70840, {4,1,1}, {256,1,1}, 0x7ffc65b1e9f0, 0, stream:<null> ) 
:3:devprogram.cpp           :2648: 89974696178 us: [pid:1092331 tid:0x78ad14e4d1c0] Using Code Object V5.
:4:command.cpp              :348 : 89974703739 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (KernelExecution) enqueued: 0x5911ddde4ba0
:3:rocvirtual.cpp           :727 : 89974703752 us: [pid:1092331 tid:0x78ad14e4d1c0] Arg0:   = ptr:0x78ac11e00000 obj:[0x78ac11e00000-0x78ac11e01000]
:3:rocvirtual.cpp           :727 : 89974703757 us: [pid:1092331 tid:0x78ad14e4d1c0] Arg1:   = ptr:0x78ac11e01000 obj:[0x78ac11e01000-0x78ac11e02000]
:3:rocvirtual.cpp           :727 : 89974703762 us: [pid:1092331 tid:0x78ad14e4d1c0] Arg2:   = ptr:0x78ac11e02000 obj:[0x78ac11e02000-0x78ac11e03000]
:3:rocvirtual.cpp           :803 : 89974703767 us: [pid:1092331 tid:0x78ad14e4d1c0] Arg3:   = val:1024
:3:rocvirtual.cpp           :3028: 89974703772 us: [pid:1092331 tid:0x78ad14e4d1c0] ShaderName : _Z10vector_addPiS_S_i
:4:rocvirtual.cpp           :926 : 89974703781 us: [pid:1092331 tid:0x78ad14e4d1c0] SWq=0x78ad149f8000, HWq=0x78ac0bd00000, id=1, Dispatch Header = 0x1502 (type=2, barrier=1, acquire=2, release=2), setup=3, grid=[1024, 1, 1], workgroup=[256, 1, 1], private_seg_size=0, group_seg_size=0, kernel_obj=0x78ad149c88c0, kernarg_address=0x78ac0a900000, completion_signal=0x0, correlation_id=0, rptr=0, wptr=0
:3:hip_module.cpp           :678 : 89974703790 us: [pid:1092331 tid:0x78ad14e4d1c0] hipLaunchKernel: Returned hipSuccess : 
:3:hip_error.cpp            :36  : 89974703795 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipGetLastError (  ) 
:3:hip_device_runtime.cpp   :620 : 89974703800 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipDeviceSynchronize (  ) 
:4:commandqueue.cpp         :147 : 89974703805 us: [pid:1092331 tid:0x78ad14e4d1c0] HW Event not ready, awaiting completion instead
:4:command.cpp              :287 : 89974703811 us: [pid:1092331 tid:0x78ad14e4d1c0] Queue marker to command queue: 0x5911ddbc9110
:4:command.cpp              :348 : 89974703815 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (InternalMarker) enqueued: 0x5911ddf663c0
:4:rocvirtual.cpp           :1091: 89974703821 us: [pid:1092331 tid:0x78ad14e4d1c0] SWq=0x78ad149f8000, HWq=0x78ac0bd00000, id=1, BarrierAND Header = 0x1503 (type=3, barrier=1, acquire=2, release=2), dep_signal=[0x0, 0x0, 0x0, 0x0, 0x0], completion_signal=0x78ad143ff600
:4:rocvirtual.cpp           :572 : 89974703826 us: [pid:1092331 tid:0x78ad14e4d1c0] Host wait on completion_signal=0x78ad143ff600
:3:rocvirtual.hpp           :67  : 89974703830 us: [pid:1092331 tid:0x78ad14e4d1c0] Host active wait for Signal = (0x78ad143ff600) for -1 ns
:4:command.cpp              :177 : 89974703837 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddde4ba0 complete
:4:command.cpp              :175 : 89974703840 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf663c0 complete (Wall: 89974703840, CPU: 0, GPU: 0 us)
:4:command.cpp              :252 : 89974703845 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddde4ba0 to complete, current status 0
:4:command.cpp              :266 : 89974703853 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddde4ba0 wait completed
:4:commandqueue.cpp         :163 : 89974703856 us: [pid:1092331 tid:0x78ad14e4d1c0] All commands finished
:3:hip_device_runtime.cpp   :624 : 89974703860 us: [pid:1092331 tid:0x78ad14e4d1c0] hipDeviceSynchronize: Returned hipSuccess : 
:3:hip_memory.cpp           :690 : 89974703867 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipMemcpy ( 0x5911ddd375f0, 0x78ac11e02000, 4096, hipMemcpyDeviceToHost ) 
:4:command.cpp              :348 : 89974703876 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (CopyDeviceToHost) enqueued: 0x5911ddf946a0
:4:rocblit.cpp              :868 : 89974705215 us: [pid:1092331 tid:0x78ad14e4d1c0] HSA Async Copy staged D2H dst=0x78ac12700000, src=0x78ac11e02000, size=4096, completion_signal=0x78ad143ff580
:4:rocvirtual.cpp           :572 : 89974705223 us: [pid:1092331 tid:0x78ad14e4d1c0] Host wait on completion_signal=0x78ad143ff580
:3:rocvirtual.hpp           :67  : 89974705227 us: [pid:1092331 tid:0x78ad14e4d1c0] Host active wait for Signal = (0x78ad143ff580) for -1 ns
:4:command.cpp              :287 : 89974705234 us: [pid:1092331 tid:0x78ad14e4d1c0] Queue marker to command queue: 0x5911ddbc9110
:4:command.cpp              :348 : 89974705238 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (InternalMarker) enqueued: 0x5911ddf663c0
:4:command.cpp              :177 : 89974705242 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf946a0 complete
:4:command.cpp              :175 : 89974705247 us: [pid:1092331 tid:0x78ad14e4d1c0] Command 0x5911ddf663c0 complete (Wall: 89974705247, CPU: 0, GPU: 0 us)
:4:command.cpp              :252 : 89974705252 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddf946a0 to complete, current status 0
:4:command.cpp              :266 : 89974705255 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddf946a0 wait completed
:3:hip_memory.cpp           :691 : 89974705258 us: [pid:1092331 tid:0x78ad14e4d1c0] hipMemcpy: Returned hipSuccess : : duration: 1391 us
c[0] = 0
c[1] = 3
c[2] = 6
c[3] = 9
c[4] = 12
c[5] = 15
c[6] = 18
c[7] = 21
c[8] = 24
c[9] = 27
:3:hip_memory.cpp           :667 : 89974705286 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipFree ( 0x78ac11e00000 ) 
:4:commandqueue.cpp         :147 : 89974705290 us: [pid:1092331 tid:0x78ad14e4d1c0] HW Event not ready, awaiting completion instead
:4:commandqueue.cpp         :163 : 89974705294 us: [pid:1092331 tid:0x78ad14e4d1c0] All commands finished
:4:rocdevice.cpp            :2395: 89974705304 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac11e00000
:3:rocdevice.cpp            :2418: 89974705308 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2feffe000
:3:hip_memory.cpp           :669 : 89974705312 us: [pid:1092331 tid:0x78ad14e4d1c0] hipFree: Returned hipSuccess : 
:3:hip_memory.cpp           :667 : 89974705316 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipFree ( 0x78ac11e01000 ) 
:4:rocdevice.cpp            :2395: 89974705320 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac11e01000
:3:rocdevice.cpp            :2418: 89974705322 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2fefff000
:3:hip_memory.cpp           :669 : 89974705327 us: [pid:1092331 tid:0x78ad14e4d1c0] hipFree: Returned hipSuccess : 
:3:hip_memory.cpp           :667 : 89974705330 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipFree ( 0x78ac11e02000 ) 
:4:rocdevice.cpp            :2395: 89974705334 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac11e02000
:3:rocdevice.cpp            :2418: 89974705338 us: [pid:1092331 tid:0x78ad14e4d1c0] Device=0x5911ddd27920, freeMem_ = 0x2ff000000
:3:hip_memory.cpp           :669 : 89974705342 us: [pid:1092331 tid:0x78ad14e4d1c0] hipFree: Returned hipSuccess : 
:3:hip_device_runtime.cpp   :620 : 89974705380 us: [pid:1092331 tid:0x78ad14e4d1c0]  hipDeviceSynchronize (  ) 
:3:hip_device_runtime.cpp   :624 : 89974705384 us: [pid:1092331 tid:0x78ad14e4d1c0] hipDeviceSynchronize: Returned hipSuccess : 
:4:command.cpp              :348 : 89974705686 us: [pid:1092331 tid:0x78ad14e4d1c0] Command (Marker) enqueued: 0x5911ddf663c0
:3:rocvirtual.cpp           :476 : 89974705695 us: [pid:1092331 tid:0x78ad14e4d1c0] Set Handler: handle(0x78ad143ff500), timestamp(0x5911de4fdcc0)
:4:rocvirtual.cpp           :1091: 89974705701 us: [pid:1092331 tid:0x78ad14e4d1c0] SWq=0x78ad149f8000, HWq=0x78ac0bd00000, id=1, BarrierAND Header = 0x1503 (type=3, barrier=1, acquire=2, release=2), dep_signal=[0x0, 0x0, 0x0, 0x0, 0x0], completion_signal=0x78ad143ff500
:4:command.cpp              :252 : 89974705708 us: [pid:1092331 tid:0x78ad14e4d1c0] Waiting for event 0x5911ddf663c0 to complete, current status 2
:3:rocvirtual.cpp           :223 : 89974705734 us: [pid:1092331 tid:0x78ac10c006c0] Handler: value(0), timestamp(0x5911de34ff90), handle(0x78ad143ff500)
:4:command.cpp              :175 : 89974705750 us: [pid:1092331 tid:0x78ac10c006c0] Command 0x5911ddf663c0 complete (Wall: 89974705749, CPU: 0, GPU: 58 us)
:4:command.cpp              :266 : 89974705750 us: [pid:1092331 tid:0x78ad14e4d1c0] Event 0x5911ddf663c0 wait completed
:4:rocdevice.cpp            :2395: 89974705972 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac0a900000
:4:rocdevice.cpp            :2395: 89974705977 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory (nil)
:3:rocdevice.cpp            :3210: 89974705980 us: [pid:1092331 tid:0x78ad14e4d1c0] releaseQueue refCount:0x78ac0bd00000 (0)
:4:runtime.cpp              :93  : 89974706035 us: [pid:1092331 tid:0x78ad14e4d1c0] tearDown
:4:rocdevice.cpp            :2395: 89974706322 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac12200000
:4:rocdevice.cpp            :2395: 89974706405 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ad14e3a000
:3:rocdevice.cpp            :286 : 89974706410 us: [pid:1092331 tid:0x78ad14e4d1c0] Deleting hardware queue 0x78ac0bd00000 with refCount 0
:4:rocdevice.cpp            :2395: 89974712902 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac12700000
:4:rocdevice.cpp            :2395: 89974713096 us: [pid:1092331 tid:0x78ad14e4d1c0] Free hsa memory 0x78ac12900000

总结

笔者刚开始遇到这个问题的时候,一直怀疑是不是自己配的环境有问题,尝试了各种配置,没想到最后是官方的运行库有 bug 。不过这个问题也促使笔者探究了 HIP 运行时的细节,对于计算库有了更深的理解。