[openmp][amdgpu] Initial gfx10 offloading implementation

Lets wavefront size be 32 for amdgpu openmp, as well as 64. Fixes up as little as possible to pass that through the libraries. This change is end to end, as opposed to updating clang/devicertl/plugin separately. It can be broken up for review/commit if preferred. Posting as-is so that others with a gfx10 can try it out. It works roughly as well as gfx9 for me, but there are probably bugs remaining as well as the todo: for letting grid values vary more. Reviewed By: ronlieb Differential Revision: https://reviews.llvm.org/D108708
2021-08-27 12:34:02 +01:00 · 2021-08-27 12:34:02 +01:00 · 78f92c3810
parent cdbe569fb6
commit 78f92c3810
5 changed files with 63 additions and 24 deletions
--- a/clang/lib/Basic/Targets/AMDGPU.h
+++ b/clang/lib/Basic/Targets/AMDGPU.h
@ -371,7 +371,14 @@ public:
  }

  const llvm::omp::GV &getGridValue() const override {
-    return llvm::omp::AMDGPUGridValues;
+    switch (WavefrontSize) {
+    case 32:
+      return llvm::omp::getAMDGPUGridValues<32>();
+    case 64:
+      return llvm::omp::getAMDGPUGridValues<64>();
+    default:
+      llvm_unreachable("getGridValue not implemented for this wavesize");
+    }
  }

  /// \returns Target specific vtbl ptr address space.
--- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@ -81,7 +81,7 @@ struct GV {
 };

 /// For AMDGPU GPUs
-static constexpr GV AMDGPUGridValues = {
+static constexpr GV AMDGPUGridValues64 = {
    256,  // GV_Slot_Size
    64,   // GV_Warp_Size
    128,  // GV_Max_Teams
@ -90,6 +90,20 @@ static constexpr GV AMDGPUGridValues = {
    256,  // GV_Default_WG_Size
 };

+static constexpr GV AMDGPUGridValues32 = {
+    256,  // GV_Slot_Size
+    32,   // GV_Warp_Size
+    128,  // GV_Max_Teams
+    896,  // GV_SimpleBufferSize
+    1024, // GV_Max_WG_Size,
+    256,  // GV_Default_WG_Size
+};
+
+template <unsigned wavesize> constexpr const GV &getAMDGPUGridValues() {
+  static_assert(wavesize == 32 || wavesize == 64, "");
+  return wavesize == 32 ? AMDGPUGridValues32 : AMDGPUGridValues64;
+}
+
 /// For Nvidia GPUs
 static constexpr GV NVPTXGridValues = {
    256,  // GV_Slot_Size
--- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@ -107,7 +107,7 @@ else()
 endif()

 # create libraries
-set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908)
+set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908 gfx1010 gfx1030 gfx1031)
 if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
  set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
 endif()
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@ -38,7 +38,7 @@ typedef uint64_t __kmpc_impl_lanemask_t;
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"

 INLINE constexpr const llvm::omp::GV &getGridValue() {
-  return llvm::omp::AMDGPUGridValues;
+  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
 }

 ////////////////////////////////////////////////////////////////////////////////
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@ -435,8 +435,9 @@ struct EnvironmentVariables {
  int MaxTeamsDefault;
 };

+template <uint32_t wavesize>
 static constexpr const llvm::omp::GV &getGridValue() {
-  return llvm::omp::AMDGPUGridValues;
+  return llvm::omp::getAMDGPUGridValues<wavesize>();
 }

 /// Class containing all the device information
@ -507,10 +508,24 @@ public:
  static const unsigned HardTeamLimit =
      (1 << 16) - 1; // 64K needed to fit in uint16
  static const int DefaultNumTeams = 128;
-  static const int Max_Teams = getGridValue().GV_Max_Teams;
-  static const int Warp_Size = getGridValue().GV_Warp_Size;
-  static const int Max_WG_Size = getGridValue().GV_Max_WG_Size;
-  static const int Default_WG_Size = getGridValue().GV_Default_WG_Size;
+
+  // These need to be per-device since different devices can have different
+  // wave sizes, but are currently the same number for each so that refactor
+  // can be postponed.
+  static_assert(getGridValue<32>().GV_Max_Teams ==
+                    getGridValue<64>().GV_Max_Teams,
+                "");
+  static const int Max_Teams = getGridValue<64>().GV_Max_Teams;
+
+  static_assert(getGridValue<32>().GV_Max_WG_Size ==
+                    getGridValue<64>().GV_Max_WG_Size,
+                "");
+  static const int Max_WG_Size = getGridValue<64>().GV_Max_WG_Size;
+
+  static_assert(getGridValue<32>().GV_Default_WG_Size ==
+                    getGridValue<64>().GV_Default_WG_Size,
+                "");
+  static const int Default_WG_Size = getGridValue<64>().GV_Default_WG_Size;

  using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
                                      size_t size, hsa_agent_t,
@ -1059,8 +1074,9 @@ int32_t __tgt_rtl_init_device(int device_id) {
    DP("Queried wavefront size: %d\n", wavefront_size);
    DeviceInfo.WarpSize[device_id] = wavefront_size;
  } else {
-    DP("Default wavefront size: %d\n", getGridValue().GV_Warp_Size);
-    DeviceInfo.WarpSize[device_id] = getGridValue().GV_Warp_Size;
+    // TODO: Burn the wavefront size into the code object
+    DP("Warning: Unknown wavefront size, assuming 64\n");
+    DeviceInfo.WarpSize[device_id] = 64;
  }

  // Adjust teams to the env variables
@ -1885,9 +1901,10 @@ struct launchVals {
  int WorkgroupSize;
  int GridSize;
 };
-launchVals getLaunchVals(EnvironmentVariables Env, int ConstWGSize,
-                         int ExecutionMode, int num_teams, int thread_limit,
-                         uint64_t loop_tripcount, int DeviceNumTeams) {
+launchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
+                         int ConstWGSize, int ExecutionMode, int num_teams,
+                         int thread_limit, uint64_t loop_tripcount,
+                         int DeviceNumTeams) {

  int threadsPerGroup = RTLDeviceInfoTy::Default_WG_Size;
  int num_groups = 0;
@ -1900,7 +1917,7 @@ launchVals getLaunchVals(EnvironmentVariables Env, int ConstWGSize,
  if (print_kernel_trace & STARTUP_DETAILS) {
    DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::Max_Teams);
    DP("Max_Teams: %d\n", Max_Teams);
-    DP("RTLDeviceInfoTy::Warp_Size: %d\n", RTLDeviceInfoTy::Warp_Size);
+    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
    DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::Max_WG_Size);
    DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
       RTLDeviceInfoTy::Default_WG_Size);
@ -1913,8 +1930,8 @@ launchVals getLaunchVals(EnvironmentVariables Env, int ConstWGSize,
    threadsPerGroup = thread_limit;
    DP("Setting threads per block to requested %d\n", thread_limit);
    if (ExecutionMode == GENERIC) { // Add master warp for GENERIC
-      threadsPerGroup += RTLDeviceInfoTy::Warp_Size;
-      DP("Adding master wavefront: +%d threads\n", RTLDeviceInfoTy::Warp_Size);
+      threadsPerGroup += WarpSize;
+      DP("Adding master wavefront: +%d threads\n", WarpSize);
    }
    if (threadsPerGroup > RTLDeviceInfoTy::Max_WG_Size) { // limit to max
      threadsPerGroup = RTLDeviceInfoTy::Max_WG_Size;
@ -1950,7 +1967,7 @@ launchVals getLaunchVals(EnvironmentVariables Env, int ConstWGSize,
  // So we only handle constant thread_limits.
  if (threadsPerGroup >
      RTLDeviceInfoTy::Default_WG_Size) //  256 < threadsPerGroup <= 1024
-    // Should we round threadsPerGroup up to nearest RTLDeviceInfoTy::Warp_Size
+    // Should we round threadsPerGroup up to nearest WarpSize
    // here?
    num_groups = (Max_Teams * RTLDeviceInfoTy::Max_WG_Size) / threadsPerGroup;

@ -2099,12 +2116,13 @@ int32_t __tgt_rtl_run_target_team_region_locked(
  /*
   * Set limit based on ThreadsPerGroup and GroupsPerDevice
   */
-  launchVals LV = getLaunchVals(DeviceInfo.Env, KernelInfo->ConstWGSize,
-                                KernelInfo->ExecutionMode,
-                                num_teams,      // From run_region arg
-                                thread_limit,   // From run_region arg
-                                loop_tripcount, // From run_region arg
-                                DeviceInfo.NumTeams[KernelInfo->device_id]);
+  launchVals LV =
+      getLaunchVals(DeviceInfo.WarpSize[device_id], DeviceInfo.Env,
+                    KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
+                    num_teams,      // From run_region arg
+                    thread_limit,   // From run_region arg
+                    loop_tripcount, // From run_region arg
+                    DeviceInfo.NumTeams[KernelInfo->device_id]);
  const int GridSize = LV.GridSize;
  const int WorkgroupSize = LV.WorkgroupSize;