From 86cdab3b14b556e95eafe370b8e8a1a80e8d093b Mon Sep 17 00:00:00 2001
From: Seunghoon Lee
Date: Sun, 14 Jul 2024 16:37:39 +0900
Subject: [PATCH] Follow-up changes of #27 (#32)

* Update README.
* Drop dead code.
* Use addr_of.
* Update Dockerfiles.
* Disable warnings for nvrtc.
* nvrtc.
---
 .devcontainer/Dockerfile-common |  4 +-
 .devcontainer/Dockerfile-el8_8  |  4 +-
 README.md                       | 86 ++++++++++++++++++++++-----------
 zluda/src/impl/mod.rs           |  7 ---
 zluda_redirect/src/lib.rs       | 36 +++++++-------
 zluda_rtc/src/lib.rs            | 10 +++-
 6 files changed, 88 insertions(+), 59 deletions(-)

diff --git a/.devcontainer/Dockerfile-common b/.devcontainer/Dockerfile-common
index e7a5356..e66c2ff 100644
--- a/.devcontainer/Dockerfile-common
+++ b/.devcontainer/Dockerfile-common
@@ -33,11 +33,11 @@ RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86
     cuda-nvml-dev-${CUDA_VERSION} \
     libcudnn8-dev

-ARG RUST_VERSION=1.66.1
+ARG RUST_VERSION=1.79.0
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain=${RUST_VERSION}
 RUN . $HOME/.cargo/env && cargo install bindgen-cli --locked

-ARG ROCM_VERSION=5.7.3
+ARG ROCM_VERSION=6.1.2
 RUN echo "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" > /etc/apt/preferences.d/rocm-pin-600
 RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
     sh -c 'wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null' && \
diff --git a/.devcontainer/Dockerfile-el8_8 b/.devcontainer/Dockerfile-el8_8
index 105ed05..f617cbc 100644
--- a/.devcontainer/Dockerfile-el8_8
+++ b/.devcontainer/Dockerfile-el8_8
@@ -37,11 +37,11 @@ RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/
     cuda-nvml-devel-${CUDA_VERSION} \
     libcudnn8-devel

-ARG RUST_VERSION=1.66.1
+ARG RUST_VERSION=1.79.0
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain=${RUST_VERSION}
 RUN . $HOME/.cargo/env && cargo install bindgen-cli --locked

-ARG ROCM_VERSION=5.7.1
+ARG ROCM_VERSION=6.1.2
 RUN sh -c 'echo -e "[ROCm-${ROCM_VERSION}]\n\
 name=ROCm${ROCM_VERSION}\n\
 baseurl=https://repo.radeon.com/rocm/rhel8/${ROCM_VERSION}/main\n\
diff --git a/README.md b/README.md
index 5f80498..1d45833 100644
--- a/README.md
+++ b/README.md
@@ -11,18 +11,24 @@ If you want to give it a try, download it from Release page to the right and rea
 ## Usage

 ### Windows
+
 Using command line:
+
 ```
 <ZLUDA_DIRECTORY>\zluda.exe -- <APPLICATION> <APPLICATION_ARGUMENTS>
 ```
+
 If you downloaded a ZIP file with the release and unpacked it, then `<ZLUDA_DIRECTORY>` is the `zluda` directory you have just unpacked.\
 If you are building from source, then `<ZLUDA_DIRECTORY>` is subdirectory `target\release`.

 ### Linux
+
 Using command line:
+
 ```
 LD_LIBRARY_PATH="<ZLUDA_DIRECTORY>:$LD_LIBRARY_PATH" <APPLICATION> <APPLICATION_ARGUMENTS>
 ```
+
 If you downloaded a ZIP file with the release and unpacked it, then `<ZLUDA_DIRECTORY>` is the `zluda` directory you have just unpacked.\
 If you are building from source, then `<ZLUDA_DIRECTORY>` is subdirectory `target/release`.
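Because ZLUDA is injected purely through the loader's search order, a quick way to confirm what a process will actually get is to resolve `libcuda.so.1` yourself and query the driver version. Below is a minimal sketch of such a check; the `libloading` crate is an assumption of this sketch, while `cuDriverGetVersion` is a standard CUDA driver API entry point:

```rust
// Sketch: report which CUDA driver the dynamic linker resolves.
// Run with LD_LIBRARY_PATH pointing at the ZLUDA directory; the version
// then comes from ZLUDA's libcuda.so.1 instead of NVIDIA's driver.
use libloading::{Library, Symbol};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    unsafe {
        // Same soname that CUDA applications load on Linux.
        let lib = Library::new("libcuda.so.1")?;
        let get_version: Symbol<unsafe extern "C" fn(*mut i32) -> i32> =
            lib.get(b"cuDriverGetVersion\0")?;
        let mut version = 0;
        let status = get_version(&mut version);
        println!("cuDriverGetVersion: status {status}, version {version}");
    }
    Ok(())
}
```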
@@ -31,25 +37,28 @@ If you are building from source, then `<ZLUDA_DIRECTORY>` is subdirectory `targe
 ### Prerequisites

 Make sure you have the following installed:
-* Git
-* CMake
-* Python 3
-* Rust (1.66.1 or newer)
-* C++ compiler
-* (Linux only) ROCm 5.7+ (_not ROCm 6_) (https://rocm.docs.amd.com/en/latest/deploy/linux/install_overview.html)
-* (Windows only) Recent [AMD Radeon Software Adrenalin](https://www.amd.com/en/technologies/software)
-* (Recommended, optional) Ninja (https://ninja-build.org/)
+
+- Git
+- CMake
+- Python 3
+- Rust (1.66.1 or newer)
+- C++ compiler
+- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/install_overview.html) 6.0+ (or [HIP SDK](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/) on Windows)
+- (Windows only) Recent [AMD Radeon Software Adrenalin](https://www.amd.com/en/technologies/software)
+- (Recommended, optional) [Ninja](https://ninja-build.org/)

 Alternatively, if you are building for Linux, the [.devcontainer](.devcontainer) directory contains various developer Dockerfiles with all the required dependencies.

 ### Checkout

 Checkout ZLUDA code with:
+
 ```
 git clone --recurse-submodules https://github.com/vosen/zluda.git
 ```

 ### Build
+
 Build by running:

 ```
@@ -79,10 +88,12 @@ If an application fails to start under ZLUDA or crashes please check [Known Issu
 - ZLUDA can use AMD server GPUs (as tested with Instinct MI200) with a caveat.

   On Server GPUs, ZLUDA can compile CUDA GPU code to run in one of two modes:
+
   - Fast mode, which is faster, but can make exotic (but correct) GPU code hang.
   - Slow mode, which should make GPU code more stable, but can prevent some applications from running on ZLUDA.

   By default, ZLUDA uses fast mode. That's because:
+
   - There's a huge performance difference: fast mode can be twice as fast.
   - The code patterns that can trip fast mode were not encountered across multiple projects (SPECFEM3D, QUDA, Chroma, MILC, Kokkos, LAMMPS, OpenFOAM, XGBoost, NAMD).

@@ -90,7 +101,6 @@ If an application fails to start under ZLUDA or crashes please check [Known Issu
   None of that applies to desktop and integrated GPUs (RDNA family).

-
 ### Software

 - Applications using ZLUDA are slow to start.

@@ -103,14 +113,17 @@ If an application fails to start under ZLUDA or crashes please check [Known Issu
   Firstly, ZLUDA ignores some of the floating point denormal and rounding mode information present in the kernels. Secondly, for certain approximate (not IEEE 754) NVIDIA floating point operations in CUDA, ZLUDA blindly uses approximate AMD floating point operations. The two might have different precision (see the small CPU-side illustration below).

 #### CUDA 12+
-- Application built with CUDA 12 and using Thrust crashes with `LLVM ERROR: unsupported libcall legalization`.
-
+
+- Application built with CUDA 12 and using Thrust crashes with `LLVM ERROR: unsupported libcall legalization`.
+
   This is a ROCm/HIP bug. Currently, CUDA applications built with CUDA versions pre-12 work best. Building with CUDA 12 and a pre-CUDA 12 Thrust might also work.

 #### OptiX
+
 - ZLUDA has a bare-minimum OptiX implementation for Arnold. See details in the [Arnold](#arnold) section.

 #### Windows
+
 - Antivirus flags ZLUDA as malware.

   ZLUDA launcher (`zluda.exe`) uses some of the techniques used by malware, but for good. `zluda.exe` hijacks the process and redirects all uses of the original NVIDIA CUDA libraries to use ZLUDA's CUDA instead.
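To make the denormal half of the floating-point note above concrete: PTX kernels can request flush-to-zero (`.ftz`) handling of subnormal values, and honoring or ignoring that request changes results. A self-contained toy illustration in plain CPU-side Rust (nothing here is ZLUDA code):

```rust
// Toy illustration of what "denormal information" changes.
// A kernel compiled with flush-to-zero (.ftz) would treat the subnormal
// value below as 0.0; default Rust on the CPU preserves it.
fn main() {
    let tiny = f32::MIN_POSITIVE / 2.0; // below the smallest normal f32
    println!("tiny = {tiny:e}, subnormal: {}", tiny.is_subnormal());
    // With subnormals preserved this prints a nonzero value;
    // under FTZ semantics the same arithmetic would yield 0.0.
    println!("tiny * 2.0 = {:e}", tiny * 2.0);
}
```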
@@ -139,51 +152,53 @@ Meshroom works only on Windows due to an underlying ROCm/HIP issue.
 Meshroom 2023.3.0 might not work; it's recommended to use Meshroom freshly built from the develop branch. See #79 and alicevision/Meshroom#595. Please open an issue here if you run into problems.

-
 #### llama.cpp

 If you are building llama.cpp with make and don't want it to crash on ZLUDA then you should use `CUDA_DOCKER_ARCH=compute_61` like this:
+
 ```
-make CUDA_DOCKER_ARCH=compute_61 
+make CUDA_DOCKER_ARCH=compute_61
 ```
+
 Alternatively, building with cmake should work with no changes.

 Performance is currently much lower than the native HIP backend; see the discussion in #102.

 #### Arnold

-* ZLUDA implements minimum of OptiX framework to support Arnold. ZLUDA's OptiX is buggy, unoptimized and incomplete. It's been tested with Arnold 7.1.4.1 command line rendering on Linux.
+- ZLUDA implements the minimum of the OptiX framework needed to support Arnold. ZLUDA's OptiX is buggy, unoptimized and incomplete. It's been tested with Arnold 7.1.4.1 command line rendering on Linux.

   ZLUDA-OptiX is not built by default or redistributed in the release. To use it follow these steps:
-
-  * Firstly build a newer version of ROCm LLVM. Version shipped with 5.7.1 is known to miscompile Arnold code. Get it here: https://github.com/ROCm/llvm-project. Switch to a known good commit: `0c7fd5b6d1bbf471d2c068c2b6172d9cfd76b08d` and build it.
-  * Then build amd_comgr: https://github.com/ROCm/ROCm-CompilerSupport with the LLVM built in the previous step. I'm using the last commit from https://github.com/ROCm/ROCm-CompilerSupport (`8276083301409001ec7643e68f5ad58b057c21fd`).
+
+  - Firstly, build a newer version of ROCm LLVM. The version shipped with ROCm 5.7.1 is known to miscompile Arnold code. Get it here: https://github.com/ROCm/llvm-project. Switch to the known good commit `0c7fd5b6d1bbf471d2c068c2b6172d9cfd76b08d` and build it.
+
+  - Then build amd_comgr (https://github.com/ROCm/ROCm-CompilerSupport) with the LLVM built in the previous step. I'm using the last commit from https://github.com/ROCm/ROCm-CompilerSupport (`8276083301409001ec7643e68f5ad58b057c21fd`).
+
+  - Now build ZLUDA-OptiX:

-  * Now build ZLUDA-OptiX:
    ```
    cargo xtask --release
    cargo build -p zluda_rt --release
    cd target/release
-   ln -s libnvoptix.so liboptix.so.6.6.0 
+   ln -s libnvoptix.so liboptix.so.6.6.0
    cp ../../hiprt-sys/lib/libhiprt64.so .
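   # Optional sanity check (editor's suggestion, not part of the original
   # steps): confirm the soname symlink resolves and that libhiprt64.so
   # now sits next to it.
   ls -l liboptix.so.6.6.0 libhiprt64.so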
   ```

-  * After those quick and easy steps you can use it with the command line Arnold renderer:
+  - After those quick and easy steps you can use it with the command line Arnold renderer:
+
    ```
    LD_LIBRARY_PATH=<ZLUDA_SOURCE>/target/release/ LD_PRELOAD="<COMGR_SOURCE>/build/libamd_comgr.so.2 <ZLUDA_SOURCE>/target/release/liboptix.so.6.6.0" /usr/autodesk/arnold/maya2023/bin/kick attic.ass -device gpu -o /tmp/attic.jpg -v 6 -sl
    ```

-  * Keep in mind that ZLUDA-OptiX can only successfully render the simplest Arnold scene (and possibly one more):
-
-    * Cornell box (from [here](https://help.autodesk.com/view/ARNOL/ENU/?guid=arnold_user_guide_ac_scene_source_ac_ass_examples_html)):\
-      [![cornell](https://imgur.com/4Vv3GO8s.jpg)](https://imgur.com/4Vv3GO8)
-    * (used to work, broken now) Attic scene (from [here](https://github.com/wahn/export_multi/tree/master/17_attic)):\
-      [![cornell](https://imgur.com/Sut2YMys.jpg)](https://imgur.com/a/2jF9Kb5)
+  - Keep in mind that ZLUDA-OptiX can only successfully render the simplest Arnold scene (and possibly one more):
+    - Cornell box (from [here](https://help.autodesk.com/view/ARNOL/ENU/?guid=arnold_user_guide_ac_scene_source_ac_ass_examples_html)):\
+      [![cornell](https://imgur.com/4Vv3GO8s.jpg)](https://imgur.com/4Vv3GO8)
+    - (used to work, broken now) Attic scene (from [here](https://github.com/wahn/export_multi/tree/master/17_attic)):\
+      [![attic](https://imgur.com/Sut2YMys.jpg)](https://imgur.com/a/2jF9Kb5)

 #### PyTorch

-* PyTorch received very little testing. ZLUDA's coverage of cuDNN APIs is very minimal (just enough to run ResNet-50) and realistically you won't get much running.\
+- PyTorch received very little testing. ZLUDA's coverage of cuDNN APIs is very minimal (just enough to run ResNet-50) and realistically you won't get much running.\
   However, if you are interested in trying it out, you need to build it from source with the settings below. Default PyTorch does not ship PTX and uses bundled NCCL which also builds without PTX:

   ```
@@ -196,7 +211,9 @@ Performance is currently much lower than the native HIP backend; see the discuss
   export NCCL_LIB_DIR=/usr/lib/x86_64-linux-gnu
   export USE_EXPERIMENTAL_CUDNN_V8_API=OFF
   ```
+
   or (untested):
+
   ```
   export TORCH_CUDA_ARCH_LIST="6.1+PTX"
   export CUDAARCHS=61
@@ -207,17 +224,29 @@ Performance is currently much lower than the native HIP backend; see the discuss
   ```

   When running, use the following environment variable:
+
   ```
   DISABLE_ADDMM_CUDA_LT=1
   ```

+- On Windows, you don't have to build PyTorch from source. Instead, just replace the DLL files of the official CUDA release of PyTorch.
+  After that, insert this code into the application.
+
+  ```py
+  torch.backends.cudnn.enabled = False
+  torch.backends.cuda.enable_flash_sdp(False)
+  torch.backends.cuda.enable_math_sdp(True)
+  torch.backends.cuda.enable_mem_efficient_sdp(False)
+  ```

 #### 3DF Zephyr
+
 - ZLUDA is much slower than CUDA. 3DF Zephyr is triggering an underlying ROCm/HIP performance issue.

 #### Reality Capture
+
 - ZLUDA is much slower than CUDA. Reality Capture is triggering an underlying ROCm/HIP performance issue.

@@ -225,7 +254,7 @@ Performance is currently much lower than the native HIP backend; see the discuss
 #### CompuBench

 - When running multiple tests, the first test passes and the subsequent tests fail with `CUDA_ERROR_UNKNOWN`.
-
+
   This is a ROCm/HIP bug. Currently, CompuBench tests have to be run one at a time.

 #### V-Ray Benchmark
@@ -250,7 +279,6 @@ Performance is currently much lower than the native HIP backend; see the discuss
   One of the terms of my contract with AMD was that if AMD did not find it fit for further development, I could release it. Which brings us to today.

-
 * What's the future of the project?

   With neither Intel nor AMD interested, we've run out of GPU companies. I'm open, though, to any offers that could move the project forward.

diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs
index d892928..052b8cc 100644
--- a/zluda/src/impl/mod.rs
+++ b/zluda/src/impl/mod.rs
@@ -109,13 +109,6 @@ pub(crate) trait ZludaObject: Sized {
     fn drop_with_result(&mut self, by_owner: bool) -> Result<(), CUresult>;
 }

-pub(crate) trait HasLivenessCookie: Sized {
-    const COOKIE: usize;
-    const LIVENESS_FAIL: CUresult;
-
-    fn try_drop(&mut self) -> Result<(), CUresult>;
-}
-
 // This struct is a best-effort check if wrapped value has been dropped,
 // while it's inherently safe, its use coming from FFI is very unsafe
 #[repr(C)]
diff --git a/zluda_redirect/src/lib.rs b/zluda_redirect/src/lib.rs
index ccc905f..9c02f08 100644
--- a/zluda_redirect/src/lib.rs
+++ b/zluda_redirect/src/lib.rs
@@ -3,7 +3,7 @@
 extern crate detours_sys;
 extern crate winapi;

-use std::{ffi::c_void, mem, path::PathBuf, ptr, slice, usize};
+use std::{ffi::c_void, mem, path::PathBuf, ptr::{self, addr_of, addr_of_mut}, slice, usize};

 use detours_sys::{
     DetourAttach, DetourRestoreAfterWith, DetourTransactionAbort, DetourTransactionBegin,
@@ -306,11 +306,11 @@ unsafe fn zero_terminated<T: Default + PartialEq>(t: *const T) -> &'static [T] {
 }

 unsafe fn is_driverstore_utf8(lib: &[u8]) -> bool {
-    starts_with_ignore_case(lib, &DRIVERSTORE_UTF8, utf8_to_ascii_uppercase)
+    starts_with_ignore_case(lib, addr_of!(DRIVERSTORE_UTF8).as_ref().unwrap(), utf8_to_ascii_uppercase)
 }

 unsafe fn is_driverstore_utf16(lib: &[u16]) -> bool {
-    starts_with_ignore_case(lib, &DRIVERSTORE_UTF16, utf16_to_ascii_uppercase)
+    starts_with_ignore_case(lib, addr_of!(DRIVERSTORE_UTF16).as_ref().unwrap(), utf16_to_ascii_uppercase)
 }

 fn is_nvcuda_dll_utf8(lib: &[u8]) -> bool {
@@ -578,36 +578,36 @@ impl DetourDetachGuard {
         }
         result.overriden_non_cuda_fns.extend_from_slice(&[
             (
-                &mut LOAD_LIBRARY_A as *mut _ as *mut *mut c_void,
+                addr_of_mut!(LOAD_LIBRARY_A) as *mut *mut c_void,
                 ZludaLoadLibraryA as *mut c_void,
             ),
-            (&mut LOAD_LIBRARY_W as *mut _ as _, ZludaLoadLibraryW as _),
+            (addr_of_mut!(LOAD_LIBRARY_W) as _, ZludaLoadLibraryW as _),
             (
-                &mut LOAD_LIBRARY_EX_A as *mut _ as _,
+                addr_of_mut!(LOAD_LIBRARY_EX_A) as _,
                 ZludaLoadLibraryExA as _,
             ),
             (
-                &mut LOAD_LIBRARY_EX_W as *mut _ as _,
+                addr_of_mut!(LOAD_LIBRARY_EX_W) as _,
                 ZludaLoadLibraryExW as _,
             ),
             (
-                &mut CREATE_PROCESS_A as *mut _ as _,
+                addr_of_mut!(CREATE_PROCESS_A) as _,
                 ZludaCreateProcessA as _,
             ),
             (
-                &mut CREATE_PROCESS_W as *mut _ as _,
+                addr_of_mut!(CREATE_PROCESS_W) as _,
                 ZludaCreateProcessW as _,
             ),
             (
-                &mut CREATE_PROCESS_AS_USER_W as *mut _ as _,
+                addr_of_mut!(CREATE_PROCESS_AS_USER_W) as _,
                 ZludaCreateProcessAsUserW as _,
             ),
             (
-                &mut CREATE_PROCESS_WITH_LOGON_W as *mut _ as _,
+                addr_of_mut!(CREATE_PROCESS_WITH_LOGON_W) as _,
                 ZludaCreateProcessWithLogonW as _,
            ),
            (
-                &mut CREATE_PROCESS_WITH_TOKEN_W as *mut _ as _,
+                addr_of_mut!(CREATE_PROCESS_WITH_TOKEN_W) as _,
                 ZludaCreateProcessWithTokenW as _,
             ),
         ]);
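All of the `addr_of!`/`addr_of_mut!` rewrites above share one purpose: obtain a raw pointer to a `static mut` without ever creating a `&`/`&mut` reference to it, since such references are warned against by recent compilers (the `static_mut_refs` lint) and are an easy source of undefined behavior. A minimal self-contained sketch of the idiom, using a toy static rather than ZLUDA's globals:

```rust
use std::ptr::{addr_of, addr_of_mut};

static mut COUNTER: u32 = 0;

fn main() {
    unsafe {
        // Old style: `&mut COUNTER as *mut u32` creates an intermediate
        // `&mut` to the static; `addr_of_mut!` goes straight to a raw
        // pointer, which is what the diff above switches to.
        let p: *mut u32 = addr_of_mut!(COUNTER);
        *p += 1;
        // Read-only access works the same way via `addr_of!`.
        let r: *const u32 = addr_of!(COUNTER);
        println!("counter = {}", *r);
    }
}
```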
@@ -845,20 +845,20 @@ unsafe fn initialize_globals(current_module: HINSTANCE) -> bool {
     let driver_store_string = driver_store.to_str().unwrap().to_ascii_uppercase();
     DRIVERSTORE_UTF16 = driver_store_string.encode_utf16().collect::<Vec<u16>>();
     DRIVERSTORE_UTF8 = driver_store_string.into_bytes();
-    if !load_global_string(&PAYLOAD_NVCUDA_GUID, &mut ZLUDA_PATH_UTF8, || {
-        &mut ZLUDA_PATH_UTF16
+    if !load_global_string(&PAYLOAD_NVCUDA_GUID, addr_of_mut!(ZLUDA_PATH_UTF8).as_mut().unwrap(), || {
+        addr_of_mut!(ZLUDA_PATH_UTF16).as_mut().unwrap()
     }) {
         return false;
     }
-    if !load_global_string(&PAYLOAD_NVML_GUID, &mut ZLUDA_ML_PATH_UTF8, || {
-        &mut ZLUDA_ML_PATH_UTF16
+    if !load_global_string(&PAYLOAD_NVML_GUID, addr_of_mut!(ZLUDA_ML_PATH_UTF8).as_mut().unwrap(), || {
+        addr_of_mut!(ZLUDA_ML_PATH_UTF16).as_mut().unwrap()
     }) {
         return false;
     }
-    load_global_string(&PAYLOAD_NVAPI_GUID, &mut ZLUDA_API_PATH_UTF8, || {
+    load_global_string(&PAYLOAD_NVAPI_GUID, addr_of_mut!(ZLUDA_API_PATH_UTF8).as_mut().unwrap(), || {
         ZLUDA_API_PATH_UTF16.get_or_insert(Vec::new())
     });
-    load_global_string(&PAYLOAD_NVOPTIX_GUID, &mut ZLUDA_OPTIX_PATH_UTF8, || {
+    load_global_string(&PAYLOAD_NVOPTIX_GUID, addr_of_mut!(ZLUDA_OPTIX_PATH_UTF8).as_mut().unwrap(), || {
         ZLUDA_OPTIX_PATH_UTF16.get_or_insert(Vec::new())
     });
     true
diff --git a/zluda_rtc/src/lib.rs b/zluda_rtc/src/lib.rs
index 670fdfb..ef75d7a 100644
--- a/zluda_rtc/src/lib.rs
+++ b/zluda_rtc/src/lib.rs
@@ -1,3 +1,4 @@
+#![allow(warnings)]
 mod nvrtc;
 pub use nvrtc::*;

@@ -22,6 +23,7 @@ fn to_nvrtc(status: hiprtc_sys::hiprtcResult) -> nvrtcResult {
         hiprtc_sys::hiprtcResult::HIPRTC_SUCCESS => nvrtcResult::NVRTC_SUCCESS,
         hiprtc_sys::hiprtcResult::HIPRTC_ERROR_INVALID_PROGRAM => nvrtcResult::NVRTC_ERROR_INVALID_PROGRAM,
         hiprtc_sys::hiprtcResult::HIPRTC_ERROR_COMPILATION => nvrtcResult::NVRTC_ERROR_COMPILATION,
+        hiprtc_sys::hiprtcResult::HIPRTC_ERROR_INTERNAL_ERROR => nvrtcResult::NVRTC_ERROR_INTERNAL_ERROR,
         err => panic!("[ZLUDA] HIPRTC failed: {}", err.0),
     }
 }
@@ -31,6 +33,7 @@ fn to_hiprtc(status: nvrtcResult) -> hiprtc_sys::hiprtcResult {
         nvrtcResult::NVRTC_SUCCESS => hiprtc_sys::hiprtcResult::HIPRTC_SUCCESS,
         nvrtcResult::NVRTC_ERROR_INVALID_PROGRAM => hiprtc_sys::hiprtcResult::HIPRTC_ERROR_INVALID_PROGRAM,
         nvrtcResult::NVRTC_ERROR_COMPILATION => hiprtc_sys::hiprtcResult::HIPRTC_ERROR_COMPILATION,
+        nvrtcResult::NVRTC_ERROR_INTERNAL_ERROR => hiprtc_sys::hiprtcResult::HIPRTC_ERROR_INTERNAL_ERROR,
         err => panic!("[ZLUDA] HIPRTC failed: {}", err.0),
     }
 }
@@ -86,7 +89,12 @@ unsafe fn compile_program(
             arguments.push(cstr.as_ptr());
         }
     }
-    to_nvrtc(hiprtcCompileProgram(prog.cast(), arguments.len() as _, arguments.as_mut_ptr()))
+    // TODO
+    to_nvrtc(hiprtcCompileProgram(
+        prog.cast(),
+        arguments.len() as _,
+        arguments.as_mut_ptr(),
+    ))
 }

 unsafe fn get_code_size(prog: nvrtcProgram, code_size_ret: *mut usize) -> nvrtcResult {
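For context on the `to_nvrtc`/`to_hiprtc` pair above: it is a hand-maintained bidirectional mapping that deliberately panics on unmapped values. A self-contained toy model of that pattern, with illustrative stand-in types rather than the real `nvrtc`/`hiprtc` bindings, including the round-trip property such a mapping should satisfy:

```rust
// Toy model of a bidirectional FFI result-code mapping that panics on
// anything it does not know, mirroring the to_nvrtc/to_hiprtc pattern.
#[derive(Clone, Copy, Debug, PartialEq)]
struct NvrtcResult(u32);
#[derive(Clone, Copy, Debug, PartialEq)]
struct HiprtcResult(u32);

// Known pairs; the numeric values here are illustrative.
const PAIRS: &[(NvrtcResult, HiprtcResult)] = &[
    (NvrtcResult(0), HiprtcResult(0)), // SUCCESS
    (NvrtcResult(4), HiprtcResult(4)), // INVALID_PROGRAM
    (NvrtcResult(6), HiprtcResult(6)), // COMPILATION
];

fn to_nvrtc(status: HiprtcResult) -> NvrtcResult {
    PAIRS.iter()
        .find(|(_, h)| *h == status)
        .map(|(n, _)| *n)
        .unwrap_or_else(|| panic!("unmapped HIPRTC result: {:?}", status))
}

fn to_hiprtc(status: NvrtcResult) -> HiprtcResult {
    PAIRS.iter()
        .find(|(n, _)| *n == status)
        .map(|(_, h)| *h)
        .unwrap_or_else(|| panic!("unmapped NVRTC result: {:?}", status))
}

fn main() {
    // Round-trip property: every mapped value survives both conversions.
    for &(n, h) in PAIRS {
        assert_eq!(to_hiprtc(to_nvrtc(h)), h);
        assert_eq!(to_nvrtc(to_hiprtc(n)), n);
    }
    println!("round-trip ok for {} mapped results", PAIRS.len());
}
```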