ZLUDA/ptx/lib/raytracing.hpp
Andrzej Janik 1b9ba2b233 Nobody expects the Red Team
Too many changes to list, but broadly:
* Remove Intel GPU support from the compiler
* Add AMD GPU support to the compiler
* Remove Intel GPU host code
* Add AMD GPU host code
* More device instructions. From 40 to 68
* More host functions. From 48 to 184
* Add proof of concept implementation of OptiX framework
* Add minimal support of cuDNN, cuBLAS, cuSPARSE, cuFFT, NCCL, NVML
* Improve ZLUDA launcher for Windows
2024-02-11 20:45:51 +01:00

1007 lines
39 KiB
C++
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#ifndef SKIP_RAYTRACING_GLOBALS
#include "raytracing_defines.hpp"
#ifndef FUNCTION_NAME
#error FUNCTION_NAME required
#endif
#ifndef EXPORTED_FUNCTION
#error EXPORTED_FUNCTION required
#endif
#ifndef EXPORTED_ATTRIBUTE_FUNCTION
#error EXPORTED_ATTRIBUTE_FUNCTION required
#endif
#ifndef EXPORTED_KERNEL
#error EXPORTED_KERNEL required
#endif
#define FUNC(NAME) __attribute__((device)) __zluda_rt_ptx_impl__##NAME
#define GLOBAL_SPACE __attribute__((address_space(1)))
#define SHARED_SPACE __attribute__((address_space(3)))
#define CONSTANT_SPACE __attribute__((address_space(4)))
#define PRIVATE_SPACE __attribute__((address_space(5)))
#define EXPORTED(X) X ## EXPORT_SUFFIX
constexpr float INFINITY = __builtin_inff();
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
#endif
#define ZLUDA_RT(NAME) __zluda_rt_ptx_impl__##NAME
// This function is for argument passing for compatibility with LLVM
// We still use float3 in fields because sizeof(zluda_float3) == 16
typedef float zluda_float3 __attribute__((ext_vector_type(3)));
#define ZLUDA_NAN __int_as_float(0x7fffffff)
struct OptixBuffer {
uint8_t *ptr;
uint64_t x;
uint64_t y;
};
struct OptixTransform {
float transform_matrix[16];
float inverse_transform_matrix[16];
};
struct OptixRay {
// We don't use zluda_float3 below because sizeof(zluda_float3) == 16
float3 origin;
float3 direction;
unsigned int ray_type;
float tmin;
float tmax;
};
static_assert((sizeof(OptixRay) == 36), "");
typedef uint32_t (*raytracing_fn)(
/* vvv INJECTED vvv */
uint8_t SHARED_SPACE* global_state,
uint32_t prim_idx,
uint2::Native_vec_ launch_idx,
uint2::Native_vec_ launch_dim,
OptixRay PRIVATE_SPACE* current_ray,
float current_time,
uint8_t PRIVATE_SPACE* payload,
float current_distance,
float2::Native_vec_ barycentric, // in case it is an attribute function
zluda_float3 normals, // in case it is an attribute function
uint8_t GLOBAL_SPACE* variable_block,
uint8_t GLOBAL_SPACE* attribute_block,
uint8_t GLOBAL_SPACE* transform_block
);
typedef uint32_t (*attribute_fn)(
/* vvv INJECTED vvv */
uint8_t SHARED_SPACE* global_state,
uint32_t prim_idx,
uint2::Native_vec_ launch_idx,
uint2::Native_vec_ launch_dim,
OptixRay PRIVATE_SPACE* current_ray,
float current_time,
uint8_t PRIVATE_SPACE* payload,
float current_distance,
float2::Native_vec_ barycentric,
zluda_float3 normals,
uint8_t GLOBAL_SPACE* variable_block,
uint8_t GLOBAL_SPACE* attribute_block,
uint8_t GLOBAL_SPACE* transform_block,
/* vvv EXTRA INJECTED vvv */
uint64_t any_hit_fn
);
typedef void(*bounding_box_fn)(
uint32_t prim_idx,
float GLOBAL_SPACE* result,
/* vvv INJECTED vvv */
uint8_t SHARED_SPACE* global_state,
uint8_t GLOBAL_SPACE* variable_block
);
/*
uint32_t variable uint32_t uint32_t variable variable
┌┬─────────────────┬┬────────────────────┬┬──────────────────┬──────────────────┬─────┬┬──────────────────────┬──────────────────────┬─────┬┐
││ materials_start ││var_block_intersect ││ offset_mat0_ray0 │ offset_mat0_ray1 │ ... ││ prog_block_mat0_ray0 │ prog_block_mat0_ray1 │ ... ││
└┴────────┬────────┴┴────────────────────┘▲─────────┬────────┴────────┬─────────┴─────┘▲──────────────────────▲──────────────────────┴─────┴┘
│ │ │ │ │ │
└───────────────────────────────┘ │ └────────────────┼──────────────────────┘
│ │
└──────────────────────────────────┘
*/
struct IntersectionInput {
uint8_t GLOBAL_SPACE* transform_block;
uint32_t materials_start;
};
struct IntersectionPayload {
uint32_t result;
uint32_t ray_type;
float tmin;
float tmax;
float time;
uint8_t SHARED_SPACE* global_state;
uint8_t PRIVATE_SPACE* payload;
float closest_distance;
uint32_t material;
uint8_t GLOBAL_SPACE* attribute_block;
};
template <typename T>
__device__ static inline T get_program_block_inner(
void GLOBAL_SPACE *offset_block,
uint32_t prim_idx
) {
uint32_t byte_offset = reinterpret_cast<uint32_t GLOBAL_SPACE *>(offset_block)[prim_idx];
if (byte_offset == ~uint32_t(0))
return nullptr;
uint8_t GLOBAL_SPACE *program_block = reinterpret_cast<uint8_t GLOBAL_SPACE *>(offset_block) + byte_offset;
return (T)(program_block);
}
template <typename T>
__device__ static inline T get_program_block_outer(
void GLOBAL_SPACE *offset_block,
uint32_t id,
uint32_t subid
) {
uint32_t byte_offset = reinterpret_cast<uint32_t GLOBAL_SPACE*>(offset_block)[id];
if (byte_offset == ~uint32_t(0))
return nullptr;
if (byte_offset & 1U) { // is bottom bit set?
byte_offset &= ~(1U); // clear bottom bit
uint8_t GLOBAL_SPACE* inner_chain = reinterpret_cast<uint8_t GLOBAL_SPACE*>(offset_block) + byte_offset;
return get_program_block_inner<T>(
inner_chain,
subid
);
} else {
uint8_t GLOBAL_SPACE* program_block = reinterpret_cast<uint8_t GLOBAL_SPACE*>(offset_block) + byte_offset;
return (T)(program_block);
}
}
__device__ static inline uint32_t call_raytracing_fn_inner(
uint8_t SHARED_SPACE* global_state,
void GLOBAL_SPACE* offset_block,
uint32_t prim_idx,
uint2::Native_vec_ launch_idx,
uint2::Native_vec_ launch_dim,
OptixRay PRIVATE_SPACE* current_ray,
float current_time,
uint8_t PRIVATE_SPACE* payload,
float current_distance,
float2::Native_vec_ barycentric,
zluda_float3 normals,
uint8_t GLOBAL_SPACE* attribute_block,
uint8_t GLOBAL_SPACE* transform_block
) {
uint32_t byte_offset = reinterpret_cast<uint32_t GLOBAL_SPACE*>(offset_block)[prim_idx];
if (byte_offset == ~uint32_t(0))
return 0;
uint8_t GLOBAL_SPACE* program_block = reinterpret_cast<uint8_t GLOBAL_SPACE*>(offset_block) + byte_offset;
raytracing_fn GLOBAL_SPACE* fn = reinterpret_cast<raytracing_fn GLOBAL_SPACE*>(program_block);
return (*fn)(
global_state,
prim_idx,
launch_idx,
launch_dim,
current_ray,
current_time,
payload,
current_distance,
barycentric,
normals,
program_block,
attribute_block,
transform_block
);
}
template<typename T>
__device__ void GLOBAL_SPACE* byte_offset(T GLOBAL_SPACE* ptr, uint32_t offset) {
uint8_t GLOBAL_SPACE* byte_ptr = reinterpret_cast<uint8_t GLOBAL_SPACE*>(ptr);
return reinterpret_cast<void GLOBAL_SPACE*>(byte_ptr + offset);
}
// https://stackoverflow.com/a/9194117
__device__ static inline int ZLUDA_RT(round_up)(int number, int multiple)
{
return (number + multiple - 1) & -multiple;
}
struct BvhDetails;
struct GlobalState {
BvhDetails CONSTANT_SPACE* scenes;
void CONSTANT_SPACE* miss_programs;
OptixBuffer CONSTANT_SPACE* buffers;
uint32_t GLOBAL_SPACE* callable_programs;
hipTextureObject_t CONSTANT_SPACE* textures;
uint32_t ray_type_count;
uint32_t uv_attribute_offset;
uint32_t width;
uint32_t height;
uint16_t attribute_block_size; // in DWORDs
uint16_t attribute_block_align; // in DWORDs
uint16_t thread_global_stack_size;
uint16_t _padding;
};
static_assert((sizeof(GlobalState) == 64), "");
// We are executing with 256 VGPRs/per wavefront (due to virtual calls), consequently
// * Dual CU has four SIMDs
// * One SIMD has 1024 VGPRs
// * At maximum 16 wavefronts / dual CU
// * Dual CU has 128kB LDS
// * At minimum 8kBs LDS per wavefront
// * At minimum 2048 dwords LDS per wavefront
// * At minimum 64 dwords per thread
// We might raise LDS/thread in the future
struct OptixStack {
constexpr static auto LOCAL_STACK_SIZE = 61;
constexpr static auto LOCAL_STACK_BITS = 6;
constexpr static auto LOCAL_STACK_MASK = 63;
GlobalState globals;
// Bottom 6 bits are for LDS, stack, top 10 bits are for global stack
// Both are in dwords
uint32_t stack_pointers[32];
uint32_t stacks[32 * LOCAL_STACK_SIZE];
uint32_t GLOBAL_SPACE* global_stack;
__device__ uint32_t& stack_pointer(void) {
return this->stack_pointers[__lane_id()];
}
// does not change the value of stack offset
__device__ uint32_t soft_push_attribute_block(uint8_t GLOBAL_SPACE** stack_pointer) {
uint32_t size = this->globals.attribute_block_size;
uint32_t align = this->globals.attribute_block_align;
uint32_t original_stack_pointer = this->stack_pointer();
uint32_t global_stack_offset = original_stack_pointer >> LOCAL_STACK_BITS;
uint32_t aligned_start_in_dwords = uint32_t(ZLUDA_RT(round_up)(int32_t(global_stack_offset), int32_t(align)));
uint32_t aligned_start = this->globals.thread_global_stack_size * uint32_t(__lane_id()) + aligned_start_in_dwords;
*stack_pointer = (uint8_t GLOBAL_SPACE*)&this->global_stack[aligned_start];
return (original_stack_pointer & LOCAL_STACK_MASK) | ((aligned_start_in_dwords + size) << LOCAL_STACK_BITS);
}
__device__ void init(
uint32_t local_x,
uint32_t local_y,
uint32_t wavefront_id,
GlobalState globals_,
void GLOBAL_SPACE* global_stack_space
) {
if (local_x == 0 && local_y == 0) {
this->globals = globals_;
}
if (__lane_id() == 0)
{
uint32_t wavefront_stack_offset = globals_.thread_global_stack_size * 32 * wavefront_id;
this->global_stack = &((uint32_t GLOBAL_SPACE*)global_stack_space)[wavefront_stack_offset];
}
this->stack_pointers[__lane_id()] = 0;
}
};
static_assert((sizeof(OptixStack) <= 8192), "");
static __shared__ OptixStack ZLUDA_RT(STACK);
static __shared__ GlobalState ZLUDA_RT(BB_GLOBALS);
#ifndef SKIP_RAYTRACING_GLOBALS
/*
uint32_t uint32_t uint32_t uint32_t variable variable uint32_t variable
┌┬───────────────┬┬───────────────────┬┬─────────┬┬───────────┬───────────┬─────┬┬─────────────────┬─────────────────┬─────┬┬──────────────┬─────┬┬─────────────────┬─────┬┐
││ any_hit_start ││ closest_hit_start ││ padding ││ any_hit_0 │ any_hit_1 │ ... ││ prog_block_ah_0 │ prog_block_ah_1 │ ... ││ closet_hit_0 │ ... ││ prog_block_ch_0 │ ... ││
└┴────┬──────────┴┴────────┬──────────┴┴─────────┘▲────────┬──┴────────┬──┴─────┘▲─────────────────▲─────────────────┴─────┘►──────┬───────┴─────┘▲─────────────────┴─────┴┘
│ │ │ │ │ │ │ │ │ │
│ │ │ │ └─────────┼─────────────────┘ │ └──────────────┘
│ │ │ │ │ │
└────────────────────┼──────────────────────┘ └─────────────────────┘ │
│ │
└────────────────────────────────────────────────────────────────────────────────────────────────┘
*/
// Due to alignment there might be padding between those two variables and the chain proper
struct HitProgramChain {
uint32_t any_hit_start;
uint32_t closest_hit_start;
};
struct BvhDetails {
hiprtScene scene;
hiprtCustomFuncTable func_set;
void CONSTANT_SPACE* attribute_programs;
uint8_t GLOBAL_SPACE* GLOBAL_SPACE* transform_blocks;
HitProgramChain GLOBAL_SPACE* GLOBAL_SPACE* hit_chains;
};
template<typename T>
__device__ T GLOBAL_SPACE* global_space_cast(T CONSTANT_SPACE* ptr) {
return (T GLOBAL_SPACE*)(ptr);
}
extern "C" {
struct ZLUDA_RT(TraversalStack) {
__device__ int pop() {
OptixStack SHARED_SPACE* stack = (OptixStack SHARED_SPACE*)0;
if (global_offset > (bottom_pointer >> OptixStack::LOCAL_STACK_BITS)) {
return thread_global_stack[--global_offset];
} else {
auto pos = (__lane_id() * OptixStack::LOCAL_STACK_SIZE) + --shared_offset;
return stack->stacks[pos];
}
}
__device__ void push( int val ) {
OptixStack SHARED_SPACE* stack = (OptixStack SHARED_SPACE*)0;
if (shared_offset != OptixStack::LOCAL_STACK_SIZE) {
auto pos = (__lane_id() * OptixStack::LOCAL_STACK_SIZE) + shared_offset++;
stack->stacks[pos] = val;
} else {
thread_global_stack[global_offset++] = val;
}
}
__device__ bool empty() const {
return (global_offset == (bottom_pointer >> OptixStack::LOCAL_STACK_BITS))
&& (shared_offset == (bottom_pointer & OptixStack::LOCAL_STACK_MASK));
}
__device__ int vacancy() const {
int32_t global_vacancy = int32_t(global_stack_size) - int32_t(global_offset);
int32_t local_vacancy = int32_t(OptixStack::LOCAL_STACK_SIZE) - int32_t(shared_offset);
int32_t vacancy = global_vacancy + local_vacancy;
return vacancy;
}
__device__ void reset() {
global_offset = (bottom_pointer >> OptixStack::LOCAL_STACK_BITS);
shared_offset = (bottom_pointer & OptixStack::LOCAL_STACK_MASK);
}
__device__ ZLUDA_RT(TraversalStack)(uint32_t stack_pointer) : bottom_pointer(stack_pointer) {
OptixStack SHARED_SPACE* stack = (OptixStack SHARED_SPACE*)0;
global_stack_size = stack->globals.thread_global_stack_size;
thread_global_stack = &stack->global_stack[global_stack_size * __lane_id()];
reset();
}
uint32_t GLOBAL_SPACE* thread_global_stack;
const uint32_t bottom_pointer;
uint32_t global_stack_size;
uint32_t global_offset;
uint16_t shared_offset;
};
/*
uint32_t uint32_t variable variable
┌┬────────┬────────┬─────┬┬───────────────────┬───────────────────┬─────┬┐
││ miss_0 │ miss_1 │ ... ││ prog_block_miss_0 │ prog_block_miss_1 │ ... ││
└┴───┬────┴───┬────┴─────┘▲───────────────────▲───────────────────┴─────┴┘
│ │ │ │
│ └───────────┼───────────────────┘
│ │
└────────────────────┘
*/
// HACK START
}
using BoxNode = hiprt::BoxNode;
using Ray = hiprt::Ray;
using InstanceNode = hiprt::InstanceNode;
using CustomNode = hiprt::CustomNode;
using TriangleNode = hiprt::TriangleNode;
using GeomHeader = hiprt::GeomHeader;
using Frame = hiprt::Frame;
using SceneHeader = hiprt::SceneHeader;
using Transform = hiprt::Transform;
template <typename Stack>
using TraversalBase = hiprt::TraversalBase<Stack>;
template<u32 StackSize>
using PrivateStack = hiprt::PrivateStack<StackSize>;
struct IntersectResult {
bool hit;
float t;
float2 uv;
float3 normal;
};
struct hiprtHit2 {
uint32_t instanceID;
uint32_t primID;
float2 uv;
float3 normal;
float t;
float3 origin;
float3 direction;
HIPRT_DEVICE bool hasHit() const { return primID != hiprtInvalidValue; }
};
typedef IntersectResult ( *hiprtIntersectFunc2 )(hiprtRay ray, u32 primIdx, const void* data, void* payload );
template <typename Stack, hiprtTraversalType TraversalType>
class SceneTraversal2 : public TraversalBase<Stack>
{
public:
HIPRT_DEVICE SceneTraversal2( hiprtScene scene, const hiprtRay& ray, hiprtRayMask mask, Stack& stack );
HIPRT_DEVICE SceneTraversal2(
hiprtScene scene, const hiprtRay& ray, hiprtRayMask mask, hiprtCustomFuncTable funcTable, Stack& stack, void* payload = nullptr );
HIPRT_DEVICE void transformRay( Ray& ray, float3& invD, float3& oxInvD, const InstanceNode& node ) const;
HIPRT_DEVICE void restoreRay( Ray& ray, float3& invD, float3& oxInvD ) const;
HIPRT_DEVICE void set_ray_maxT(float x)
{
m_ray.maxT = x;
}
HIPRT_DEVICE bool testLeafNode(
const Ray& ray,
const float3& invD,
int leafIndex,
int shapeId,
int& hitIndex,
float2& hitUv,
float3& hitNormal,
float& hitT );
HIPRT_DEVICE hiprtHit2 getNextHit2();
HIPRT_DEVICE hiprtHit getNextHit() {
abort();
return hiprtHit {};
}
protected:
using TraversalBase<Stack>::m_ray;
using TraversalBase<Stack>::m_state;
using TraversalBase<Stack>::m_stack;
using TraversalBase<Stack>::m_payload;
#if defined( __USE_HWI__ )
using TraversalBase<Stack>::m_descriptor;
#endif
int m_nodeIndex;
int m_instanceIndex;
BoxNode* m_boxNodes;
InstanceNode* m_instanceNodes;
GeomHeader** m_geoms;
Frame* m_frames;
hiprtRayMask m_mask;
hiprtCustomFuncTable m_funcTable;
};
template <typename Stack, hiprtTraversalType TraversalType>
HIPRT_DEVICE SceneTraversal2<Stack, TraversalType>::SceneTraversal2( hiprtScene scene, const hiprtRay& ray, hiprtRayMask mask, Stack& stack )
: TraversalBase<Stack>( ray, stack, nullptr ), m_funcTable( nullptr ), m_mask( mask ), m_instanceIndex( hiprtInvalidValue ), m_nodeIndex( hiprt::RootIndex )
{
SceneHeader* data = (SceneHeader*)scene;
m_boxNodes = data->m_boxNodes;
m_instanceNodes = data->m_primNodes;
m_geoms = data->m_geoms;
m_frames = data->m_frames;
m_stack.reset();
}
template <typename Stack, hiprtTraversalType TraversalType>
HIPRT_DEVICE SceneTraversal2<Stack, TraversalType>::SceneTraversal2(
hiprtScene scene, const hiprtRay& ray, hiprtRayMask mask, hiprtCustomFuncTable funcTable, Stack& stack, void* payload )
: TraversalBase<Stack>( ray, stack, payload ), m_funcTable( funcTable ), m_mask( mask ), m_instanceIndex( hiprtInvalidValue ),
m_nodeIndex( hiprt::RootIndex )
{
SceneHeader* data = (SceneHeader*)scene;
m_boxNodes = data->m_boxNodes;
m_instanceNodes = data->m_primNodes;
m_geoms = data->m_geoms;
m_frames = data->m_frames;
m_stack.reset();
}
__device__ static inline float3 invTransform2(Frame& frame, const float3& p )
{
float3 result = p;
result = result - frame.m_translation;
result = hiprt::qtInvRotate( hiprt::qtFromAxisAngle( frame.m_rotation ), result );
result = result / frame.m_scale;
return result;
}
__device__ static inline Ray transformRay2(Transform& transform, Ray& ray)
{
Frame frame = transform.interpolateFrames( ray.m_time );
if ( frame.m_translation.x != 0.0f || frame.m_translation.y != 0.0f || frame.m_translation.z != 0.0f
|| frame.m_rotation.w != 0.0f
|| frame.m_scale.x != 1.0f || frame.m_scale.y != 1.0f || frame.m_scale.z != 1.0f )
{
Ray outRay;
outRay.m_origin = invTransform2(frame, ray.m_origin );
outRay.m_direction = invTransform2(frame, ray.m_origin + ray.m_direction );
outRay.m_direction = outRay.m_direction - outRay.m_origin;
outRay.m_time = ray.m_time;
outRay.m_maxT = ray.m_maxT;
return outRay;
}
else
{
return ray;
}
}
template <typename Stack, hiprtTraversalType TraversalType>
HIPRT_DEVICE void SceneTraversal2<Stack, TraversalType>::transformRay(
Ray& ray, float3& invD, float3& oxInvD, const InstanceNode& node ) const
{
Transform tr( m_frames + node.m_frameIndex, node.m_frameCount );
ray = transformRay2( tr, ray );
invD = hiprt::safeInv( ray.m_direction );
oxInvD = -ray.m_origin * invD;
}
template <typename Stack, hiprtTraversalType TraversalType>
HIPRT_DEVICE void
SceneTraversal2<Stack, TraversalType>::restoreRay( Ray& ray, float3& invD, float3& oxInvD ) const
{
ray.m_origin = m_ray.origin;
ray.m_direction = m_ray.direction;
invD = hiprt::safeInv( m_ray.direction );
oxInvD = -m_ray.origin * invD;
}
template <typename Stack, hiprtTraversalType TraversalType>
HIPRT_DEVICE bool SceneTraversal2<Stack, TraversalType>::testLeafNode(
const Ray& ray,
const float3& invD,
int leafIndex,
int instanceId,
int& hitIndex,
float2& hitUv,
float3& hitNormal,
float& hitT )
{
bool hit = false;
const GeomHeader* geom = m_geoms[instanceId];
int type = geom->m_customType;
if ( type == hiprtInvalidValue )
{
TriangleNode node = ( (TriangleNode*)geom->m_primNodes )[hiprt::getNodeAddr( leafIndex )];
#if !defined( __USE_HWI__ )
hit = node.m_triangle.intersect( ray, hitUv, hitT );
if ( hit )
{
hitIndex = node.m_primIndex;
hitNormal = node.m_triangle.normal();
}
#else
auto result = __builtin_amdgcn_image_bvh_intersect_ray_l(
hiprt::encodeBaseAddr( geom->m_primNodes, leafIndex ),
ray.m_maxT,
make_float4( ray.m_origin.x, ray.m_origin.y, ray.m_origin.z, ray.m_origin.z ).data,
make_float4( ray.m_direction.x, ray.m_direction.y, ray.m_direction.z, ray.m_direction.z ).data,
make_float4( invD.x, invD.y, invD.z, invD.z ).data,
m_descriptor.data );
float invDenom = __ocml_native_recip_f32( __int_as_float( result[1] ) );
float t = __int_as_float( result[0] ) * invDenom;
hit = t <= ray.m_maxT;
if ( hit )
{
hitT = t;
hitUv.x = __int_as_float( result[2] ) * invDenom;
hitUv.y = __int_as_float( result[3] ) * invDenom;
hitIndex = node.m_primIndex;
hitNormal = node.m_triangle.normal();
}
#endif
}
else
{
CustomNode node = ( (CustomNode*)geom->m_primNodes )[hiprt::getNodeAddr( leafIndex )];
hiprtCustomFuncSet* funcTable = (hiprtCustomFuncSet*)m_funcTable;
IntersectResult result = ((hiprtIntersectFunc2)(funcTable[type].intersectFunc))(
reinterpret_cast<const hiprtRay&>(ray),
node.m_primIndex,
funcTable[type].intersectFuncData,
m_payload );
if ( result.hit ) {
hitIndex = node.m_primIndex;
hit = true;
hitT = result.t;
hitUv = result.uv;
hitNormal = result.normal;
}
}
return hit;
}
template <typename Stack, hiprtTraversalType TraversalType>
HIPRT_DEVICE hiprtHit2 SceneTraversal2<Stack, TraversalType>::getNextHit2()
{
int instanceId = hiprtInvalidValue;
int nodeIndex = m_nodeIndex;
int instanceIndex = m_instanceIndex;
Ray ray = *(Ray*)&m_ray;
float3 invD, oxInvD;
if ( instanceIndex == hiprtInvalidValue )
{
restoreRay( ray, invD, oxInvD );
}
else
{
InstanceNode node = m_instanceNodes[hiprt::getNodeAddr( instanceIndex )];
instanceId = node.m_primIndex;
transformRay( ray, invD, oxInvD, node );
}
int hitInstanceId = hiprtInvalidValue;
int hitInstanceIndex = hiprtInvalidValue;
int hitPrimIndex = hiprtInvalidValue;
float hitT = 0.0f;
float2 hitUv = make_float2( 0.0f, 0.0f );
float3 hitNormal;
if ( m_stack.empty() ) m_stack.push( hiprtInvalidValue );
while ( nodeIndex != hiprtInvalidValue && m_state != hiprtTraversalStateStackOverflow )
{
if ( hiprt::isInternalNode( nodeIndex ) )
{
BoxNode* nodes = ( instanceId == hiprtInvalidValue ) ? m_boxNodes : m_geoms[instanceId]->m_boxNodes;
if ( this->testInternalNode( ray, invD, oxInvD, nodes, nodeIndex ) )
continue;
}
else
{
if ( instanceId != hiprtInvalidValue )
{
if ( testLeafNode( ray, invD, nodeIndex, instanceId, hitPrimIndex, hitUv, hitNormal, hitT ) )
{
if constexpr ( TraversalType == hiprtTraversalTerminateAtAnyHit )
{
m_nodeIndex = m_stack.pop();
m_instanceIndex = instanceIndex;
m_state = hiprtTraversalStateHit;
if ( m_nodeIndex == hiprtInvalidValue && !m_stack.empty() )
{
m_instanceIndex = hiprtInvalidValue;
m_nodeIndex = m_stack.pop();
}
InstanceNode& node = m_instanceNodes[hiprt::getNodeAddr( instanceIndex )];
Transform tr( m_frames + node.m_frameIndex, node.m_frameCount );
hitNormal = tr.transformNormal( hitNormal, ray.m_time );
return hiprtHit2
{
(u32)instanceId,
(u32)hitPrimIndex,
hitUv,
hitNormal,
hitT,
ray.m_origin,
ray.m_direction
};
}
else
{
hitInstanceId = instanceId;
hitInstanceIndex = instanceIndex;
ray.m_maxT = hitT;
}
}
}
else
{
InstanceNode node = m_instanceNodes[hiprt::getNodeAddr( nodeIndex )];
if ( node.m_mask & m_mask )
{
if ( instanceId != node.m_primIndex )
{
if ( m_stack.vacancy() < 1 )
{
m_state = hiprtTraversalStateStackOverflow;
continue;
}
m_stack.push( hiprtInvalidValue );
}
instanceIndex = nodeIndex;
nodeIndex = hiprt::RootIndex;
instanceId = node.m_primIndex;
transformRay( ray, invD, oxInvD, node );
continue;
}
}
}
nodeIndex = m_stack.pop();
if ( nodeIndex == hiprtInvalidValue && !m_stack.empty() )
{
instanceIndex = hiprtInvalidValue;
instanceId = hiprtInvalidValue;
nodeIndex = m_stack.pop();
restoreRay( ray, invD, oxInvD );
}
}
hiprtHit2 result{ hiprtInvalidValue, hiprtInvalidValue, { 0.0f, 0.0f }, { 0.0f, 0.0f, 0.0f }, -1.0f, { 0.0f, 0.0f, 0.0f }, { 0.0f, 0.0f, 0.0f } };
if ( hitInstanceId != hiprtInvalidValue )
{
InstanceNode node = m_instanceNodes[hiprt::getNodeAddr( hitInstanceIndex )];
Transform tr( m_frames + node.m_frameIndex, node.m_frameCount );
result.t = hitT;
result.instanceID = hitInstanceId;
result.primID = hitPrimIndex;
result.uv = hitUv;
result.normal = tr.transformNormal( hitNormal, ray.m_time );
result.origin = ray.m_origin;
result.direction = ray.m_direction;
}
if ( m_state != hiprtTraversalStateStackOverflow )
m_state = hiprtTraversalStateFinished;
return result;
}
__device__ static inline void ZLUDA_RT(set_attr_barycentrics)(
uint8_t GLOBAL_SPACE* attr_block,
float2 value
) {
OptixStack SHARED_SPACE* stack = (OptixStack SHARED_SPACE*)0;
if (stack->globals.uv_attribute_offset != ~uint32_t(0)) {
*(float2*)(attr_block + stack->globals.uv_attribute_offset) = value;
}
}
extern "C" {
// HACK END
__device__ uint32_t FUNC(_rt_trace_time_mask_flags_64) (
uint32_t bvh_index,
float ray_origin_x,
float ray_origin_y,
float ray_origin_z,
float ray_direction_x,
float ray_direction_y,
float ray_direction_z,
uint32_t ray_type,
float tmin,
float tmax,
float time,
uint32_t mask,
uint32_t flags,
uint64_t payload_ptr,
uint32_t payload_size,
/* vvv INJECTED vvv */
uint8_t SHARED_SPACE* global_state
) {
OptixStack SHARED_SPACE* stack = (OptixStack SHARED_SPACE*)global_state;
uint32_t launch_x = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t launch_y = blockIdx.y * blockDim.y + threadIdx.y;
uint2::Native_vec_ launch_idx = uint2(launch_x, launch_y).data;
uint32_t size_x = stack->globals.width;
uint32_t size_y = stack->globals.height;
uint2::Native_vec_ launch_dim = uint2(size_x, size_y).data;
hiprtRay ray {
make_float3(ray_origin_x, ray_origin_y, ray_origin_z),
time,
make_float3(ray_direction_x, ray_direction_y, ray_direction_z),
tmax
};
OptixRay optix_ray {
make_float3(ray_origin_x, ray_origin_y, ray_origin_z),
make_float3(ray_direction_x, ray_direction_y, ray_direction_z),
ray_type,
tmin,
tmax
};
PRIVATE_SPACE uint8_t* payload = reinterpret_cast<PRIVATE_SPACE uint8_t*>(payload_ptr);
uint32_t ray_type_count = stack->globals.ray_type_count;
BvhDetails CONSTANT_SPACE* bvh = &stack->globals.scenes[bvh_index];
HitProgramChain GLOBAL_SPACE* programs_chain = bvh->hit_chains[ray_type];
IntersectionPayload intersection_payload {
0,
ray_type,
tmin,
tmax,
time,
global_state,
payload,
tmax,
~uint32_t(0)
};
uint32_t old_stack_bottom = ((OptixStack*)stack)->stack_pointer();
uint32_t new_stack_bottom = ((OptixStack*)stack)->soft_push_attribute_block(&intersection_payload.attribute_block);
ZLUDA_RT(TraversalStack) rt_stack(new_stack_bottom);
if (rt_stack.vacancy() < 1) {
abort();
}
SceneTraversal2<ZLUDA_RT(TraversalStack), hiprtTraversalTerminateAtAnyHit> tr { bvh->scene, ray, mask, bvh->func_set, rt_stack, &intersection_payload };
unsigned int closest_instance_id = hiprtInvalidValue;
unsigned int closest_subinstance_id = 0;
attribute_fn* closest_attr_fn = nullptr;
float2 closest_uv {};
float3 closest_normals(ZLUDA_NAN, ZLUDA_NAN, ZLUDA_NAN);
while (true) {
intersection_payload.result = 0;
intersection_payload.material = ~uint32_t(0);
hiprtHit2 hit = tr.getNextHit2();
if (tr.getCurrentState() == hiprtTraversalStateStackOverflow) {
abort();
}
if (!hit.hasHit())
break;
if (hit.t < tmin)
continue;
tr.set_ray_maxT(hit.t);
// Successful custom intersection hit
if (intersection_payload.material != ~uint32_t(0)) {
if (intersection_payload.result != 1) {
closest_attr_fn = nullptr;
closest_normals = {ZLUDA_NAN, ZLUDA_NAN, ZLUDA_NAN};
closest_instance_id = hit.instanceID;
closest_subinstance_id = intersection_payload.material;
intersection_payload.closest_distance = hit.t;
}
if (intersection_payload.result == 2)
break;
continue;
}
attribute_fn* attr_fn = get_program_block_inner<attribute_fn*>(
global_space_cast(bvh->attribute_programs),
hit.instanceID
);
raytracing_fn* anyhit_fn = get_program_block_outer<raytracing_fn*>(
byte_offset(programs_chain, programs_chain->any_hit_start),
hit.instanceID,
hit.primID
);
uint32_t intersection_result = 0;
if (attr_fn != nullptr) {
OptixRay transformed_ray {
{ hit.origin.x, hit.origin.y, hit.origin.z },
{ hit.direction.x, hit.direction.y, hit.direction.z },
ray_type,
tmin,
tmax
};
intersection_result = (*attr_fn)(
global_state,
hit.primID,
launch_idx,
launch_dim,
(OptixRay PRIVATE_SPACE*)(&transformed_ray),
time,
payload,
hit.t,
hit.uv.data,
zluda_float3{hit.normal.x, hit.normal.y, hit.normal.z},
(uint8_t GLOBAL_SPACE *)(attr_fn),
intersection_payload.attribute_block,
bvh->transform_blocks[hit.instanceID],
reinterpret_cast<uint64_t>(anyhit_fn)
);
} else {
ZLUDA_RT(set_attr_barycentrics)(intersection_payload.attribute_block, hit.uv);
if (anyhit_fn != nullptr) {
OptixRay transformed_ray {
{ hit.origin.x, hit.origin.y, hit.origin.z },
{ hit.direction.x, hit.direction.y, hit.direction.z },
ray_type,
tmin,
tmax
};
intersection_result = (*anyhit_fn)(
global_state,
hit.primID,
launch_idx,
launch_dim,
(OptixRay PRIVATE_SPACE*)(&transformed_ray),
time,
payload,
hit.t,
hit.uv.data,
zluda_float3{hit.normal.x, hit.normal.y, hit.normal.z},
(uint8_t GLOBAL_SPACE *)(anyhit_fn),
intersection_payload.attribute_block,
bvh->transform_blocks[hit.instanceID]
);
}
}
if (intersection_result >= 1024) {
return intersection_result;
}
if (intersection_result != 1) {
closest_attr_fn = attr_fn;
closest_uv = hit.uv;
closest_normals = hit.normal;
closest_instance_id = hit.instanceID;
closest_subinstance_id = hit.primID;
intersection_payload.closest_distance = hit.t;
}
if (intersection_result == 2)
break;
}
if (closest_instance_id != hiprtInvalidValue) {
raytracing_fn* closest_hit_fn = get_program_block_outer<raytracing_fn*>(
byte_offset(programs_chain, programs_chain->closest_hit_start),
closest_instance_id,
closest_subinstance_id
);
if (closest_hit_fn != nullptr) {
((OptixStack*)stack)->stack_pointer() = new_stack_bottom;
uint32_t result = 0;
if (closest_attr_fn != nullptr) {
result = (*closest_attr_fn)(
global_state,
closest_subinstance_id,
launch_idx,
launch_dim,
(OptixRay PRIVATE_SPACE*)(&optix_ray),
time,
payload,
intersection_payload.closest_distance,
closest_uv.data,
zluda_float3{closest_normals.x, closest_normals.y, closest_normals.z},
(uint8_t GLOBAL_SPACE *)(closest_attr_fn),
intersection_payload.attribute_block,
bvh->transform_blocks[closest_instance_id],
reinterpret_cast<uint64_t>(closest_hit_fn)
);
} else {
result = (*closest_hit_fn)(
global_state,
closest_subinstance_id,
launch_idx,
launch_dim,
(OptixRay PRIVATE_SPACE*)(&optix_ray),
time,
payload,
intersection_payload.closest_distance,
float2(0.0, 0.0).data,
zluda_float3{closest_normals.x, closest_normals.y, closest_normals.z},
(uint8_t GLOBAL_SPACE *)(closest_hit_fn),
intersection_payload.attribute_block,
bvh->transform_blocks[closest_instance_id]
);
}
if (result >= 1024) {
return result;
}
((OptixStack*)stack)->stack_pointer() = old_stack_bottom;
}
return 0;
} else {
return call_raytracing_fn_inner(
global_state,
global_space_cast(stack->globals.miss_programs),
ray_type,
launch_idx,
launch_dim,
(OptixRay PRIVATE_SPACE*)(&optix_ray),
0.0,
payload,
0.0,
float2(0.0, 0.0).data,
zluda_float3{0.0, 0.0, 0.0},
nullptr,
nullptr
);
}
}
__global__ void ZLUDA_RT(generate_bounding_box)(
GlobalState globals,
bounding_box_fn bb_func,
uint32_t primitive_count,
float GLOBAL_SPACE* output,
uint8_t GLOBAL_SPACE* variable_block
) {
asm ("" : : : "s105");
asm ("" : : : "v255");
if (threadIdx.x == 0)
ZLUDA_RT(BB_GLOBALS) = globals;
__syncthreads();
uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
if (x >= primitive_count)
return;
bb_func(x, &output[x*8], (uint8_t SHARED_SPACE*)&ZLUDA_RT(BB_GLOBALS), variable_block);
}
}
#endif