diff --git a/application/platforms/CMakeLists.txt b/application/platforms/CMakeLists.txt
index 28619c62b..0ca56f3cc 100644
--- a/application/platforms/CMakeLists.txt
+++ b/application/platforms/CMakeLists.txt
@@ -31,6 +31,9 @@ elseif (${GRANITE_PLATFORM} MATCHES "SDL")
     #find_package(SDL3 REQUIRED CONFIG REQUIRED COMPONENTS SDL3-shared)
     #target_link_libraries(granite-platform PRIVATE SDL3::SDL3-shared)
     target_link_libraries(granite-platform PRIVATE SDL3-static granite-input-sdl)
+    if (NOT WIN32)
+        target_link_libraries(granite-platform PRIVATE dl)
+    endif()
 else()
     message(FATAL "GRANITE_PLATFORM is not set.")
 endif()
diff --git a/application/platforms/application_headless.cpp b/application/platforms/application_headless.cpp
index d62e9b2e8..1b8083906 100644
--- a/application/platforms/application_headless.cpp
+++ b/application/platforms/application_headless.cpp
@@ -248,19 +248,10 @@ struct WSIPlatformHeadless : Granite::GraniteWSIPlatform
 	enc_opts.frame_timebase.den = int(frame_rate);
 
 #ifdef HAVE_GRANITE_AUDIO
-#if 1
 	enc_opts.realtime = true;
 	record_stream.reset(Audio::create_default_audio_record_backend("headless", 44100.0f, 2));
 	if (record_stream)
 		encoder.set_audio_record_stream(record_stream.get());
-#else
-	auto *mixer = new Audio::Mixer;
-	auto *audio_dumper = new Audio::DumpBackend(
-			mixer, 48000.0f, 2,
-			unsigned(std::ceil(48000.0f / frame_rate)));
-	Global::install_audio_system(audio_dumper, mixer);
-	encoder.set_audio_source(audio_dumper);
-#endif
 #endif
 
 	if (!encoder.init(&app->get_wsi().get_device(), video_encode_path.c_str(), enc_opts))
@@ -284,7 +275,9 @@ struct WSIPlatformHeadless : Granite::GraniteWSIPlatform
 	}
 #endif
 
+#ifdef HAVE_GRANITE_AUDIO
 	record_stream->start();
+#endif
 }
 #endif
diff --git a/application/platforms/application_sdl3.cpp b/application/platforms/application_sdl3.cpp
index de926e9f0..4e0e6a3a4 100644
--- a/application/platforms/application_sdl3.cpp
+++ b/application/platforms/application_sdl3.cpp
@@ -40,6 +40,10 @@
 #include
 #endif
 
+#ifdef __linux__
+#include <dlfcn.h>
+#endif
+
 namespace Granite
 {
 static Key sdl_key_to_granite_key(SDL_Keycode key)
 {
@@ -104,6 +108,17 @@ struct WSIPlatformSDL : GraniteWSIPlatform
 		if (options.override_height)
 			height = options.override_height;
 
+#ifdef __linux__
+		// RenderDoc doesn't support Wayland, and SDL3 uses Wayland by default.
+		// Opt in to X11 to avoid having to manually remember to pass down SDL_VIDEO_DRIVER=x11.
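+		// dlopen with RTLD_NOLOAD never loads the library; it only returns a
+		// handle if librenderdoc.so is already resident in the process, which
+		// is the case when running under RenderDoc.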
+ void *renderdoc_module = dlopen("librenderdoc.so", RTLD_NOW | RTLD_NOLOAD); + if (renderdoc_module) + { + LOGI("RenderDoc is loaded, disabling Wayland.\n"); + setenv("SDL_VIDEO_DRIVER", "x11", 0); + } +#endif + if (SDL_Init(SDL_INIT_EVENTS | SDL_INIT_GAMEPAD | SDL_INIT_VIDEO) < 0) { LOGE("Failed to init SDL.\n"); diff --git a/assets/shaders/decode/meshlet_decode.comp b/assets/shaders/decode/meshlet_decode.comp new file mode 100644 index 000000000..8056519ac --- /dev/null +++ b/assets/shaders/decode/meshlet_decode.comp @@ -0,0 +1,219 @@ +#version 450 + +#extension GL_EXT_scalar_block_layout : require +#include "../inc/meshlet_payload_constants.h" + +#define MESHLET_PAYLOAD_LARGE_WORKGROUP 1 + +#if MESHLET_PAYLOAD_LARGE_WORKGROUP +#define MESHLET_PAYLOAD_WG_Y MESHLET_PAYLOAD_NUM_CHUNKS +#else +#define MESHLET_PAYLOAD_WG_Y 1 +#endif +layout(local_size_x = 32, local_size_y = MESHLET_PAYLOAD_WG_Y) in; + +layout(constant_id = 0) const uint NUM_U32_STREAMS = MESHLET_PAYLOAD_MAX_STREAMS; +layout(constant_id = 1) const uint NUM_OUTPUT_U32_STREAMS = 1; +layout(constant_id = 2) const bool RAW_PAYLOAD = false; +#define MESHLET_PAYLOAD_NUM_U32_STREAMS NUM_U32_STREAMS +#define MESHLET_PAYLOAD_DESCRIPTOR_SET 0 +#define MESHLET_PAYLOAD_META_BINDING 0 +#define MESHLET_PAYLOAD_STREAM_BINDING 1 +#define MESHLET_PAYLOAD_PAYLOAD_BINDING 2 +#include "../inc/meshlet_payload_decode.h" +#include "../inc/meshlet_attribute_decode.h" + +const int MESH_STYLE = int(NUM_OUTPUT_U32_STREAMS); +const int MESH_STYLE_WIREFRAME = 0; +const int MESH_STYLE_UNTEXTURED = 1; +const int MESH_STYLE_TEXTURED = 2; +const int MESH_STYLE_SKINNED = 3; + +layout(set = 0, binding = 3, scalar) writeonly buffer OutputIndices +{ + uvec3 data[]; +} output_indices32; + +layout(set = 0, binding = 3, scalar) writeonly buffer OutputIndices8 +{ + u8vec3 data[]; +} output_indices8; + +layout(set = 0, binding = 4, std430) writeonly buffer OutputStream0 +{ + uint data[]; +} output_stream_raw; + +layout(set = 0, binding = 4, scalar) writeonly buffer OutputStreamPos +{ + vec3 data[]; +} output_stream_pos; + +struct UntexturedAttr +{ + uint normal; +}; + +layout(set = 0, binding = 5, std430) writeonly buffer OutputStreamUntextured +{ + UntexturedAttr data[]; +} output_stream_untextured_attr; + +struct TexturedAttr +{ + uint normal; + uint tangent; + vec2 uv; +}; + +layout(set = 0, binding = 5, std430) writeonly buffer OutputStreamTextured +{ + TexturedAttr data[]; +} output_stream_textured_attr; + +layout(set = 0, binding = 6, std430) writeonly buffer OutputStreamSkin +{ + uvec2 data[]; +} output_stream_skin; + +layout(set = 0, binding = 7, std430) readonly buffer OutputOffsets +{ + uvec2 data[]; +} output_offset_strides; + +struct IndirectIndexedDraw +{ + uint indexCount; + uint instanceCount; + uint firstIndex; + uint vertexOffset; + uint firstInstance; +}; + +layout(set = 0, binding = 8, std430) writeonly buffer IndirectCommands +{ + IndirectIndexedDraw draws[]; +} indirect_commands; + +layout(push_constant, std430) uniform Registers +{ + uint primitive_offset; + uint vertex_offset; + uint meshlet_offset; +} registers; + +uint pack_a2bgr10(vec4 v) +{ + ivec4 quantized = ivec4(round(clamp(v, vec4(-1.0), vec4(1.0)) * vec4(511.0, 511.0, 511.0, 1.0))) & ivec4(1023, 1023, 1023, 3); + return (quantized.a << 30) | (quantized.b << 20) | (quantized.g << 10) | (quantized.r << 0); +} + +void main() +{ + uint meshlet_index = gl_WorkGroupID.x; + meshlet_init_workgroup(meshlet_index * NUM_U32_STREAMS); + MeshletMetaRaw meta = 
meshlet_metas_raw.data[meshlet_index]; + + if (!RAW_PAYLOAD) + { + IndirectIndexedDraw draw; + draw.indexCount = 3 * (meta.num_primitives_minus_1 + 1); + draw.instanceCount = 1; + draw.vertexOffset = meta.base_vertex_offset + registers.vertex_offset; + draw.firstIndex = 3 * (output_offset_strides.data[meshlet_index].x + registers.primitive_offset); + draw.firstInstance = 0; + indirect_commands.draws[meshlet_index + registers.meshlet_offset] = draw; + } + +#define INDEX(linear_index, packed_indices) { \ + uint output_offset; \ + if (RAW_PAYLOAD) { \ + uvec3 indices = uvec4(unpack8(packed_indices)).xyz; \ + indices += meta.base_vertex_offset + registers.vertex_offset; \ + output_offset = output_offset_strides.data[meshlet_index * NUM_OUTPUT_U32_STREAMS].x; \ + output_offset += registers.primitive_offset; \ + if (linear_index <= uint(meta.num_primitives_minus_1)) \ + output_indices32.data[output_offset + linear_index] = indices; \ + } else { \ + output_offset = output_offset_strides.data[meshlet_index].x; \ + output_offset += registers.primitive_offset; \ + if (linear_index <= uint(meta.num_primitives_minus_1)) \ + output_indices8.data[output_offset + linear_index] = unpack8(packed_indices).xyz; \ + } \ +} + + { + MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, 0, INDEX); + } + + if (RAW_PAYLOAD) + { +#define ATTR(linear_index, packed_decoded) { \ + uvec2 output_offset_stride0 = output_offset_strides.data[meshlet_index * NUM_OUTPUT_U32_STREAMS + i]; \ + output_offset_stride0.x += registers.vertex_offset; \ + if (linear_index <= uint(meta.num_attributes_minus_1)) \ + output_stream_raw.data[output_offset_stride0.x + linear_index * output_offset_stride0.y] = packed_decoded; \ +} + + for (uint i = 1; i < NUM_OUTPUT_U32_STREAMS; i++) + { + MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, i, ATTR); + } + } + else + { + uint output_offset = output_offset_strides.data[meshlet_index].y; + output_offset += registers.vertex_offset; + +#define POS(linear_index, packed_decoded) { \ + if (linear_index <= uint(meta.num_attributes_minus_1)) \ + output_stream_pos.data[output_offset + linear_index] = attribute_decode_snorm_exp_position(packed_decoded); \ +} + +#define NORMAL(linear_index, packed_decoded) { \ + if (linear_index <= uint(meta.num_attributes_minus_1)) { \ + if (MESH_STYLE >= MESH_STYLE_TEXTURED) \ + output_stream_textured_attr.data[output_offset + linear_index].normal = pack_a2bgr10(attribute_decode_oct8_normal_tangent(packed_decoded)); \ + else \ + output_stream_untextured_attr.data[output_offset + linear_index].normal = pack_a2bgr10(attribute_decode_oct8_normal_tangent(packed_decoded)); \ + } \ +} + +#define TANGENT(linear_index, packed_decoded) { \ + if (linear_index <= uint(meta.num_attributes_minus_1)) { \ + output_stream_textured_attr.data[output_offset + linear_index].tangent = pack_a2bgr10(attribute_decode_oct8_normal_tangent(packed_decoded)); \ + } \ +} + +#define UV(linear_index, packed_decoded) { \ + if (linear_index <= uint(meta.num_attributes_minus_1)) { \ + output_stream_textured_attr.data[output_offset + linear_index].uv = attribute_decode_snorm_exp_uv(packed_decoded); \ + } \ +} + +#define SKIN(linear_index, packed_decoded) { \ + if (linear_index <= uint(meta.num_attributes_minus_1)) { \ + output_stream_skin.data[output_offset + linear_index] = packed_decoded; \ + } \ +} + { + MESHLET_DECODE_STREAM_64(meshlet_index * NUM_U32_STREAMS, 1, POS); + } + + if (MESH_STYLE >= MESH_STYLE_UNTEXTURED) + { + MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, 3, 
NORMAL); + } + + if (MESH_STYLE >= MESH_STYLE_TEXTURED) + { + MESHLET_DECODE_STREAM_32(meshlet_index * NUM_U32_STREAMS, 4, TANGENT); + MESHLET_DECODE_STREAM_64(meshlet_index * NUM_U32_STREAMS, 5, UV); + } + + if (MESH_STYLE >= MESH_STYLE_SKINNED) + { + MESHLET_DECODE_STREAM_64(meshlet_index * NUM_U32_STREAMS, 7, SKIN); + } + } +} diff --git a/assets/shaders/inc/meshlet_attribute_decode.h b/assets/shaders/inc/meshlet_attribute_decode.h new file mode 100644 index 000000000..51a05bc05 --- /dev/null +++ b/assets/shaders/inc/meshlet_attribute_decode.h @@ -0,0 +1,39 @@ +#ifndef MESHLET_ATTRIBUTE_DECODE_H_ +#define MESHLET_ATTRIBUTE_DECODE_H_ + +vec3 attribute_decode_snorm_exp_position(uvec2 payload) +{ + ivec3 sint_value = ivec3( + bitfieldExtract(int(payload.x), 0, 16), + bitfieldExtract(int(payload.x), 16, 16), + bitfieldExtract(int(payload.y), 0, 16)); + int exp = bitfieldExtract(int(payload.y), 16, 16); + return vec3( + ldexp(float(sint_value.x), exp), + ldexp(float(sint_value.y), exp), + ldexp(float(sint_value.z), exp)); +} + +vec2 attribute_decode_snorm_exp_uv(uvec2 payload) +{ + ivec2 sint_value = ivec2( + bitfieldExtract(int(payload.x), 0, 16), + bitfieldExtract(int(payload.x), 16, 16)); + int exp = bitfieldExtract(int(payload.y), 0, 16); + return vec2( + ldexp(float(sint_value.x), exp), + ldexp(float(sint_value.y), exp)) + 0.5; +} + +// Adapted from: https://knarkowicz.wordpress.com/2014/04/16/octahedron-normal-vector-encoding/ +// https://twitter.com/Stubbesaurus/status/9379947905532272640 +mediump vec4 attribute_decode_oct8_normal_tangent(uint payload) +{ + mediump vec4 f = unpackSnorm4x8(payload); + mediump vec3 n = vec3(f.x, f.y, 1.0 - abs(f.x) - abs(f.y)); + mediump float t = max(-n.z, 0.0); + n.xy += mix(vec2(t), vec2(-t), greaterThanEqual(n.xy, vec2(0.0))); + return vec4(normalize(n), f.w != 0.0 ? 
-1.0 : 1.0); +} + +#endif \ No newline at end of file diff --git a/assets/shaders/inc/meshlet_payload_constants.h b/assets/shaders/inc/meshlet_payload_constants.h new file mode 100644 index 000000000..2a91ff531 --- /dev/null +++ b/assets/shaders/inc/meshlet_payload_constants.h @@ -0,0 +1,8 @@ +#ifndef MESHLET_PAYLOAD_CONSTANTS_H_ +#define MESHLET_PAYLOAD_CONSTANTS_H_ + +#define MESHLET_PAYLOAD_MAX_ELEMENTS 256 +#define MESHLET_PAYLOAD_NUM_CHUNKS 8 +#define MESHLET_PAYLOAD_MAX_STREAMS 16 + +#endif \ No newline at end of file diff --git a/assets/shaders/inc/meshlet_payload_decode.h b/assets/shaders/inc/meshlet_payload_decode.h new file mode 100644 index 000000000..0673e3a32 --- /dev/null +++ b/assets/shaders/inc/meshlet_payload_decode.h @@ -0,0 +1,305 @@ +#ifndef MESHLET_PAYLOAD_DECODE_H_ +#define MESHLET_PAYLOAD_DECODE_H_ + +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_shader_subgroup_extended_types_int8 : require + +#include "meshlet_payload_constants.h" + +#ifndef MESHLET_PAYLOAD_NUM_U32_STREAMS +#error "Must define MESHLET_PAYLOAD_NUM_U32_STREAMS before including meshlet_payload_decode.h" +#endif + +#ifndef MESHLET_PAYLOAD_LARGE_WORKGROUP +#error "Must define MESHLET_PAYLOAD_LARGE_WORKGROUP" +#endif + +#ifndef MESHLET_PAYLOAD_DESCRIPTOR_SET +#error "Must define MESHLET_PAYLOAD_DESCRIPTOR_SET" +#endif + +#ifndef MESHLET_PAYLOAD_META_BINDING +#error "Must define MESHLET_PAYLOAD_META_BINDING" +#endif + +#ifndef MESHLET_PAYLOAD_STREAM_BINDING +#error "Must define MESHLET_PAYLOAD_STREAM_BINDING" +#endif + +#ifndef MESHLET_PAYLOAD_PAYLOAD_BINDING +#error "Must define MESHLET_PAYLOAD_PAYLOAD_BINDING" +#endif + +struct MeshletStream +{ + u16vec4 predictor_a; + u16vec4 predictor_b; + u8vec4 initial_value; + uint offset_from_base; + uint16_t bitplane_meta[MESHLET_PAYLOAD_NUM_CHUNKS]; +}; + +struct MeshletMetaRaw +{ + uint base_vertex_offset; + uint8_t num_primitives_minus_1; + uint8_t num_attributes_minus_1; + uint16_t reserved; +}; + +struct MeshletMetaRuntime +{ + uint stream_offset; + uint16_t num_primitives; + uint16_t num_attributes; +}; + +layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_META_BINDING, std430) readonly buffer MeshletMetasRaw +{ + MeshletMetaRaw data[]; +} meshlet_metas_raw; + +layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_META_BINDING, std430) readonly buffer MeshletMetasRuntime +{ + MeshletMetaRuntime data[]; +} meshlet_metas_runtime; + +layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_STREAM_BINDING, std430) readonly buffer MeshletStreams +{ + MeshletStream data[]; +} meshlet_streams; + +layout(set = MESHLET_PAYLOAD_DESCRIPTOR_SET, binding = MESHLET_PAYLOAD_PAYLOAD_BINDING, std430) readonly buffer Payload +{ + uint data[]; +} payload; + +#if MESHLET_PAYLOAD_LARGE_WORKGROUP +shared u8vec4 shared_chunk_bit_counts[MESHLET_PAYLOAD_NUM_U32_STREAMS][MESHLET_PAYLOAD_NUM_CHUNKS]; +shared uint shared_chunk_offset[MESHLET_PAYLOAD_NUM_U32_STREAMS][MESHLET_PAYLOAD_NUM_CHUNKS]; +shared uvec2 chunk_values0[MESHLET_PAYLOAD_NUM_CHUNKS]; +shared uvec2 chunk_values1[MESHLET_PAYLOAD_NUM_CHUNKS]; +#endif + +// Hardcodes wave32 atm. Need fallback. 
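+// How a u32 stream is encoded, as consumed by the macros below:
+// 256 values are split into 8 chunks of 32. Each value is treated as four
+// byte lanes; per chunk, bitplane_meta packs a 4-bit bit count per lane, and
+// the payload stores that many 32-bit bitplanes (one bit per element).
+// The decoded value is a delta: a fixed-point 8.8 linear predictor
+// ((predictor_a + predictor_b * index) >> 8) is added, initial_value seeds
+// element 0, and a subgroup inclusive add integrates the sequence.
+// Reference decode of one lane of one chunk (illustrative sketch only):
+//   uint v = 0;
+//   for (int i = 0; i < bit_count; i++)
+//       v |= bitfieldExtract(payload[offset + i], lane, 1) << i;
+//   int delta = bitfieldExtract(int(v), 0, bit_count); // sign-extend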
+ +uvec2 pack_u16vec4_to_uvec2(u16vec4 v) +{ + return uvec2(pack32(v.xy), pack32(v.zw)); +} + +uint repack_uint(uvec2 v) +{ + u16vec4 v16 = u16vec4(unpack16(v.x), unpack16(v.y)); + return pack32(u8vec4(v16)); +} + +void meshlet_compute_stream_offsets(uint stream_index, + out uint out_stream_chunk_offset, out u8vec4 out_bit_counts) +{ + if (gl_SubgroupInvocationID < MESHLET_PAYLOAD_NUM_CHUNKS) + { + uint bitplane_value = uint(meshlet_streams.data[stream_index].bitplane_meta[gl_SubgroupInvocationID]); + u16vec4 bit_counts = (u16vec4(bitplane_value) >> u16vec4(0, 4, 8, 12)) & 0xfus; + u16vec2 bit_counts2 = bit_counts.xy + bit_counts.zw; + uint total_bits = bit_counts2.x + bit_counts2.y; + uint offset = meshlet_streams.data[stream_index].offset_from_base; + out_stream_chunk_offset = subgroupExclusiveAdd(total_bits) + offset; + out_bit_counts = u8vec4(bit_counts); + } +} + +void meshlet_init_workgroup(uint base_stream_index) +{ +#if MESHLET_PAYLOAD_LARGE_WORKGROUP + + for (uint stream_index = gl_SubgroupID; stream_index < MESHLET_PAYLOAD_NUM_U32_STREAMS; stream_index += gl_NumSubgroups) + { + if (gl_SubgroupInvocationID < MESHLET_PAYLOAD_NUM_CHUNKS) + { + // Start by decoding the offset for bitplanes for all u32 streams. + meshlet_compute_stream_offsets(base_stream_index + stream_index, + shared_chunk_offset[stream_index][gl_SubgroupInvocationID], + shared_chunk_bit_counts[stream_index][gl_SubgroupInvocationID]); + } + } + + barrier(); +#endif +} + +uint meshlet_get_linear_index() +{ +#if MESHLET_PAYLOAD_LARGE_WORKGROUP + // Rely on SubgroupInvocationID == LocalInvocationID.x here. + return gl_WorkGroupSize.x * gl_LocalInvocationID.y + gl_SubgroupInvocationID; +#else + return gl_SubgroupInvocationID; +#endif +} + +// Overlap load with consumption. +// Helps RDNA2 quite a lot here! +#define MESHLET_FETCH_BITPLANES(decoded_value, counts, payload_value, offset) \ + for (int i = 0; i < counts; i++) \ + { \ + decoded_value |= bitfieldExtract(payload_value, int(gl_SubgroupInvocationID), 1) << i; \ + payload_value = payload.data[++offset]; \ + } \ + decoded_value = bitfieldExtract(int(decoded_value), 0, counts) + +// Add some specialized variants. 
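+// The ##iter suffix lets one macro expansion declare an independent set of
+// temporaries per stream, so the 64-bit path can "dual-pump" two streams
+// through the same code without needing indexable temporaries.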
+ +#define MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, iter) \ + u16vec4 predictor_a##iter = meshlet_streams.data[unrolled_stream_index].predictor_a; \ + u16vec4 predictor_b##iter = meshlet_streams.data[unrolled_stream_index].predictor_b; \ + u8vec4 initial_value_##iter = meshlet_streams.data[unrolled_stream_index].initial_value; \ + uvec2 initial_value##iter = pack_u16vec4_to_uvec2(u16vec4(initial_value_##iter)) + +#if MESHLET_PAYLOAD_LARGE_WORKGROUP +#define MESHLET_PAYLOAD_DECL_CHUNK_OFFSETS(stream_index, chunk_id, iter) \ + uint bitplane_offsets##iter = shared_chunk_offset[stream_index][chunk_id]; \ + ivec4 bit_counts##iter = ivec4(shared_chunk_bit_counts[stream_index][chunk_id]) +#else +#define MESHLET_PAYLOAD_DECL_CHUNK_OFFSETS(stream_index, chunk_id, iter) \ + uint bitplane_offsets##iter = subgroupShuffle(shared_chunk_offset##iter, chunk_id); \ + ivec4 bit_counts##iter = ivec4(subgroupShuffle(shared_chunk_bit_counts##iter, chunk_id)) +#endif + +#define MESHLET_PAYLOAD_PROCESS_CHUNK(stream_index, chunk_id, iter) \ + uvec4 decoded##iter = ivec4(0); \ + MESHLET_PAYLOAD_DECL_CHUNK_OFFSETS(stream_index, chunk_id, iter); \ + uint value##iter = payload.data[bitplane_offsets##iter]; \ + MESHLET_FETCH_BITPLANES(decoded##iter.x, bit_counts##iter.x, value##iter, bitplane_offsets##iter); \ + MESHLET_FETCH_BITPLANES(decoded##iter.y, bit_counts##iter.y, value##iter, bitplane_offsets##iter); \ + MESHLET_FETCH_BITPLANES(decoded##iter.z, bit_counts##iter.z, value##iter, bitplane_offsets##iter); \ + MESHLET_FETCH_BITPLANES(decoded##iter.w, bit_counts##iter.w, value##iter, bitplane_offsets##iter); \ + uvec2 packed_decoded##iter = pack_u16vec4_to_uvec2(u16vec4(decoded##iter)) & 0xff00ffu; \ + if (linear_index == 0) \ + packed_decoded##iter += initial_value##iter; \ + packed_decoded##iter += pack_u16vec4_to_uvec2((predictor_a##iter + predictor_b##iter * uint16_t(linear_index)) >> 8us); \ + packed_decoded##iter = subgroupInclusiveAdd(packed_decoded##iter) + +#if MESHLET_PAYLOAD_LARGE_WORKGROUP +uint meshlet_decode_stream_32_wg256(uint base_stream_index, uint stream_index) +{ + uint unrolled_stream_index = base_stream_index + stream_index; + uint linear_index = meshlet_get_linear_index(); + uint chunk_id = gl_LocalInvocationID.y; + + MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); + MESHLET_PAYLOAD_PROCESS_CHUNK(stream_index, chunk_id, 0); + + barrier(); // Resolve WAR hazard from last iteration. + if (gl_SubgroupInvocationID == MESHLET_PAYLOAD_MAX_ELEMENTS / MESHLET_PAYLOAD_NUM_CHUNKS - 1) + chunk_values0[chunk_id] = packed_decoded0 & 0xff00ffu; + barrier(); + if (gl_SubgroupID == 0u && gl_SubgroupInvocationID < gl_WorkGroupSize.y) + chunk_values0[gl_SubgroupInvocationID] = subgroupInclusiveAdd(chunk_values0[gl_SubgroupInvocationID]); + barrier(); + if (chunk_id != 0) + packed_decoded0 += chunk_values0[chunk_id - 1]; + + return repack_uint(packed_decoded0); +} + +uvec2 meshlet_decode_stream_64_wg256(uint base_stream_index, uint stream_index) +{ + // Dual-pump the computation. VGPR use is quite low either way, so this is fine. + uint unrolled_stream_index = base_stream_index + stream_index; + uint linear_index = meshlet_get_linear_index(); + uint chunk_id = gl_LocalInvocationID.y; + + MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); + MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index + 1, 1); + MESHLET_PAYLOAD_PROCESS_CHUNK(stream_index, chunk_id, 0); + MESHLET_PAYLOAD_PROCESS_CHUNK(stream_index + 1, chunk_id, 1); + + barrier(); // Resolve WAR hazard from last iteration. 
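+	// Two-level scan to stitch chunks together: the last lane of each chunk
+	// publishes its inclusive total to LDS, one subgroup prefix-sums the
+	// chunk totals, and every chunk but the first adds the previous carry.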
+	if (gl_SubgroupInvocationID == gl_SubgroupSize - 1)
+	{
+		chunk_values0[chunk_id] = packed_decoded0 & 0xff00ffu;
+		chunk_values1[chunk_id] = packed_decoded1 & 0xff00ffu;
+	}
+	barrier();
+	if (gl_SubgroupID == 0u && gl_SubgroupInvocationID < gl_WorkGroupSize.y)
+		chunk_values0[gl_SubgroupInvocationID] = subgroupInclusiveAdd(chunk_values0[gl_SubgroupInvocationID]);
+	else if (gl_SubgroupID == 1u && gl_SubgroupInvocationID < gl_WorkGroupSize.y)
+		chunk_values1[gl_SubgroupInvocationID] = subgroupInclusiveAdd(chunk_values1[gl_SubgroupInvocationID]);
+	barrier();
+	if (chunk_id != 0)
+	{
+		packed_decoded0 += chunk_values0[chunk_id - 1];
+		packed_decoded1 += chunk_values1[chunk_id - 1];
+	}
+
+	return uvec2(repack_uint(packed_decoded0), repack_uint(packed_decoded1));
+}
+
+// For large workgroups, we assume AMD, where LocalInvocationIndex indexing is preferred.
+// We assume that SubgroupInvocationID == LocalInvocationID.x here since it's the only reasonable way this could work.
+#define MESHLET_DECODE_STREAM_32(meshlet_index, stream_index, report_cb) { \
+	uint value = meshlet_decode_stream_32_wg256(meshlet_index, stream_index); \
+	report_cb(gl_LocalInvocationIndex, value); }
+
+#define MESHLET_DECODE_STREAM_64(meshlet_index, stream_index, report_cb) { \
+	uvec2 value = meshlet_decode_stream_64_wg256(meshlet_index, stream_index); \
+	report_cb(gl_LocalInvocationIndex, value); }
+
+#else
+
+// Have to iterate and report once per chunk. Avoids having to spend a lot of LDS memory.
+#define MESHLET_DECODE_STREAM_32(base_stream_index, stream_index, report_cb) { \
+	uint unrolled_stream_index = base_stream_index + stream_index; \
+	uint linear_index = meshlet_get_linear_index(); \
+	uvec2 prev_value0 = uvec2(0); \
+	uint shared_chunk_offset0; \
+	u8vec4 shared_chunk_bit_counts0; \
+	meshlet_compute_stream_offsets(unrolled_stream_index, shared_chunk_offset0, shared_chunk_bit_counts0); \
+	MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); \
+	for (uint chunk_id = 0; chunk_id < MESHLET_PAYLOAD_NUM_CHUNKS; chunk_id++) \
+	{ \
+		MESHLET_PAYLOAD_PROCESS_CHUNK(stream_index, chunk_id, 0); \
+		packed_decoded0 += prev_value0; \
+		prev_value0 = subgroupBroadcast(packed_decoded0, 31) & 0xff00ffu; \
+		report_cb(linear_index, repack_uint(packed_decoded0)); \
+		linear_index += gl_SubgroupSize; \
+	} \
+}
+
+// Have to iterate and report once per chunk. Avoids having to spend a lot of LDS memory.
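+// Here the inter-chunk carry stays in registers instead: lane 31's inclusive
+// sum is broadcast and added into the next chunk's values (prev_value0/1).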
+#define MESHLET_DECODE_STREAM_64(base_stream_index, stream_index, report_cb) { \
+	uint unrolled_stream_index = base_stream_index + stream_index; \
+	uint linear_index = meshlet_get_linear_index(); \
+	uvec2 prev_value0 = uvec2(0); \
+	uvec2 prev_value1 = uvec2(0); \
+	uint shared_chunk_offset0; \
+	u8vec4 shared_chunk_bit_counts0; \
+	meshlet_compute_stream_offsets(unrolled_stream_index, shared_chunk_offset0, shared_chunk_bit_counts0); \
+	uint shared_chunk_offset1; \
+	u8vec4 shared_chunk_bit_counts1; \
+	meshlet_compute_stream_offsets(unrolled_stream_index + 1, shared_chunk_offset1, shared_chunk_bit_counts1); \
+	MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index, 0); \
+	MESHLET_PAYLOAD_DECL_STREAM(unrolled_stream_index + 1, 1); \
+	for (uint chunk_id = 0; chunk_id < MESHLET_PAYLOAD_NUM_CHUNKS; chunk_id++) \
+	{ \
+		MESHLET_PAYLOAD_PROCESS_CHUNK(stream_index, chunk_id, 0); \
+		MESHLET_PAYLOAD_PROCESS_CHUNK(stream_index + 1, chunk_id, 1); \
+		packed_decoded0 += prev_value0; \
+		packed_decoded1 += prev_value1; \
+		prev_value0 = subgroupBroadcast(packed_decoded0, 31) & 0xff00ffu; \
+		prev_value1 = subgroupBroadcast(packed_decoded1, 31) & 0xff00ffu; \
+		report_cb(linear_index, uvec2(repack_uint(packed_decoded0), repack_uint(packed_decoded1))); \
+		linear_index += gl_SubgroupSize; \
+	} \
+}
+
+#endif
+
+#endif
\ No newline at end of file
diff --git a/filesystem/asset_manager.cpp b/filesystem/asset_manager.cpp
index c49d8a52e..6e3b1f1b8 100644
--- a/filesystem/asset_manager.cpp
+++ b/filesystem/asset_manager.cpp
@@ -29,6 +29,8 @@ namespace Granite
 {
 AssetManager::AssetManager()
 {
+	asset_bank.reserve(AssetID::MaxIDs);
+	sorted_assets.reserve(AssetID::MaxIDs);
 	signal = std::make_unique<TaskSignal>();
 	for (uint64_t i = 0; i < timestamp; i++)
 		signal->signal_increment();
@@ -36,40 +38,40 @@ AssetManager::AssetManager()
 
 AssetManager::~AssetManager()
 {
+	set_asset_instantiator_interface(nullptr);
 	signal->wait_until_at_least(timestamp);
-	for (auto *a : asset_bank)
-		pool.free(a);
+	for (uint32_t i = 0; i < id_count; i++)
+		pool.free(asset_bank[i]);
 }
 
-ImageAssetID AssetManager::register_image_resource_nolock(FileHandle file, ImageClass image_class, int prio)
+AssetID AssetManager::register_asset_nolock(FileHandle file, AssetClass asset_class, int prio)
 {
 	auto *info = pool.allocate();
 	info->handle = std::move(file);
-	info->id.id = id_count++;
+	info->id.id = id_count;
 	info->prio = prio;
-	info->image_class = image_class;
-	ImageAssetID ret = info->id;
-	asset_bank.push_back(info);
-	sorted_assets.reserve(asset_bank.size());
+	info->asset_class = asset_class;
+	AssetID ret = info->id;
+	asset_bank[id_count++] = info;
 
 	if (iface)
 	{
 		iface->set_id_bounds(id_count);
-		iface->set_image_class(info->id, image_class);
+		iface->set_asset_class(info->id, asset_class);
 	}
 
 	return ret;
 }
 
-void AssetInstantiatorInterface::set_image_class(ImageAssetID, ImageClass)
+void AssetInstantiatorInterface::set_asset_class(AssetID, AssetClass)
 {
 }
 
-ImageAssetID AssetManager::register_image_resource(FileHandle file, ImageClass image_class, int prio)
+AssetID AssetManager::register_asset(FileHandle file, AssetClass asset_class, int prio)
 {
 	std::lock_guard holder{asset_bank_lock};
-	return register_image_resource_nolock(std::move(file), image_class, prio);
+	return register_asset_nolock(std::move(file), asset_class, prio);
 }
 
-ImageAssetID AssetManager::register_image_resource(Filesystem &fs, const std::string &path, ImageClass image_class, int prio)
+AssetID AssetManager::register_asset(Filesystem &fs, const std::string &path, AssetClass
asset_class, int prio) { std::lock_guard holder{asset_bank_lock}; @@ -82,13 +84,13 @@ ImageAssetID AssetManager::register_image_resource(Filesystem &fs, const std::st if (!file) return {}; - auto id = register_image_resource_nolock(std::move(file), image_class, prio); + auto id = register_asset_nolock(std::move(file), asset_class, prio); asset_bank[id.id]->set_hash(h.get()); file_to_assets.insert_replace(asset_bank[id.id]); return id; } -void AssetManager::update_cost(ImageAssetID id, uint64_t cost) +void AssetManager::update_cost(AssetID id, uint64_t cost) { std::lock_guard holder{cost_update_lock}; thread_cost_updates.push_back({ id, cost }); @@ -100,11 +102,12 @@ void AssetManager::set_asset_instantiator_interface(AssetInstantiatorInterface * { signal->wait_until_at_least(timestamp); for (uint32_t id = 0; id < id_count; id++) - iface->release_image_resource({ id }); + iface->release_asset(AssetID{id}); } - for (auto *a : asset_bank) + for (uint32_t i = 0; i < id_count; i++) { + auto *a = asset_bank[i]; a->consumed = 0; a->pending_consumed = 0; a->last_used = 0; @@ -116,29 +119,29 @@ void AssetManager::set_asset_instantiator_interface(AssetInstantiatorInterface * { iface->set_id_bounds(id_count); for (uint32_t i = 0; i < id_count; i++) - iface->set_image_class({ i }, asset_bank[i]->image_class); + iface->set_asset_class(AssetID{i}, asset_bank[i]->asset_class); } } -void AssetManager::mark_used_resource(ImageAssetID id) +void AssetManager::mark_used_asset(AssetID id) { lru_append.push(id); } -void AssetManager::set_image_budget(uint64_t cost) +void AssetManager::set_asset_budget(uint64_t cost) { - image_budget = cost; + transfer_budget = cost; } -void AssetManager::set_image_budget_per_iteration(uint64_t cost) +void AssetManager::set_asset_budget_per_iteration(uint64_t cost) { - image_budget_per_iteration = cost; + transfer_budget_per_iteration = cost; } -bool AssetManager::set_image_residency_priority(ImageAssetID id, int prio) +bool AssetManager::set_asset_residency_priority(AssetID id, int prio) { std::lock_guard holder{asset_bank_lock}; - if (id.id >= asset_bank.size()) + if (id.id >= id_count) return false; asset_bank[id.id]->prio = prio; return true; @@ -146,7 +149,7 @@ bool AssetManager::set_image_residency_priority(ImageAssetID id, int prio) void AssetManager::adjust_update(const CostUpdate &update) { - if (update.id.id < asset_bank.size()) + if (update.id.id < id_count) { auto *a = asset_bank[update.id.id]; total_consumed += update.cost - (a->consumed + a->pending_consumed); @@ -178,15 +181,15 @@ void AssetManager::update_costs_locked_assets() void AssetManager::update_lru_locked_assets() { - lru_append.for_each_ranged([this](const ImageAssetID *id, size_t count) { + lru_append.for_each_ranged([this](const AssetID *id, size_t count) { for (size_t i = 0; i < count; i++) - if (id[i].id < asset_bank.size()) + if (id[i].id < id_count) asset_bank[id[i].id]->last_used = timestamp; }); lru_append.clear(); } -bool AssetManager::iterate_blocking(ThreadGroup &group, ImageAssetID id) +bool AssetManager::iterate_blocking(ThreadGroup &group, AssetID id) { if (!iface) return false; @@ -202,12 +205,12 @@ bool AssetManager::iterate_blocking(ThreadGroup &group, ImageAssetID id) if (candidate->consumed != 0 || candidate->pending_consumed != 0) return true; - uint64_t estimate = iface->estimate_cost_image_resource(candidate->id, *candidate->handle); + uint64_t estimate = iface->estimate_cost_asset(candidate->id, *candidate->handle); auto task = group.create_task(); 
task->set_task_class(TaskClass::Background); task->set_fence_counter_signal(signal.get()); task->set_desc("asset-manager-instantiate-single"); - iface->instantiate_image_resource(*this, task.get(), candidate->id, *candidate->handle); + iface->instantiate_asset(*this, task.get(), candidate->id, *candidate->handle); candidate->pending_consumed = estimate; candidate->last_used = timestamp; total_consumed += estimate; @@ -251,8 +254,8 @@ void AssetManager::iterate(ThreadGroup *group) update_costs_locked_assets(); update_lru_locked_assets(); - sorted_assets = asset_bank; - std::sort(sorted_assets.begin(), sorted_assets.end(), [](const AssetInfo *a, const AssetInfo *b) -> bool { + memcpy(sorted_assets.data(), asset_bank.data(), id_count * sizeof(sorted_assets[0])); + std::sort(sorted_assets.data(), sorted_assets.data() + id_count, [](const AssetInfo *a, const AssetInfo *b) -> bool { // High prios come first since they will be activated. // Then we sort by LRU. // High consumption should be moved last, so they are candidates to be paged out if we're over budget. @@ -272,7 +275,7 @@ void AssetManager::iterate(ThreadGroup *group) return a->id.id < b->id.id; }); - size_t release_index = sorted_assets.size(); + size_t release_index = id_count; uint64_t activated_cost_this_iteration = 0; unsigned activation_count = 0; size_t activate_index = 0; @@ -281,8 +284,8 @@ void AssetManager::iterate(ThreadGroup *group) // Activate in order from highest priority to lowest. bool can_activate = true; while (can_activate && - total_consumed < image_budget && - activated_cost_this_iteration < image_budget_per_iteration && + total_consumed < transfer_budget && + activated_cost_this_iteration < transfer_budget_per_iteration && activate_index != release_index) { auto *candidate = sorted_assets[activate_index]; @@ -296,26 +299,26 @@ void AssetManager::iterate(ThreadGroup *group) continue; } - uint64_t estimate = iface->estimate_cost_image_resource(candidate->id, *candidate->handle); + uint64_t estimate = iface->estimate_cost_asset(candidate->id, *candidate->handle); - can_activate = (total_consumed + estimate <= image_budget) || (candidate->prio >= persistent_prio()); + can_activate = (total_consumed + estimate <= transfer_budget) || (candidate->prio >= persistent_prio()); while (!can_activate && activate_index + 1 != release_index) { auto *release_candidate = sorted_assets[--release_index]; if (release_candidate->consumed) { LOGI("Releasing ID %u due to page-in pressure.\n", release_candidate->id.id); - iface->release_image_resource(release_candidate->id); + iface->release_asset(release_candidate->id); total_consumed -= release_candidate->consumed; release_candidate->consumed = 0; } - can_activate = total_consumed + estimate <= image_budget; + can_activate = total_consumed + estimate <= transfer_budget; } if (can_activate) { // We're trivially in budget. - iface->instantiate_image_resource(*this, task.get(), candidate->id, *candidate->handle); + iface->instantiate_asset(*this, task.get(), candidate->id, *candidate->handle); activation_count++; candidate->pending_consumed = estimate; @@ -328,7 +331,7 @@ void AssetManager::iterate(ThreadGroup *group) } // If we're 75% of budget, start garbage collecting non-resident resources ahead of time. 
- const uint64_t low_image_budget = (image_budget * 3) / 4; + const uint64_t low_image_budget = (transfer_budget * 3) / 4; const auto should_release = [&]() -> bool { if (release_index == activate_index) @@ -336,7 +339,7 @@ void AssetManager::iterate(ThreadGroup *group) if (sorted_assets[release_index - 1]->prio == persistent_prio()) return false; - if (total_consumed > image_budget) + if (total_consumed > transfer_budget) return true; else if (total_consumed > low_image_budget && sorted_assets[release_index - 1]->prio == 0) return true; @@ -351,7 +354,7 @@ void AssetManager::iterate(ThreadGroup *group) if (candidate->consumed) { LOGI("Releasing 0-prio ID %u due to page-in pressure.\n", candidate->id.id); - iface->release_image_resource(candidate->id); + iface->release_asset(candidate->id); total_consumed -= candidate->consumed; candidate->consumed = 0; candidate->last_used = 0; diff --git a/filesystem/asset_manager.hpp b/filesystem/asset_manager.hpp index 75b541072..6c613d67a 100644 --- a/filesystem/asset_manager.hpp +++ b/filesystem/asset_manager.hpp @@ -26,35 +26,42 @@ #include "filesystem.hpp" #include "object_pool.hpp" #include "intrusive_hash_map.hpp" +#include "dynamic_array.hpp" #include #include #include namespace Granite { -struct ImageAssetID +struct AssetID { uint32_t id = uint32_t(-1); + enum { MaxIDs = 1u << 18 }; + AssetID() = default; + explicit AssetID(uint32_t id_) : id{id_} {} explicit inline operator bool() const { return id != uint32_t(-1); } + inline bool operator==(const AssetID &other) const { return id == other.id; } + inline bool operator!=(const AssetID &other) const { return !(*this == other); } }; class AssetManager; // If we have to fall back due to no image being present, // lets asset instantiator know what to substitute. -enum class ImageClass +enum class AssetClass { // Substitute with 0. - Zeroable, + ImageZeroable, // Substitute with missing color. - Color, + ImageColor, // Substitute with RG8_UNORM 0.5 - Normal, + ImageNormal, // Substitute with M = 0, R = 1. - MetallicRoughness, + ImageMetallicRoughness, // Substitute with mid-gray (0.5, 0.5, 0.5, 1.0) UNORM8. // Somewhat compatible with everything. - Generic + ImageGeneric, + Mesh }; class ThreadGroup; @@ -67,16 +74,17 @@ class AssetInstantiatorInterface virtual ~AssetInstantiatorInterface() = default; // This estimate should be an upper bound. - virtual uint64_t estimate_cost_image_resource(ImageAssetID id, File &mapping) = 0; + virtual uint64_t estimate_cost_asset(AssetID id, File &mapping) = 0; // When instantiation completes, manager.update_cost() must be called with the real cost. // The real cost may only be known after async parsing of the file. - virtual void instantiate_image_resource(AssetManager &manager, TaskGroup *group, ImageAssetID id, File &mapping) = 0; + virtual void instantiate_asset(AssetManager &manager, TaskGroup *group, AssetID id, File &mapping) = 0; // Will only be called after an upload completes through manager.update_cost(). - virtual void release_image_resource(ImageAssetID id) = 0; + virtual void release_asset(AssetID id) = 0; + virtual void set_id_bounds(uint32_t bound) = 0; - virtual void set_image_class(ImageAssetID id, ImageClass image_class); + virtual void set_asset_class(AssetID id, AssetClass asset_class); // Called in AssetManager::iterate(). 
virtual void latch_handles() = 0; @@ -92,24 +100,26 @@ class AssetManager final : public AssetManagerInterface ~AssetManager() override; void set_asset_instantiator_interface(AssetInstantiatorInterface *iface); - void set_image_budget(uint64_t cost); - void set_image_budget_per_iteration(uint64_t cost); + + // We might want to consider different budgets per asset class. + void set_asset_budget(uint64_t cost); + void set_asset_budget_per_iteration(uint64_t cost); // FileHandle is intended to be used with FileSlice or similar here so that we don't need // a ton of open files at once. - ImageAssetID register_image_resource(FileHandle file, ImageClass image_class, int prio = 1); - ImageAssetID register_image_resource(Filesystem &fs, const std::string &path, ImageClass image_class, int prio = 1); + AssetID register_asset(FileHandle file, AssetClass asset_class, int prio = 1); + AssetID register_asset(Filesystem &fs, const std::string &path, AssetClass asset_class, int prio = 1); // Prio 0: Not resident, resource may not exist. - bool set_image_residency_priority(ImageAssetID id, int prio); + bool set_asset_residency_priority(AssetID id, int prio); // Intended to be called in Application::post_frame(). Not thread safe. // This function updates internal state. void iterate(ThreadGroup *group); - bool iterate_blocking(ThreadGroup &group, ImageAssetID id); + bool iterate_blocking(ThreadGroup &group, AssetID id); // Always thread safe, used by AssetInstantiatorInterfaces to update cost estimates. - void update_cost(ImageAssetID id, uint64_t cost); + void update_cost(AssetID id, uint64_t cost); // May be called concurrently, except when calling iterate(). uint64_t get_current_total_consumed() const; @@ -117,7 +127,7 @@ class AssetManager final : public AssetManagerInterface // May be called concurrently, except when calling iterate(). // Intended to be called by asset instantiator interface or similar. // When a resource is actually accessed, this is called. 
-	void mark_used_resource(ImageAssetID id);
+	void mark_used_asset(AssetID id);
 
 private:
 	struct AssetInfo : Util::IntrusiveHashMapEnabled<AssetInfo>
@@ -126,29 +136,29 @@ class AssetManager final : public AssetManagerInterface
 	{
 		uint64_t consumed = 0;
 		uint64_t last_used = 0;
 		FileHandle handle;
-		ImageAssetID id = {};
-		ImageClass image_class = ImageClass::Zeroable;
+		AssetID id = {};
+		AssetClass asset_class = AssetClass::ImageZeroable;
 		int prio = 0;
 	};
 
-	std::vector<AssetInfo *> sorted_assets;
+	Util::DynamicArray<AssetInfo *> sorted_assets;
+	Util::DynamicArray<AssetInfo *> asset_bank;
 	std::mutex asset_bank_lock;
-	std::vector<AssetInfo *> asset_bank;
 	Util::ObjectPool<AssetInfo> pool;
-	Util::AtomicAppendBuffer<ImageAssetID> lru_append;
+	Util::AtomicAppendBuffer<AssetID> lru_append;
 	Util::IntrusiveHashMapHolder<AssetInfo> file_to_assets;
 	AssetInstantiatorInterface *iface = nullptr;
 	uint32_t id_count = 0;
 	uint64_t total_consumed = 0;
-	uint64_t image_budget = 0;
-	uint64_t image_budget_per_iteration = 0;
+	uint64_t transfer_budget = 0;
+	uint64_t transfer_budget_per_iteration = 0;
 	uint64_t timestamp = 1;
 	uint32_t blocking_signals = 0;
 
 	struct CostUpdate
 	{
-		ImageAssetID id;
+		AssetID id;
 		uint64_t cost = 0;
 	};
 	std::mutex cost_update_lock;
@@ -157,7 +167,7 @@ class AssetManager final : public AssetManagerInterface
 	void adjust_update(const CostUpdate &update);
 	std::unique_ptr<TaskSignal> signal;
 
-	ImageAssetID register_image_resource_nolock(FileHandle file, ImageClass image_class, int prio);
+	AssetID register_asset_nolock(FileHandle file, AssetClass asset_class, int prio);
 	void update_costs_locked_assets();
 	void update_lru_locked_assets();
diff --git a/renderer/common_renderer_data.cpp b/renderer/common_renderer_data.cpp
index dc9fda342..b1a178d1a 100644
--- a/renderer/common_renderer_data.cpp
+++ b/renderer/common_renderer_data.cpp
@@ -112,7 +112,7 @@ void LightMesh::on_device_destroyed(const Vulkan::DeviceCreatedEvent &)
 void CommonRendererData::initialize_static_assets(AssetManager *iface, Filesystem *fs)
 {
 	LOGI("Initializing static assets.\n");
-	brdf_tables = iface->register_image_resource(*fs, "builtin://textures/ibl_brdf_lut.gtx", ImageClass::Zeroable,
-	                                             AssetManager::persistent_prio());
+	brdf_tables = iface->register_asset(*fs, "builtin://textures/ibl_brdf_lut.gtx", AssetClass::ImageZeroable,
+	                                    AssetManager::persistent_prio());
 }
 }
diff --git a/renderer/common_renderer_data.hpp b/renderer/common_renderer_data.hpp
index 4e147860c..0465a926c 100644
--- a/renderer/common_renderer_data.hpp
+++ b/renderer/common_renderer_data.hpp
@@ -57,7 +57,7 @@ class CommonRendererData final : public CommonRendererDataInterface
 {
 public:
 	LightMesh light_mesh;
-	ImageAssetID brdf_tables;
+	AssetID brdf_tables;
 	void initialize_static_assets(AssetManager *iface, Filesystem *file_iface);
 };
 }
\ No newline at end of file
diff --git a/renderer/formats/scene_formats.cpp b/renderer/formats/scene_formats.cpp
index 7dbda1a8b..6802e5c69 100644
--- a/renderer/formats/scene_formats.cpp
+++ b/renderer/formats/scene_formats.cpp
@@ -44,15 +44,20 @@ static vec3 compute_normal(const vec3 &a, const vec3 &b, const vec3 &c)
 
 struct IndexRemapping
 {
-	std::vector<unsigned> index_remap;
-	std::vector<unsigned> unique_attrib_to_source_index;
+	std::vector<uint32_t> index_remap;
+	std::vector<uint32_t> unique_attrib_to_source_index;
 };
 
 // Find duplicate indices.
-static IndexRemapping build_index_remap_list(const Mesh &mesh)
+static IndexRemapping build_attribute_remap_indices(const Mesh &mesh)
 {
-	unsigned attribute_count = unsigned(mesh.positions.size() / mesh.position_stride);
-	std::unordered_map<Util::Hash, unsigned> attribute_remapper;
+	auto attribute_count = unsigned(mesh.positions.size() / mesh.position_stride);
+	struct RemappedAttribute
+	{
+		unsigned unique_index;
+		unsigned source_index;
+	};
+	std::unordered_map<Util::Hash, RemappedAttribute> attribute_remapper;
 	IndexRemapping remapped;
 	remapped.index_remap.reserve(attribute_count);
 
@@ -66,13 +71,41 @@
 		auto hash = h.get();
 
 		auto itr = attribute_remapper.find(hash);
+		bool is_unique;
+
 		if (itr != end(attribute_remapper))
 		{
-			remapped.index_remap.push_back(itr->second);
+			bool match = true;
+			if (memcmp(mesh.positions.data() + i * mesh.position_stride,
+			           mesh.positions.data() + itr->second.source_index * mesh.position_stride,
+			           mesh.position_stride) != 0)
+			{
+				match = false;
+			}
+
+			if (match && !mesh.attributes.empty() &&
+			    memcmp(mesh.attributes.data() + i * mesh.attribute_stride,
+			           mesh.attributes.data() + itr->second.source_index * mesh.attribute_stride,
+			           mesh.attribute_stride) != 0)
+			{
+				match = false;
+			}
+
+			if (match)
+				remapped.index_remap.push_back(itr->second.unique_index);
+			else
+				LOGW("Hash collision in vertex dedup.\n");
+
+			is_unique = !match;
 		}
 		else
 		{
-			attribute_remapper[hash] = unique_count;
+			attribute_remapper[hash] = { unique_count, i };
+			is_unique = true;
+		}
+
+		if (is_unique)
+		{
 			remapped.index_remap.push_back(unique_count);
 			remapped.unique_attrib_to_source_index.push_back(i);
 			unique_count++;
@@ -82,28 +115,15 @@
 	return remapped;
 }
 
-static std::vector<uint32_t> build_canonical_index_buffer(const Mesh &mesh, const std::vector<unsigned> &index_remap)
+static std::vector<uint32_t> build_remapped_index_buffer(const Mesh &mesh, const std::vector<uint32_t> &index_remap)
 {
-	std::vector<uint32_t> index_buffer;
-	if (mesh.indices.empty())
-	{
-		index_buffer.reserve(mesh.count);
-		for (unsigned i = 0; i < mesh.count; i++)
-			index_buffer.push_back(index_remap[i]);
-	}
-	else if (mesh.index_type == VK_INDEX_TYPE_UINT32)
-	{
-		index_buffer.reserve(mesh.count);
-		for (unsigned i = 0; i < mesh.count; i++)
-			index_buffer.push_back(index_remap[reinterpret_cast<const uint32_t *>(mesh.indices.data())[i]]);
-	}
-	else if (mesh.index_type == VK_INDEX_TYPE_UINT16)
-	{
-		index_buffer.reserve(mesh.count);
-		for (unsigned i = 0; i < mesh.count; i++)
-			index_buffer.push_back(index_remap[reinterpret_cast<const uint16_t *>(mesh.indices.data())[i]]);
-	}
+	assert(mesh.topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST && mesh.index_type == VK_INDEX_TYPE_UINT32);
+	std::vector<uint32_t> index_buffer;
+	index_buffer.reserve(mesh.count);
+	const auto *indices = reinterpret_cast<const uint32_t *>(mesh.indices.data());
+	for (unsigned i = 0; i < mesh.count; i++)
+		index_buffer.push_back(index_remap[indices[i]]);
 	return index_buffer;
 }
 
@@ -190,7 +210,7 @@ static bool mesh_unroll_vertices(Mesh &mesh)
 
 	if (mesh.index_type == VK_INDEX_TYPE_UINT32)
 	{
-		const uint32_t *ibo = reinterpret_cast<const uint32_t *>(mesh.indices.data());
+		const auto *ibo = reinterpret_cast<const uint32_t *>(mesh.indices.data());
 		for (unsigned i = 0; i < mesh.count; i++)
 		{
 			uint32_t index = ibo[i];
@@ -204,7 +224,21 @@
 	}
 	else if (mesh.index_type == VK_INDEX_TYPE_UINT16)
 	{
-		const uint16_t *ibo = reinterpret_cast<const uint16_t *>(mesh.indices.data());
+		const auto *ibo = reinterpret_cast<const uint16_t *>(mesh.indices.data());
+		for (unsigned i = 0; i < mesh.count; i++)
+		{
+			uint16_t index = ibo[i];
+			memcpy(positions.data() + i * mesh.position_stride,
+			       mesh.positions.data() + index * mesh.position_stride,
+			       mesh.position_stride);
+			memcpy(attributes.data() + i * mesh.attribute_stride,
+			       mesh.attributes.data() + index * mesh.attribute_stride,
+			       mesh.attribute_stride);
+		}
+	}
+	else if (mesh.index_type == VK_INDEX_TYPE_UINT8_EXT)
+	{
+		const auto *ibo = mesh.indices.data();
 		for (unsigned i = 0; i < mesh.count; i++)
 		{
 			uint16_t index = ibo[i];
@@ -223,56 +257,122 @@ static bool mesh_unroll_vertices(Mesh &mesh)
 	return true;
 }
 
+bool mesh_canonicalize_indices(SceneFormats::Mesh &mesh)
+{
+	if (mesh.topology != VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST &&
+	    mesh.topology != VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP)
+	{
+		LOGE("Topology must be trilist or tristrip.\n");
+		return false;
+	}
+
+	std::vector<uint32_t> unrolled_indices;
+	unrolled_indices.reserve(mesh.count);
+
+	if (mesh.indices.empty())
+	{
+		for (unsigned i = 0; i < mesh.count; i++)
+			unrolled_indices.push_back(i);
+		mesh.index_type = VK_INDEX_TYPE_UINT32;
+	}
+	else if (mesh.index_type == VK_INDEX_TYPE_UINT32)
+	{
+		auto *indices = reinterpret_cast<const uint32_t *>(mesh.indices.data());
+		for (unsigned i = 0; i < mesh.count; i++)
+			unrolled_indices.push_back(indices[i]);
+	}
+	else if (mesh.index_type == VK_INDEX_TYPE_UINT16)
+	{
+		auto *indices = reinterpret_cast<const uint16_t *>(mesh.indices.data());
+		for (unsigned i = 0; i < mesh.count; i++)
+			unrolled_indices.push_back(mesh.primitive_restart && indices[i] == UINT16_MAX ? UINT32_MAX : indices[i]);
+	}
+	else if (mesh.index_type == VK_INDEX_TYPE_UINT8_EXT)
+	{
+		auto *indices = reinterpret_cast<const uint8_t *>(mesh.indices.data());
+		for (unsigned i = 0; i < mesh.count; i++)
+			unrolled_indices.push_back(mesh.primitive_restart && indices[i] == UINT8_MAX ? UINT32_MAX : indices[i]);
+	}
+
+	if (mesh.topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP)
+	{
+		std::vector<uint32_t> unstripped_indices;
+		unstripped_indices.reserve(mesh.count * 3);
+		unsigned primitive_count_since_restart = 0;
+
+		for (unsigned i = 2; i < mesh.count; i++)
+		{
+			bool emit_primitive = true;
+			if (mesh.primitive_restart &&
+			    (unrolled_indices[i - 2] == UINT32_MAX ||
+			     unrolled_indices[i - 1] == UINT32_MAX ||
+			     unrolled_indices[i - 0] == UINT32_MAX))
+			{
+				emit_primitive = false;
+				primitive_count_since_restart = 0;
+			}
+
+			if (emit_primitive)
+			{
+				unstripped_indices.push_back(unrolled_indices[i - 2]);
+				unstripped_indices.push_back(unrolled_indices[i - (1 ^ (primitive_count_since_restart & 1))]);
+				unstripped_indices.push_back(unrolled_indices[i - (primitive_count_since_restart & 1)]);
+				primitive_count_since_restart++;
+			}
+		}
+
+		unrolled_indices = std::move(unstripped_indices);
+		mesh.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
+	}
+
+	mesh.index_type = VK_INDEX_TYPE_UINT32;
+	mesh.count = uint32_t(unrolled_indices.size());
+	mesh.indices.resize(unrolled_indices.size() * sizeof(uint32_t));
+	memcpy(mesh.indices.data(), unrolled_indices.data(), mesh.indices.size());
+	return true;
+}
+
 void mesh_deduplicate_vertices(Mesh &mesh)
 {
-	auto index_remap = build_index_remap_list(mesh);
-	auto index_buffer = build_canonical_index_buffer(mesh, index_remap.index_remap);
+	mesh_canonicalize_indices(mesh);
+	auto index_remap = build_attribute_remap_indices(mesh);
+	auto index_buffer = build_remapped_index_buffer(mesh, index_remap.index_remap);
 	rebuild_new_attributes_remap_src(mesh.positions, mesh.position_stride,
 	                                 mesh.attributes, mesh.attribute_stride,
 	                                 mesh.positions, mesh.attributes, index_remap.unique_attrib_to_source_index);
 
-	mesh.index_type = VK_INDEX_TYPE_UINT32;
 	mesh.indices.resize(index_buffer.size() * sizeof(uint32_t));
-	size_t count = index_buffer.size();
-	for (size_t i = 0; i < count; i++)
-		reinterpret_cast<uint32_t *>(mesh.indices.data())[i] = index_buffer[i];
+	memcpy(mesh.indices.data(), index_buffer.data(), index_buffer.size() * sizeof(uint32_t));
 	mesh.count = unsigned(index_buffer.size());
 }
 
-Mesh mesh_optimize_index_buffer(const Mesh &mesh, bool stripify)
+bool mesh_optimize_index_buffer(Mesh &mesh, const IndexBufferOptimizeOptions &options)
 {
-	if (mesh.topology != VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST)
-		return mesh;
-
-	Mesh optimized;
-	optimized.position_stride = mesh.position_stride;
-	optimized.attribute_stride = mesh.attribute_stride;
+	if (!mesh_canonicalize_indices(mesh) || mesh.topology != VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST)
+		return false;
 
 	// Remove redundant indices and rewrite index and attribute buffers.
-	auto index_remap = build_index_remap_list(mesh);
-	auto index_buffer = build_canonical_index_buffer(mesh, index_remap.index_remap);
-	rebuild_new_attributes_remap_src(optimized.positions, optimized.position_stride,
-	                                 optimized.attributes, optimized.attribute_stride,
+	auto index_remap = build_attribute_remap_indices(mesh);
+	auto index_buffer = build_remapped_index_buffer(mesh, index_remap.index_remap);
+	rebuild_new_attributes_remap_src(mesh.positions, mesh.position_stride,
+	                                 mesh.attributes, mesh.attribute_stride,
 	                                 mesh.positions, mesh.attributes, index_remap.unique_attrib_to_source_index);
 
-	size_t vertex_count = optimized.positions.size() / optimized.position_stride;
+	size_t vertex_count = mesh.positions.size() / mesh.position_stride;
 
 	// Optimize for vertex cache.
 	meshopt_optimizeVertexCache(index_buffer.data(), index_buffer.data(), index_buffer.size(), vertex_count);
 
 	// Remap vertex fetch to get contiguous indices as much as possible.
-	std::vector<uint32_t> remap_table(optimized.positions.size() / optimized.position_stride);
+	std::vector<uint32_t> remap_table(mesh.positions.size() / mesh.position_stride);
 	meshopt_optimizeVertexFetchRemap(remap_table.data(), index_buffer.data(), index_buffer.size(), vertex_count);
 	index_buffer = remap_indices(index_buffer, remap_table);
-	rebuild_new_attributes_remap_dst(optimized.positions, optimized.position_stride,
-	                                 optimized.attributes, optimized.attribute_stride,
-	                                 optimized.positions, optimized.attributes, remap_table);
-
-	optimized.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
-	optimized.primitive_restart = false;
+	rebuild_new_attributes_remap_dst(mesh.positions, mesh.position_stride,
+	                                 mesh.attributes, mesh.attribute_stride,
+	                                 mesh.positions, mesh.attributes, remap_table);
 
-	if (stripify)
+	if (options.stripify)
 	{
 		// Try to stripify the mesh. If we end up with fewer indices, use that.
 		std::vector<uint32_t> stripped_index_buffer((index_buffer.size() / 3) * 4);
@@ -283,45 +383,41 @@ Mesh mesh_optimize_index_buffer(const Mesh &mesh, bool stripify)
 		stripped_index_buffer.resize(stripped_index_count);
 		if (stripped_index_count < index_buffer.size())
 		{
-			optimized.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP;
+			mesh.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP;
 			index_buffer = std::move(stripped_index_buffer);
-			optimized.primitive_restart = true;
+			mesh.primitive_restart = true;
 		}
 	}
 
-	uint32_t max_index = 0;
-	for (auto &i : index_buffer)
-		if (i != ~0u)
-			max_index = muglm::max(max_index, i);
-
-	if (max_index <= 0xffff) // 16-bit indices are enough.
+	bool emit_u32 = true;
+	if (options.narrow_index_buffer)
 	{
-		optimized.index_type = VK_INDEX_TYPE_UINT16;
-		optimized.indices.resize(index_buffer.size() * sizeof(uint16_t));
-		size_t count = index_buffer.size();
-		for (size_t i = 0; i < count; i++)
+		uint32_t max_index = 0;
+		for (auto &i: index_buffer)
+			if (i != ~0u)
+				max_index = muglm::max(max_index, i);
+
+		if (max_index <= 0xffff) // 16-bit indices are enough.
 		{
-			reinterpret_cast<uint16_t *>(optimized.indices.data())[i] =
-					index_buffer[i] == ~0u ? uint16_t(0xffffu) : uint16_t(index_buffer[i]);
+			mesh.index_type = VK_INDEX_TYPE_UINT16;
+			mesh.indices.resize(index_buffer.size() * sizeof(uint16_t));
+			size_t count = index_buffer.size();
+			emit_u32 = false;
+
+			auto *out_indices = reinterpret_cast<uint16_t *>(mesh.indices.data());
+			for (size_t i = 0; i < count; i++)
+				out_indices[i] = index_buffer[i] == ~0u ? uint16_t(0xffffu) : uint16_t(index_buffer[i]);
 		}
 	}
-	else
+
+	if (emit_u32)
 	{
-		optimized.index_type = VK_INDEX_TYPE_UINT32;
-		optimized.indices.resize(index_buffer.size() * sizeof(uint32_t));
-		size_t count = index_buffer.size();
-		for (size_t i = 0; i < count; i++)
-			reinterpret_cast<uint32_t *>(optimized.indices.data())[i] = index_buffer[i];
+		mesh.indices.resize(index_buffer.size() * sizeof(uint32_t));
+		memcpy(mesh.indices.data(), index_buffer.data(), index_buffer.size() * sizeof(uint32_t));
 	}
 
-	optimized.count = unsigned(index_buffer.size());
-
-	memcpy(optimized.attribute_layout, mesh.attribute_layout, sizeof(mesh.attribute_layout));
-	optimized.material_index = mesh.material_index;
-	optimized.has_material = mesh.has_material;
-	optimized.static_aabb = mesh.static_aabb;
-
-	return optimized;
+	mesh.count = unsigned(index_buffer.size());
+	return true;
 }
 
 bool mesh_recompute_tangents(Mesh &mesh)
diff --git a/renderer/formats/scene_formats.hpp b/renderer/formats/scene_formats.hpp
index 18661805e..ab2735803 100644
--- a/renderer/formats/scene_formats.hpp
+++ b/renderer/formats/scene_formats.hpp
@@ -256,7 +256,14 @@
 bool mesh_flip_tangents_w(Mesh &mesh);
 bool extract_collision_mesh(CollisionMesh &collision_mesh, const Mesh &mesh);
 
 void mesh_deduplicate_vertices(Mesh &mesh);
-Mesh mesh_optimize_index_buffer(const Mesh &mesh, bool stripify);
+bool mesh_canonicalize_indices(Mesh &mesh);
+
+struct IndexBufferOptimizeOptions
+{
+	bool narrow_index_buffer;
+	bool stripify;
+};
+bool mesh_optimize_index_buffer(Mesh &mesh, const IndexBufferOptimizeOptions &options);
 std::unordered_set build_used_nodes_in_scene(const SceneNodes &scene, const std::vector &nodes);
 }
 }
diff --git a/renderer/ground.cpp b/renderer/ground.cpp
index c25def4af..57072e87a 100644
--- a/renderer/ground.cpp
+++ b/renderer/ground.cpp
@@ -182,12 +182,14 @@ Ground::Ground(unsigned size_, const TerrainInfo &info_)
 	num_patches_z = size / info.base_patch_size;
 	patch_lods.resize(num_patches_x * num_patches_z);
 
-	heights = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), info.heightmap, ImageClass::Zeroable);
-	normals = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), info.normalmap, ImageClass::Normal);
-	occlusion = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), info.occlusionmap, ImageClass::Zeroable);
-	normals_fine = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), info.normalmap_fine, ImageClass::Normal);
-	base_color = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), info.base_color, ImageClass::Color);
-	type_map = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), info.splatmap, ImageClass::Zeroable);
+	heights = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), info.heightmap, AssetClass::ImageZeroable);
+	normals = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), info.normalmap, AssetClass::ImageNormal);
+	occlusion = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), info.occlusionmap,
+	                                                    AssetClass::ImageZeroable);
+	normals_fine = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), info.normalmap_fine,
+	                                                       AssetClass::ImageNormal);
+	base_color = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), info.base_color, AssetClass::ImageColor);
+	type_map = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), info.splatmap, AssetClass::ImageZeroable);
 
 	EVENT_MANAGER_REGISTER_LATCH(Ground, on_device_created, on_device_destroyed, DeviceCreatedEvent);
 }
diff --git a/renderer/ground.hpp b/renderer/ground.hpp
index 273338708..3377f711d 100644
--- a/renderer/ground.hpp
+++ b/renderer/ground.hpp
@@ -149,7 +149,7 @@ class Ground : public Util::IntrusivePtrEnabled<Ground>, public PerFrameRefresha
 	void refresh(const RenderContext &context, TaskComposer &composer) override;
 
-	ImageAssetID heights, normals, occlusion, normals_fine, base_color, type_map;
+	AssetID heights, normals, occlusion, normals_fine, base_color, type_map;
 	Vulkan::ImageHandle lod_map;
 	void on_device_created(const Vulkan::DeviceCreatedEvent &e);
 	void on_device_destroyed(const Vulkan::DeviceCreatedEvent &e);
diff --git a/renderer/lights/decal_volume.cpp b/renderer/lights/decal_volume.cpp
index 3e5e9cb66..30095675b 100644
--- a/renderer/lights/decal_volume.cpp
+++ b/renderer/lights/decal_volume.cpp
@@ -29,9 +29,9 @@ namespace Granite
 {
 VolumetricDecal::VolumetricDecal()
 {
-	tex = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(),
-	                                                       "builtin://textures/decal.png",
-	                                                       ImageClass::Color);
+	tex = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(),
+	                                              "builtin://textures/decal.png",
+	                                              AssetClass::ImageColor);
 }
 
 const Vulkan::ImageView *VolumetricDecal::get_decal_view(Vulkan::Device &device) const
diff --git a/renderer/lights/decal_volume.hpp b/renderer/lights/decal_volume.hpp
index 62f86d556..062370cf9 100644
--- a/renderer/lights/decal_volume.hpp
+++ b/renderer/lights/decal_volume.hpp
@@ -38,6 +38,6 @@ class VolumetricDecal
 	static const AABB &get_static_aabb();
 
 private:
-	ImageAssetID tex;
+	AssetID tex;
 };
 }
diff --git a/renderer/material.hpp b/renderer/material.hpp
index b7c4ce1ed..c051a83d8 100644
--- a/renderer/material.hpp
+++ b/renderer/material.hpp
@@ -85,19 +85,19 @@ struct Material
 	{
 		info = std::move(info_);
 
-		static const ImageClass image_classes[] = {
-			ImageClass::Color,
-			ImageClass::Normal,
-			ImageClass::MetallicRoughness,
-			ImageClass::Color,
-			ImageClass::Color,
+		static const AssetClass image_classes[] = {
+			AssetClass::ImageColor,
+			AssetClass::ImageNormal,
+			AssetClass::ImageMetallicRoughness,
+			AssetClass::ImageColor,
+			AssetClass::ImageColor,
 		};
 
 		for (unsigned i = 0; i < Util::ecast(TextureKind::Count); i++)
 		{
 			if (!info.paths[i].empty())
 			{
-				textures[i] = GRANITE_ASSET_MANAGER()->register_image_resource(
+				textures[i] = GRANITE_ASSET_MANAGER()->register_asset(
 						*GRANITE_FILESYSTEM(), info.paths[i], image_classes[i]);
 			}
 		}
@@ -116,7 +116,7 @@ struct Material
 		return info;
 	}
 
-	ImageAssetID textures[Util::ecast(TextureKind::Count)];
+	AssetID textures[Util::ecast(TextureKind::Count)];
 	bool needs_emissive = false;
 	uint32_t
shader_variant = 0; diff --git a/renderer/mesh_util.cpp b/renderer/mesh_util.cpp index 69e0b6829..f89f29816 100644 --- a/renderer/mesh_util.cpp +++ b/renderer/mesh_util.cpp @@ -890,8 +890,8 @@ SkyCylinder::SkyCylinder(const std::string &bg_path) { if (!bg_path.empty()) { - texture = GRANITE_ASSET_MANAGER()->register_image_resource( - *GRANITE_FILESYSTEM(), bg_path, ImageClass::Color); + texture = GRANITE_ASSET_MANAGER()->register_asset( + *GRANITE_FILESYSTEM(), bg_path, AssetClass::ImageColor); } EVENT_MANAGER_REGISTER_LATCH(SkyCylinder, on_device_created, on_device_destroyed, DeviceCreatedEvent); @@ -1056,12 +1056,12 @@ Skybox::Skybox(const std::string &bg_path) { if (!bg_path.empty()) { - texture = GRANITE_ASSET_MANAGER()->register_image_resource( - *GRANITE_FILESYSTEM(), bg_path, ImageClass::Color); + texture = GRANITE_ASSET_MANAGER()->register_asset( + *GRANITE_FILESYSTEM(), bg_path, AssetClass::ImageColor); } } -void Skybox::set_image(ImageAssetID skybox) +void Skybox::set_image(AssetID skybox) { texture = skybox; } @@ -1195,8 +1195,8 @@ static void texture_plane_render(CommandBuffer &cmd, const RenderQueueData *info TexturePlane::TexturePlane(const std::string &normal_path) { - normalmap = GRANITE_ASSET_MANAGER()->register_image_resource( - *GRANITE_FILESYSTEM(), normal_path, ImageClass::Normal); + normalmap = GRANITE_ASSET_MANAGER()->register_asset( + *GRANITE_FILESYSTEM(), normal_path, AssetClass::ImageNormal); } void TexturePlane::setup_render_pass_resources(RenderGraph &graph) diff --git a/renderer/mesh_util.hpp b/renderer/mesh_util.hpp index 3394f7884..1b1b4fc13 100644 --- a/renderer/mesh_util.hpp +++ b/renderer/mesh_util.hpp @@ -195,7 +195,7 @@ class Skybox : public AbstractRenderable, public EventHandler { public: Skybox(const std::string &bg_path = ""); - void set_image(ImageAssetID skybox); + void set_image(AssetID skybox); void get_render_info(const RenderContext &context, const RenderInfoComponent *transform, RenderQueue &queue) const override; @@ -207,7 +207,7 @@ class Skybox : public AbstractRenderable, public EventHandler private: vec3 color = vec3(1.0f); - ImageAssetID texture; + AssetID texture; }; class SkyCylinder : public AbstractRenderable, public EventHandler @@ -231,7 +231,7 @@ class SkyCylinder : public AbstractRenderable, public EventHandler private: vec3 color = vec3(1.0f); float scale = 1.0f; - ImageAssetID texture; + AssetID texture; void on_device_created(const Vulkan::DeviceCreatedEvent &event); void on_device_destroyed(const Vulkan::DeviceCreatedEvent &event); @@ -284,7 +284,7 @@ class TexturePlane : public AbstractRenderable, public RenderPassCreator private: const Vulkan::ImageView *reflection = nullptr; const Vulkan::ImageView *refraction = nullptr; - ImageAssetID normalmap; + AssetID normalmap; RenderQueue internal_queue; vec3 position; diff --git a/renderer/post/smaa.cpp b/renderer/post/smaa.cpp index 8952fcb7e..1912bc0fc 100644 --- a/renderer/post/smaa.cpp +++ b/renderer/post/smaa.cpp @@ -145,10 +145,12 @@ void setup_smaa_postprocess(RenderGraph &graph, TemporalJitter &jitter, return true; }); - auto area = GRANITE_ASSET_MANAGER()->register_image_resource( - *GRANITE_FILESYSTEM(), "builtin://textures/smaa/area.gtx", ImageClass::Zeroable, AssetManager::persistent_prio()); - auto search = GRANITE_ASSET_MANAGER()->register_image_resource( - *GRANITE_FILESYSTEM(), "builtin://textures/smaa/search.gtx", ImageClass::Zeroable, AssetManager::persistent_prio()); + auto area = GRANITE_ASSET_MANAGER()->register_asset( + *GRANITE_FILESYSTEM(), 
"builtin://textures/smaa/area.gtx", AssetClass::ImageZeroable, + AssetManager::persistent_prio()); + auto search = GRANITE_ASSET_MANAGER()->register_asset( + *GRANITE_FILESYSTEM(), "builtin://textures/smaa/search.gtx", AssetClass::ImageZeroable, + AssetManager::persistent_prio()); smaa_weight.set_build_render_pass([&, area, search, edge = masked_edge, q = smaa_quality](Vulkan::CommandBuffer &cmd) { auto &input_image = graph.get_physical_texture_resource(weight_input_res); diff --git a/renderer/sprite.hpp b/renderer/sprite.hpp index bb2446ffa..7b078e5b4 100644 --- a/renderer/sprite.hpp +++ b/renderer/sprite.hpp @@ -61,8 +61,8 @@ struct SpriteRenderInfo struct Sprite : AbstractRenderable { DrawPipeline pipeline = DrawPipeline::Opaque; - ImageAssetID texture; - ImageAssetID texture_alt; + AssetID texture; + AssetID texture_alt; Vulkan::StockSampler sampler = Vulkan::StockSampler::LinearWrap; enum ShaderVariantFlagBits diff --git a/scene-export/CMakeLists.txt b/scene-export/CMakeLists.txt index b217e6f3d..fa741831a 100644 --- a/scene-export/CMakeLists.txt +++ b/scene-export/CMakeLists.txt @@ -6,6 +6,7 @@ add_granite_internal_lib(granite-scene-export gltf_export.cpp gltf_export.hpp rgtc_compressor.cpp rgtc_compressor.hpp tmx_parser.cpp tmx_parser.hpp + meshlet_export.cpp meshlet_export.hpp texture_utils.cpp texture_utils.hpp) target_include_directories(granite-scene-export PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/scene-export/gltf_export.cpp b/scene-export/gltf_export.cpp index a2c0084a3..9aa9e1cbb 100644 --- a/scene-export/gltf_export.cpp +++ b/scene-export/gltf_export.cpp @@ -892,7 +892,17 @@ void RemapState::emit_mesh(unsigned remapped_index) { Mesh new_mesh; if (options->optimize_meshes) - new_mesh = mesh_optimize_index_buffer(*mesh.info[remapped_index], options->stripify_meshes); + { + new_mesh = *mesh.info[remapped_index]; + IndexBufferOptimizeOptions opts = {}; + opts.narrow_index_buffer = true; + opts.stripify = options->stripify_meshes; + if (!mesh_optimize_index_buffer(new_mesh, opts)) + { + LOGE("Failed to optimize index buffer.\n"); + return; + } + } auto &output_mesh = options->optimize_meshes ? new_mesh : *mesh.info[remapped_index]; mesh_cache.resize(std::max(mesh_cache.size(), remapped_index + 1)); diff --git a/scene-export/meshlet_export.cpp b/scene-export/meshlet_export.cpp new file mode 100644 index 000000000..4f39eacc2 --- /dev/null +++ b/scene-export/meshlet_export.cpp @@ -0,0 +1,735 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "meshlet_export.hpp"
+#include "meshoptimizer.h"
+#include "enum_cast.hpp"
+#include "math.hpp"
+#include "filesystem.hpp"
+#include "meshlet.hpp"
+#include <algorithm>
+#include <unordered_map>
+#include <cassert>
+#include <cstring>
+
+namespace Granite
+{
+namespace Meshlet
+{
+using namespace Vulkan::Meshlet;
+
+struct Metadata : Header
+{
+	Bound bound;
+	Stream u32_streams[MaxU32Streams];
+};
+
+struct CombinedMesh
+{
+	uint32_t stream_count;
+	MeshStyle mesh_style;
+
+	std::vector<Metadata> meshlets;
+};
+
+struct Encoded
+{
+	std::vector<uint32_t> payload;
+	CombinedMesh mesh;
+};
+
+struct Meshlet
+{
+	uint32_t offset;
+	uint32_t count;
+};
+
+struct PrimitiveAnalysisResult
+{
+	uint32_t num_primitives;
+	uint32_t num_vertices;
+};
+
+static i16vec4 encode_vec3_to_snorm_exp(vec3 v)
+{
+	vec3 vabs = abs(v);
+	float max_scale = max(max(vabs.x, vabs.y), vabs.z);
+	int max_scale_log2 = int(muglm::floor(log2(max_scale)));
+	int scale_log2 = 14 - max_scale_log2;
+
+	// Maximum component should have range of [1, 2) since we use floor of log2, so scale with 2^14 instead of 15.
+	v.x = ldexpf(v.x, scale_log2);
+	v.y = ldexpf(v.y, scale_log2);
+	v.z = ldexpf(v.z, scale_log2);
+	v = clamp(round(v), vec3(-0x8000), vec3(0x7fff));
+
+	return i16vec4(i16vec3(v), int16_t(-scale_log2));
+}
+
+static i16vec3 encode_vec2_to_snorm_exp(vec2 v)
+{
+	vec2 vabs = abs(v);
+	float max_scale = max(vabs.x, vabs.y);
+	int max_scale_log2 = int(muglm::floor(log2(max_scale)));
+	int scale_log2 = 14 - max_scale_log2;
+
+	// UVs are unorm scaled, don't need more accuracy than this.
+	// If all UVs are in range of [0, 1] space, we should get a constant exponent which aids compression.
+	scale_log2 = min(scale_log2, 15);
+
+	// Maximum component should have range of [1, 2) since we use floor of log2, so scale with 2^14 instead of 15.
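+	// The negated scale is stored alongside the mantissas, so a decoder only needs
+	// one ldexp(float(mantissa), exponent) per component (see decode_snorm_exp() below).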
+	v.x = ldexpf(v.x, scale_log2);
+	v.y = ldexpf(v.y, scale_log2);
+	v = clamp(round(v), vec2(-0x8000), vec2(0x7fff));
+
+	return i16vec3(i16vec2(v), int16_t(-scale_log2));
+}
+
+static std::vector<i16vec4> mesh_extract_position_snorm_exp(const SceneFormats::Mesh &mesh)
+{
+	std::vector<i16vec4> encoded_positions;
+	std::vector<vec3> positions;
+
+	size_t num_positions = mesh.positions.size() / mesh.position_stride;
+	positions.resize(num_positions);
+	auto &layout = mesh.attribute_layout[Util::ecast(MeshAttribute::Position)];
+	auto fmt = layout.format;
+
+	if (fmt == VK_FORMAT_R32G32B32A32_SFLOAT || fmt == VK_FORMAT_R32G32B32_SFLOAT)
+	{
+		for (size_t i = 0; i < num_positions; i++)
+			memcpy(positions[i].data, mesh.positions.data() + i * mesh.position_stride + layout.offset, sizeof(float) * 3);
+	}
+	else if (fmt == VK_FORMAT_UNDEFINED)
+		return {};
+	else
+	{
+		LOGE("Unexpected format %u.\n", fmt);
+		return {};
+	}
+
+	encoded_positions.reserve(positions.size());
+	for (auto &pos : positions)
+		encoded_positions.push_back(encode_vec3_to_snorm_exp(pos));
+
+	return encoded_positions;
+}
+
+static std::vector<i8vec4> mesh_extract_normal_tangent_oct8(const SceneFormats::Mesh &mesh, MeshAttribute attr)
+{
+	std::vector<i8vec4> encoded_attributes;
+	std::vector<vec4> normals;
+
+	auto &layout = mesh.attribute_layout[Util::ecast(attr)];
+	auto fmt = layout.format;
+
+	size_t num_attrs = mesh.attributes.size() / mesh.attribute_stride;
+	normals.resize(num_attrs);
+
+	if (fmt == VK_FORMAT_R32G32B32_SFLOAT)
+	{
+		for (size_t i = 0; i < num_attrs; i++)
+		{
+			memcpy(normals[i].data,
+			       mesh.attributes.data() + i * mesh.attribute_stride + layout.offset,
+			       sizeof(float) * 3);
+			normals[i].w = 0.0f;
+		}
+	}
+	else if (fmt == VK_FORMAT_R32G32B32A32_SFLOAT)
+	{
+		for (size_t i = 0; i < num_attrs; i++)
+		{
+			memcpy(normals[i].data,
+			       mesh.attributes.data() + i * mesh.attribute_stride + layout.offset,
+			       sizeof(float) * 4);
+		}
+	}
+	else if (fmt == VK_FORMAT_UNDEFINED)
+		return {};
+	else
+	{
+		LOGE("Unexpected format %u.\n", fmt);
+		return {};
+	}
+
+	encoded_attributes.resize(normals.size());
+	meshopt_encodeFilterOct(encoded_attributes.data(), encoded_attributes.size(),
+	                        sizeof(i8vec4), 8, normals[0].data);
+	for (auto &n : encoded_attributes)
+		n.w = n.w <= 0 ? -1 : 0;
+
+	return encoded_attributes;
+}
+
+static i16vec4 encode_uv_to_snorm_scale(vec2 uv)
+{
+	// UVs tend to be in [0, 1] range. Readjust to use more of the available range.
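+	// Remap [0, 1] to [-1, 1] so the full signed 16-bit range takes part in the encoding.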
+	uv = 2.0f * uv - 1.0f;
+	return i16vec4(encode_vec2_to_snorm_exp(uv), 0);
+}
+
+static std::vector<i16vec4> mesh_extract_uv_snorm_scale(const SceneFormats::Mesh &mesh)
+{
+	std::vector<i16vec4> encoded_uvs;
+	std::vector<vec2> uvs;
+
+	size_t num_uvs = mesh.attributes.size() / mesh.attribute_stride;
+	uvs.resize(num_uvs);
+	auto &layout = mesh.attribute_layout[Util::ecast(MeshAttribute::UV)];
+	auto fmt = layout.format;
+
+	if (fmt == VK_FORMAT_R32G32_SFLOAT)
+	{
+		for (size_t i = 0; i < num_uvs; i++)
+			memcpy(uvs[i].data, mesh.attributes.data() + i * mesh.attribute_stride + layout.offset, sizeof(float) * 2);
+	}
+	else if (fmt == VK_FORMAT_R16G16_UNORM)
+	{
+		for (size_t i = 0; i < num_uvs; i++)
+		{
+			u16vec2 u16;
+			memcpy(u16.data, mesh.attributes.data() + i * mesh.attribute_stride + layout.offset, sizeof(uint16_t) * 2);
+			uvs[i] = vec2(u16) * float(1.0f / 0xffff);
+		}
+	}
+	else if (fmt == VK_FORMAT_UNDEFINED)
+		return {};
+	else
+	{
+		LOGE("Unexpected format %u.\n", fmt);
+		return {};
+	}
+
+	encoded_uvs.reserve(uvs.size());
+	for (auto &uv : uvs)
+		encoded_uvs.push_back(encode_uv_to_snorm_scale(uv));
+
+	return encoded_uvs;
+}
+
+static vec3 decode_snorm_exp(i16vec4 p)
+{
+	vec3 result;
+	result.x = ldexpf(float(p.x), p.w);
+	result.y = ldexpf(float(p.y), p.w);
+	result.z = ldexpf(float(p.z), p.w);
+	return result;
+}
+
+static PrimitiveAnalysisResult analyze_primitive_count(std::unordered_map<uint32_t, uint32_t> &vertex_remap,
+                                                       const uint32_t *index_buffer, uint32_t max_num_primitives)
+{
+	PrimitiveAnalysisResult result = {};
+	uint32_t vertex_count = 0;
+
+	// We can reference a maximum of 256 vertices.
+	vertex_remap.clear();
+
+	for (uint32_t i = 0; i < max_num_primitives; i++)
+	{
+		uint32_t index0 = index_buffer[3 * i + 0];
+		uint32_t index1 = index_buffer[3 * i + 1];
+		uint32_t index2 = index_buffer[3 * i + 2];
+
+		vertex_count = uint32_t(vertex_remap.size());
+
+		vertex_remap.insert({index0, uint32_t(vertex_remap.size())});
+		vertex_remap.insert({index1, uint32_t(vertex_remap.size())});
+		vertex_remap.insert({index2, uint32_t(vertex_remap.size())});
+
+		// If this primitive causes us to go out of bounds, reset.
+		if (vertex_remap.size() > MaxVertices)
+		{
+			max_num_primitives = i;
+			break;
+		}
+
+		vertex_count = uint32_t(vertex_remap.size());
+	}
+
+	result.num_primitives = max_num_primitives;
+	result.num_vertices = vertex_count;
+	return result;
+}
+
+// Analyze bits required to encode a signed delta.
+static uvec4 compute_required_bits_unsigned(u8vec4 delta)
+{
+	uvec4 result;
+	for (unsigned i = 0; i < 4; i++)
+	{
+		uint32_t v = delta[i];
+		result[i] = v == 0 ? 0 : (32 - leading_zeroes(v));
+	}
+	return result;
+}
+
+static uvec4 compute_required_bits_signed(u8vec4 delta)
+{
+	uvec4 result;
+	for (unsigned i = 0; i < 4; i++)
+	{
+		uint32_t v = delta[i];
+
+		if (v == 0)
+		{
+			result[i] = 0;
+		}
+		else
+		{
+			if (v >= 0x80u)
+				v ^= 0xffu;
+			result[i] = v == 0 ? 1 : (33 - leading_zeroes(v));
+		}
+	}
+	return result;
+}
+
+static uint32_t extract_bit_plane(const uint8_t *bytes, unsigned bit_index)
+{
+	uint32_t u32 = 0;
+	for (unsigned i = 0; i < 32; i++)
+		u32 |= ((bytes[4 * i] >> bit_index) & 1u) << i;
+	return u32;
+}
+
+static void find_linear_predictor(uint16_t *predictor,
+                                  const u8vec4 (&stream_buffer)[MaxElements],
+                                  unsigned num_elements)
+{
+	// Sign-extend since the deltas are considered to be signed ints.
+	ivec4 unrolled_data[MaxElements];
+	for (unsigned i = 0; i < num_elements; i++)
+		unrolled_data[i] = ivec4(i8vec4(stream_buffer[i]));
+
+	// Simple linear regression.
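+	// Per component: b = (n * sum(xy) - sum(x) * sum(y)) / (n * sum(x^2) - sum(x)^2),
+	// a = (sum(y) - b * sum(x)) / n, then quantized to u8.8 fixed point below.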
+	// Pilfered from: https://www.codesansar.com/numerical-methods/linear-regression-method-using-c-programming.htm
+	ivec4 x{0}, x2{0}, y{0}, xy{0};
+	for (unsigned i = 0; i < num_elements; i++)
+	{
+		x += int(i);
+		x2 += int(i * i);
+		y += unrolled_data[i];
+		xy += int(i) * unrolled_data[i];
+	}
+
+	int n = int(num_elements);
+	ivec4 b_denom = (n * x2 - x * x);
+	b_denom = select(b_denom, ivec4(1), equal(ivec4(0), b_denom));
+
+	// Encode in u8.8 fixed point.
+	ivec4 b = (ivec4(256) * (n * xy - x * y)) / b_denom;
+	ivec4 a = ((ivec4(256) * y - b * x)) / n;
+
+	for (unsigned i = 0; i < 4; i++)
+		predictor[i] = uint16_t(a[i]);
+	for (unsigned i = 0; i < 4; i++)
+		predictor[4 + i] = uint16_t(b[i]);
+}
+
+static void encode_stream(std::vector<uint32_t> &out_payload_buffer,
+                          Stream &stream, u8vec4 (&stream_buffer)[MaxElements],
+                          unsigned num_elements)
+{
+	stream.offset_from_base_u32 = uint32_t(out_payload_buffer.size());
+
+	// Delta-encode
+	u8vec4 current_value;
+	if (num_elements > 1)
+		current_value = u8vec4(2) * stream_buffer[0] - stream_buffer[1];
+	else
+		current_value = stream_buffer[0];
+	u8vec4 bias_value = current_value;
+
+	for (unsigned i = 0; i < num_elements; i++)
+	{
+		u8vec4 next_value = stream_buffer[i];
+		stream_buffer[i] = next_value - current_value;
+		current_value = next_value;
+	}
+
+	// Find optimal linear predictor.
+	find_linear_predictor(stream.predictor, stream_buffer, num_elements);
+
+	// u8.8 fixed point.
+	auto base_predictor = u16vec4(stream.predictor[0], stream.predictor[1], stream.predictor[2], stream.predictor[3]);
+	auto linear_predictor = u16vec4(stream.predictor[4], stream.predictor[5], stream.predictor[6], stream.predictor[7]);
+
+	for (unsigned i = 0; i < num_elements; i++)
+	{
+		// Only predict in-bounds elements, since we want all out of bounds elements to be encoded to 0 delta
+		// without having them affect the predictor.
+		stream_buffer[i] -= u8vec4((base_predictor + linear_predictor * uint16_t(i)) >> uint16_t(8));
+	}
+
+	for (unsigned i = num_elements; i < MaxElements; i++)
+		stream_buffer[i] = u8vec4(0);
+
+	// Try to adjust the range such that it can fit in fewer bits.
+	// We can use the constant term in the linear predictor to nudge values in place.
+	i8vec4 lo(127);
+	i8vec4 hi(-128);
+
+	for (unsigned i = 0; i < num_elements; i++)
+	{
+		lo = min(lo, i8vec4(stream_buffer[i]));
+		hi = max(hi, i8vec4(stream_buffer[i]));
+	}
+
+	uvec4 full_bits = compute_required_bits_unsigned(u8vec4(hi - lo));
+	u8vec4 target_lo_value = u8vec4(-((uvec4(1) << full_bits) >> 1u));
+	u8vec4 bias = target_lo_value - u8vec4(lo);
+
+	for (unsigned i = 0; i < num_elements; i++)
+		stream_buffer[i] += bias;
+
+	for (unsigned i = 0; i < 4; i++)
+		stream.predictor[i] -= uint16_t(bias[i]) << 8;
+
+	// Based on the linear predictor, it's possible that the encoded value in stream_buffer[0] becomes non-zero again.
+	// This is undesirable, since we can use the initial value to force a delta of 0 here, saving precious bits.
+	bias_value += stream_buffer[0];
+	stream_buffer[0] = u8vec4(0);
+
+	// Simple linear predictor, base equal elements[0], gradient = 0.
+	stream.predictor[8] = uint16_t((bias_value.y << 8) | bias_value.x);
+	stream.predictor[9] = uint16_t((bias_value.w << 8) | bias_value.z);
+
+	// Encode 32 elements at once.
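+	// Each chunk of 32 deltas is transposed into bit-planes: plane k packs bit k of one
+	// component from all 32 elements into a single u32. Only the planes below the
+	// required bit count are emitted, which is where the compression comes from.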
+	for (unsigned chunk_index = 0; chunk_index < MaxElements / 32; chunk_index++)
+	{
+		uvec4 required_bits = {};
+		for (unsigned i = 0; i < 32; i++)
+			required_bits = max(required_bits, compute_required_bits_signed(stream_buffer[chunk_index * 32 + i]));
+
+		// Encode bit counts.
+		stream.bitplane_meta[chunk_index] = uint16_t((required_bits.x << 0) | (required_bits.y << 4) |
+		                                             (required_bits.z << 8) | (required_bits.w << 12));
+
+		for (unsigned i = 0; i < required_bits.x; i++)
+			out_payload_buffer.push_back(extract_bit_plane(&stream_buffer[chunk_index * 32][0], i));
+		for (unsigned i = 0; i < required_bits.y; i++)
+			out_payload_buffer.push_back(extract_bit_plane(&stream_buffer[chunk_index * 32][1], i));
+		for (unsigned i = 0; i < required_bits.z; i++)
+			out_payload_buffer.push_back(extract_bit_plane(&stream_buffer[chunk_index * 32][2], i));
+		for (unsigned i = 0; i < required_bits.w; i++)
+			out_payload_buffer.push_back(extract_bit_plane(&stream_buffer[chunk_index * 32][3], i));
+	}
+}
+
+static void encode_mesh(Encoded &encoded,
+                        const Meshlet *meshlets, size_t num_meshlets,
+                        const uint32_t *index_buffer, uint32_t primitive_count,
+                        const uint32_t *attributes,
+                        unsigned num_u32_streams)
+{
+	encoded = {};
+	auto &mesh = encoded.mesh;
+	mesh.stream_count = num_u32_streams + 1;
+	mesh.meshlets.reserve(num_meshlets);
+	uint32_t base_vertex_offset = 0;
+
+	std::unordered_map<uint32_t, uint32_t> vbo_remap;
+	uint32_t primitive_index = 0;
+
+	for (uint32_t meshlet_index = 0; meshlet_index < num_meshlets; meshlet_index++)
+	{
+		uint32_t primitives_to_process = min(primitive_count - primitive_index, meshlets[meshlet_index].count);
+		assert(primitives_to_process);
+		assert(primitive_count > primitive_index);
+
+		primitive_index = meshlets[meshlet_index].offset;
+
+		auto analysis_result = analyze_primitive_count(
+				vbo_remap, index_buffer + 3 * primitive_index,
+				primitives_to_process);
+
+		assert(analysis_result.num_primitives);
+		assert(analysis_result.num_vertices);
+
+		primitives_to_process = analysis_result.num_primitives;
+
+		Metadata meshlet = {};
+		u8vec4 stream_buffer[MaxElements];
+
+		meshlet.base_vertex_offset = base_vertex_offset;
+		meshlet.num_primitives_minus_1 = analysis_result.num_primitives - 1;
+		meshlet.num_attributes_minus_1 = analysis_result.num_vertices - 1;
+		meshlet.reserved = 0;
+
+		// Encode index buffer.
+		for (uint32_t i = 0; i < analysis_result.num_primitives; i++)
+		{
+			uint8_t i0 = vbo_remap[index_buffer[3 * (primitive_index + i) + 0]];
+			uint8_t i1 = vbo_remap[index_buffer[3 * (primitive_index + i) + 1]];
+			uint8_t i2 = vbo_remap[index_buffer[3 * (primitive_index + i) + 2]];
+			stream_buffer[i] = u8vec4(i0, i1, i2, 0);
+		}
+
+		encode_stream(encoded.payload, meshlet.u32_streams[0], stream_buffer, analysis_result.num_primitives);
+
+		// Handle spill region just in case.
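+		// vbo_remap iterates in hash order; rebuild the meshlet-local -> global vertex
+		// table in local-index order by packing the local index into the high 32 bits
+		// and sorting.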
+		uint64_t vbo_remapping[MaxVertices + 3];
+		unsigned vbo_index = 0;
+		for (auto &v : vbo_remap)
+		{
+			assert(vbo_index < MaxVertices + 3);
+			vbo_remapping[vbo_index++] = (uint64_t(v.second) << 32) | v.first;
+		}
+		std::sort(vbo_remapping, vbo_remapping + vbo_index);
+
+		for (uint32_t stream_index = 0; stream_index < num_u32_streams; stream_index++)
+		{
+			for (uint32_t i = 0; i < analysis_result.num_vertices; i++)
+			{
+				auto vertex_index = uint32_t(vbo_remapping[i]);
+				uint32_t payload = attributes[stream_index + num_u32_streams * vertex_index];
+				memcpy(stream_buffer[i].data, &payload, sizeof(payload));
+			}
+
+			encode_stream(encoded.payload, meshlet.u32_streams[stream_index + 1], stream_buffer,
+			              analysis_result.num_vertices);
+		}
+
+		mesh.meshlets.push_back(meshlet);
+		base_vertex_offset += analysis_result.num_vertices;
+		primitive_index += primitives_to_process;
+	}
+}
+
+static bool export_encoded_mesh(const std::string &path, const Encoded &encoded)
+{
+	size_t required_size = 0;
+
+	FormatHeader header = {};
+
+	header.style = encoded.mesh.mesh_style;
+	header.u32_stream_count = encoded.mesh.stream_count;
+	header.meshlet_count = uint32_t(encoded.mesh.meshlets.size());
+	header.payload_size_words = uint32_t(encoded.payload.size());
+
+	required_size += sizeof(magic);
+	required_size += sizeof(FormatHeader);
+
+	// Per-meshlet metadata.
+	required_size += encoded.mesh.meshlets.size() * sizeof(Header);
+
+	// Bounds.
+	required_size += encoded.mesh.meshlets.size() * sizeof(Bound);
+
+	// Stream metadata.
+	required_size += encoded.mesh.stream_count * encoded.mesh.meshlets.size() * sizeof(Stream);
+
+	// Payload.
+	// Need a padding word to speed up decoder.
+	required_size += (encoded.payload.size() + 1) * sizeof(uint32_t);
+
+	auto file = GRANITE_FILESYSTEM()->open(path, FileMode::WriteOnly);
+	if (!file)
+		return false;
+
+	auto mapping = file->map_write(required_size);
+	if (!mapping)
+		return false;
+
+	auto *ptr = mapping->mutable_data<unsigned char>();
+
+	memcpy(ptr, magic, sizeof(magic));
+	ptr += sizeof(magic);
+	memcpy(ptr, &header, sizeof(header));
+	ptr += sizeof(header);
+
+	for (uint32_t i = 0; i < header.meshlet_count; i++)
+	{
+		auto &gpu = static_cast<const Header &>(encoded.mesh.meshlets[i]);
+		memcpy(ptr, &gpu, sizeof(gpu));
+		ptr += sizeof(gpu);
+	}
+
+	for (uint32_t i = 0; i < header.meshlet_count; i++)
+	{
+		auto &bound = encoded.mesh.meshlets[i].bound;
+		memcpy(ptr, &bound, sizeof(bound));
+		ptr += sizeof(bound);
+	}
+
+	for (uint32_t i = 0; i < header.meshlet_count; i++)
+	{
+		for (uint32_t j = 0; j < header.u32_stream_count; j++)
+		{
+			memcpy(ptr, &encoded.mesh.meshlets[i].u32_streams[j], sizeof(Stream));
+			ptr += sizeof(Stream);
+		}
+	}
+
+	memcpy(ptr, encoded.payload.data(), encoded.payload.size() * sizeof(uint32_t));
+	ptr += encoded.payload.size() * sizeof(uint32_t);
+	memset(ptr, 0, sizeof(uint32_t));
+	return true;
+}
+
+bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, MeshStyle style)
+{
+	if (!mesh_optimize_index_buffer(mesh, {}))
+		return false;
+
+	std::vector<i16vec4> positions, uv;
+	std::vector<i8vec4> normals, tangent;
+
+	unsigned num_u32_streams = 0;
+
+	switch (style)
+	{
+	case MeshStyle::Skinned:
+		LOGE("Unimplemented.\n");
+		return false;
+	case MeshStyle::Textured:
+		uv = mesh_extract_uv_snorm_scale(mesh);
+		num_u32_streams += 2;
+		if (uv.empty())
+		{
+			LOGE("No UVs.\n");
+			return false;
+		}
+		// Fallthrough
+	case MeshStyle::Untextured:
+		normals = mesh_extract_normal_tangent_oct8(mesh, MeshAttribute::Normal);
+		tangent = mesh_extract_normal_tangent_oct8(mesh, MeshAttribute::Tangent);
+		if (normals.empty() || tangent.empty())
+		{
+			LOGE("No normal or tangent.\n");
+			return false;
+		}
+		num_u32_streams += 2;
+		// Fallthrough
+	case MeshStyle::Wireframe:
+		positions = mesh_extract_position_snorm_exp(mesh);
+		if (positions.empty())
+		{
+			LOGE("No positions.\n");
+			return false;
+		}
+		num_u32_streams += 2;
+		break;
+
+	default:
+		LOGE("Unknown mesh style.\n");
+		return false;
+	}
+
+	std::vector<uint32_t> attributes(num_u32_streams * positions.size());
+	uint32_t *ptr = attributes.data();
+	for (size_t i = 0, n = positions.size(); i < n; i++)
+	{
+		memcpy(ptr, positions[i].data, sizeof(positions.front()));
+		ptr += sizeof(positions.front()) / sizeof(uint32_t);
+
+		if (!normals.empty())
+		{
+			memcpy(ptr, normals[i].data, sizeof(normals.front()));
+			ptr += sizeof(normals.front()) / sizeof(uint32_t);
+		}
+
+		if (!tangent.empty())
+		{
+			memcpy(ptr, tangent[i].data, sizeof(tangent.front()));
+			ptr += sizeof(tangent.front()) / sizeof(uint32_t);
+		}
+
+		if (!uv.empty())
+		{
+			memcpy(ptr, uv[i].data, sizeof(uv.front()));
+			ptr += sizeof(uv.front()) / sizeof(uint32_t);
+		}
+	}
+
+	// Use quantized position to guide the clustering.
+	std::vector<vec3> position_buffer;
+	position_buffer.reserve(positions.size());
+	for (auto &p : positions)
+		position_buffer.push_back(decode_snorm_exp(p));
+
+	// Special meshoptimizer limit.
+	constexpr unsigned max_vertices = 255;
+	constexpr unsigned max_primitives = 256;
+	size_t num_meshlets = meshopt_buildMeshletsBound(mesh.count, max_vertices, max_primitives);
+
+	std::vector<uint32_t> out_vertex_redirection_buffer(num_meshlets * max_vertices);
+	std::vector<unsigned char> local_index_buffer(num_meshlets * max_primitives * 3);
+	std::vector<meshopt_Meshlet> meshlets(num_meshlets);
+
+	num_meshlets = meshopt_buildMeshlets(meshlets.data(),
+	                                     out_vertex_redirection_buffer.data(), local_index_buffer.data(),
+	                                     reinterpret_cast<const uint32_t *>(mesh.indices.data()), mesh.count,
+	                                     position_buffer[0].data, positions.size(), sizeof(vec3),
+	                                     max_vertices, max_primitives, 0.75f);
+
+	meshlets.resize(num_meshlets);
+
+	std::vector<Meshlet> out_meshlets;
+	std::vector<uvec3> out_index_buffer;
+
+	out_meshlets.reserve(num_meshlets);
+	for (auto &meshlet : meshlets)
+	{
+		Meshlet m = {};
+		m.offset = uint32_t(out_index_buffer.size());
+		m.count = meshlet.triangle_count;
+		out_meshlets.push_back(m);
+
+		auto *local_indices = local_index_buffer.data() + meshlet.triangle_offset;
+		for (unsigned i = 0; i < meshlet.triangle_count; i++)
+		{
+			out_index_buffer.emplace_back(
+					out_vertex_redirection_buffer[local_indices[3 * i + 0] + meshlet.vertex_offset],
+					out_vertex_redirection_buffer[local_indices[3 * i + 1] + meshlet.vertex_offset],
+					out_vertex_redirection_buffer[local_indices[3 * i + 2] + meshlet.vertex_offset]);
+		}
+	}
+
+	std::vector<meshopt_Bounds> bounds;
+	bounds.clear();
+	bounds.reserve(num_meshlets);
+	for (auto &meshlet : out_meshlets)
+	{
+		auto bound = meshopt_computeClusterBounds(
+				out_index_buffer[meshlet.offset].data, meshlet.count * 3,
+				position_buffer[0].data, positions.size(), sizeof(vec3));
+		bounds.push_back(bound);
+	}
+
+	Encoded encoded;
+	encode_mesh(encoded, out_meshlets.data(), out_meshlets.size(),
+	            out_index_buffer[0].data, out_index_buffer.size(),
+	            attributes.data(), num_u32_streams);
+	encoded.mesh.mesh_style = style;
+
+	assert(bounds.size() == encoded.mesh.meshlets.size());
+	const auto *pbounds = bounds.data();
+	for (auto &meshlet : encoded.mesh.meshlets)
+	{
+		memcpy(meshlet.bound.center, pbounds->center, sizeof(float) * 3);
+		meshlet.bound.radius = pbounds->radius;
+		memcpy(meshlet.bound.cone_axis_cutoff, pbounds->cone_axis_s8, sizeof(pbounds->cone_axis_s8));
+		meshlet.bound.cone_axis_cutoff[3] = pbounds->cone_cutoff_s8;
+		pbounds++;
+	}
+
+	return export_encoded_mesh(path, encoded);
+}
+}
+}
diff --git a/scene-export/meshlet_export.hpp b/scene-export/meshlet_export.hpp
new file mode 100644
index 000000000..7527e80c4
--- /dev/null
+++ b/scene-export/meshlet_export.hpp
@@ -0,0 +1,36 @@
+/* Copyright (c) 2017-2023 Hans-Kristian Arntzen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <string>
+#include "scene_formats.hpp"
+#include "meshlet.hpp"
+
+namespace Granite
+{
+namespace Meshlet
+{
+bool export_mesh_to_meshlet(const std::string &path, SceneFormats::Mesh mesh, Vulkan::Meshlet::MeshStyle style);
+}
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index aceb84851..6c123829e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -161,8 +161,22 @@ add_granite_offline_tool(linkage-test linkage_test.cpp)
 add_granite_offline_tool(external-objects external_objects.cpp)
 add_granite_offline_tool(performance-query performance_query.cpp)
 add_granite_offline_tool(asset-manager-test asset_manager_test.cpp)
+
+add_granite_offline_tool(meshopt-sandbox meshopt_sandbox.cpp)
+if (NOT ANDROID)
+    target_compile_definitions(meshopt-sandbox PRIVATE ASSET_DIRECTORY=\"${CMAKE_CURRENT_SOURCE_DIR}/assets\")
+endif()
+target_link_libraries(meshopt-sandbox PRIVATE granite-scene-export)
+
+add_granite_application(meshlet-viewer meshlet_viewer.cpp)
+if (NOT ANDROID)
+    target_compile_definitions(meshlet-viewer PRIVATE ASSET_DIRECTORY=\"${CMAKE_CURRENT_SOURCE_DIR}/assets\")
+endif()
+target_link_libraries(meshlet-viewer PRIVATE granite-scene-export)
+
 add_granite_application(dgc-test-graphics dgc_test_graphics.cpp)
 add_granite_application(dgc-test-compute dgc_test_compute.cpp)
+
 if (NOT ANDROID)
     target_compile_definitions(dgc-test-graphics PRIVATE ASSET_DIRECTORY=\"${CMAKE_CURRENT_SOURCE_DIR}/assets\")
     target_compile_definitions(dgc-test-compute PRIVATE ASSET_DIRECTORY=\"${CMAKE_CURRENT_SOURCE_DIR}/assets\")
diff --git a/tests/asset_manager_test.cpp b/tests/asset_manager_test.cpp
index 7f1bb3dbf..0a70e4cef 100644
--- a/tests/asset_manager_test.cpp
+++ b/tests/asset_manager_test.cpp
@@ -6,18 +6,18 @@ using namespace Granite;
 
 struct ActivationInterface final : AssetInstantiatorInterface
 {
-	uint64_t estimate_cost_image_resource(ImageAssetID, File &mapping) override
+	uint64_t estimate_cost_asset(AssetID, File
&mapping) override { return mapping.get_size(); } - void instantiate_image_resource(AssetManager &manager, TaskGroup *, ImageAssetID id, File &mapping) override + void instantiate_asset(AssetManager &manager, TaskGroup *, AssetID id, File &mapping) override { LOGI("Instantiating ID: %u\n", id.id); manager.update_cost(id, mapping.get_size()); } - void release_image_resource(ImageAssetID id) override + void release_asset(AssetID id) override { LOGI("Releasing ID: %u\n", id.id); } @@ -54,29 +54,29 @@ int main() auto d = fs.open("tmp://d"); auto e = fs.open("tmp://e"); - auto id_a = manager.register_image_resource(std::move(a), ImageClass::Zeroable); - auto id_b = manager.register_image_resource(std::move(b), ImageClass::Zeroable); - auto id_c = manager.register_image_resource(std::move(c), ImageClass::Zeroable); - auto id_d = manager.register_image_resource(std::move(d), ImageClass::Zeroable); + auto id_a = manager.register_asset(std::move(a), AssetClass::ImageZeroable); + auto id_b = manager.register_asset(std::move(b), AssetClass::ImageZeroable); + auto id_c = manager.register_asset(std::move(c), AssetClass::ImageZeroable); + auto id_d = manager.register_asset(std::move(d), AssetClass::ImageZeroable); manager.set_asset_instantiator_interface(&iface); - auto id_e = manager.register_image_resource(std::move(e), ImageClass::Zeroable); + auto id_e = manager.register_asset(std::move(e), AssetClass::ImageZeroable); - manager.set_image_budget(25); - manager.set_image_budget_per_iteration(5); + manager.set_asset_budget(25); + manager.set_asset_budget_per_iteration(5); - manager.set_image_residency_priority(id_a, 1); - manager.set_image_residency_priority(id_b, 1); - manager.set_image_residency_priority(id_c, 1); - manager.set_image_residency_priority(id_d, 1); - manager.set_image_residency_priority(id_e, 2); + manager.set_asset_residency_priority(id_a, 1); + manager.set_asset_residency_priority(id_b, 1); + manager.set_asset_residency_priority(id_c, 1); + manager.set_asset_residency_priority(id_d, 1); + manager.set_asset_residency_priority(id_e, 2); manager.iterate(nullptr); LOGI("Cost: %u\n", unsigned(manager.get_current_total_consumed())); manager.iterate(nullptr); LOGI("Cost: %u\n", unsigned(manager.get_current_total_consumed())); - manager.set_image_residency_priority(id_e, 0); + manager.set_asset_residency_priority(id_e, 0); manager.iterate(nullptr); LOGI("Cost: %u\n", unsigned(manager.get_current_total_consumed())); - manager.set_image_budget(10); + manager.set_asset_budget(10); manager.iterate(nullptr); LOGI("Cost: %u\n", unsigned(manager.get_current_total_consumed())); } \ No newline at end of file diff --git a/tests/assets/shaders/meshlet_debug.frag b/tests/assets/shaders/meshlet_debug.frag new file mode 100644 index 000000000..2f2f02484 --- /dev/null +++ b/tests/assets/shaders/meshlet_debug.frag @@ -0,0 +1,12 @@ +#version 450 + +layout(location = 0) in mediump vec3 vNormal; +layout(location = 1) in mediump vec4 vTangent; +layout(location = 2) in vec2 vUV; + +layout(location = 0) out vec4 FragColor; + +void main() +{ + FragColor = vec4(vNormal.xyz * 0.5 + 0.5, 1.0); +} diff --git a/tests/assets/shaders/meshlet_debug.mesh b/tests/assets/shaders/meshlet_debug.mesh new file mode 100644 index 000000000..5098e49aa --- /dev/null +++ b/tests/assets/shaders/meshlet_debug.mesh @@ -0,0 +1,79 @@ +#version 450 +#extension GL_EXT_mesh_shader : require + +layout(max_primitives = 256, max_vertices = 255, triangles) out; + +#include "meshlet_payload_constants.h" + +#if MESHLET_PAYLOAD_LARGE_WORKGROUP 
+#define MESHLET_PAYLOAD_WG_Y MESHLET_PAYLOAD_NUM_CHUNKS +#else +#define MESHLET_PAYLOAD_WG_Y 1 +#endif +layout(local_size_x = 32, local_size_y = MESHLET_PAYLOAD_WG_Y) in; + +layout(constant_id = 0) const uint NUM_U32_STREAMS = MESHLET_PAYLOAD_MAX_STREAMS; +#define MESHLET_PAYLOAD_NUM_U32_STREAMS NUM_U32_STREAMS + +#define MESHLET_PAYLOAD_DESCRIPTOR_SET 0 +#define MESHLET_PAYLOAD_META_BINDING 0 +#define MESHLET_PAYLOAD_STREAM_BINDING 1 +#define MESHLET_PAYLOAD_PAYLOAD_BINDING 2 +#include "meshlet_payload_decode.h" +#include "meshlet_attribute_decode.h" + +layout(location = 0) perprimitiveEXT out uint vMeshletIndex[]; +layout(location = 1) out mediump vec3 vNormal[]; +layout(location = 2) out mediump vec4 vTangent[]; +layout(location = 3) out vec2 vUV[]; + +layout(set = 1, binding = 0) uniform UBO +{ + mat4 VP; +}; + +void main() +{ + uint meshlet_index = gl_WorkGroupID.x; + MeshletMetaRuntime meta = meshlet_metas_runtime.data[meshlet_index]; + meshlet_init_workgroup(meta.stream_offset); + + SetMeshOutputsEXT(meta.num_attributes, meta.num_primitives); + +#define INDEX(index, value) \ + if (index < meta.num_primitives) \ + { \ + gl_PrimitiveTriangleIndicesEXT[index] = uvec4(unpack8(value)).xyz; \ + vMeshletIndex[index] = meshlet_index; \ + } + MESHLET_DECODE_STREAM_32(meta.stream_offset, 0, INDEX); + +#define POSITION(index, value) \ + if (index < meta.num_attributes) \ + { \ + vec3 pos = attribute_decode_snorm_exp_position(value); \ + gl_MeshVerticesEXT[index].gl_Position = VP * vec4(pos, 1.0); \ + } + MESHLET_DECODE_STREAM_64(meta.stream_offset, 1, POSITION); + +#define NORMAL(index, value) \ + if (index < meta.num_attributes) \ + { \ + vNormal[index] = attribute_decode_oct8_normal_tangent(value).xyz; \ + } + MESHLET_DECODE_STREAM_32(meta.stream_offset, 3, NORMAL); + +#define TANGENT(index, value) \ + if (index < meta.num_attributes) \ + { \ + vTangent[index] = attribute_decode_oct8_normal_tangent(value); \ + } + MESHLET_DECODE_STREAM_32(meta.stream_offset, 4, TANGENT); + +#define UV(index, value) \ + if (index < meta.num_attributes) \ + { \ + vUV[index] = attribute_decode_snorm_exp_uv(value); \ + } + MESHLET_DECODE_STREAM_64(meta.stream_offset, 5, UV); +} \ No newline at end of file diff --git a/tests/assets/shaders/meshlet_debug.mesh.frag b/tests/assets/shaders/meshlet_debug.mesh.frag new file mode 100644 index 000000000..9eb97dabb --- /dev/null +++ b/tests/assets/shaders/meshlet_debug.mesh.frag @@ -0,0 +1,25 @@ +#version 450 +#extension GL_EXT_mesh_shader : require + +layout(location = 0) perprimitiveEXT in flat uint vMeshletIndex; +layout(location = 1) in mediump vec3 vNormal; +layout(location = 2) in mediump vec4 vTangent; +layout(location = 3) in vec2 vUV; + +layout(location = 0) out vec4 FragColor; + +vec3 decode_mesh_color() +{ + uint index = vMeshletIndex * 1991u; + index ^= (index >> 5u); + uint r = bitfieldExtract(index, 0, 2); + uint g = bitfieldExtract(index, 2, 2); + uint b = bitfieldExtract(index, 4, 2); + //return (vec3(r, g, b) + 1.0 / 3.0) / 4.0; + return vec3(1.0); +} + +void main() +{ + FragColor = vec4(decode_mesh_color() * (vNormal.xyz * 0.5 + 0.5), 1.0); +} \ No newline at end of file diff --git a/tests/assets/shaders/meshlet_debug.vert b/tests/assets/shaders/meshlet_debug.vert new file mode 100644 index 000000000..e07088150 --- /dev/null +++ b/tests/assets/shaders/meshlet_debug.vert @@ -0,0 +1,23 @@ +#version 450 + +layout(location = 0) in vec3 POS; +layout(location = 1) in mediump vec3 N; +layout(location = 2) in mediump vec4 T; +layout(location = 3) in vec2 UV; + 
+layout(location = 0) out mediump vec3 vNormal; +layout(location = 1) out mediump vec4 vTangent; +layout(location = 2) out vec2 vUV; + +layout(set = 1, binding = 0) uniform UBO +{ + mat4 VP; +}; + +void main() +{ + vNormal = N; + vTangent = T; + vUV = UV; + gl_Position = VP * vec4(POS, 1.0); +} diff --git a/tests/bandlimited_pixel_test.cpp b/tests/bandlimited_pixel_test.cpp index 4e9d34f18..83fa0f36d 100644 --- a/tests/bandlimited_pixel_test.cpp +++ b/tests/bandlimited_pixel_test.cpp @@ -117,7 +117,8 @@ struct BandlimitedPixelTestApplication : Application, EventHandler { "BANDLIMITED_PIXEL_USE_TRANSCENDENTAL", 1 }, }); - auto texture = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), "assets://textures/sprite.png", ImageClass::Color); + auto texture = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), "assets://textures/sprite.png", + AssetClass::ImageColor); auto *view = cmd->get_device().get_resource_manager().get_image_view_blocking(texture); cmd->set_texture(2, 0, *view, mode == 0 ? StockSampler::NearestWrap : StockSampler::TrilinearWrap); diff --git a/tests/meshlet_viewer.cpp b/tests/meshlet_viewer.cpp new file mode 100644 index 000000000..2e19ef3f5 --- /dev/null +++ b/tests/meshlet_viewer.cpp @@ -0,0 +1,165 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+#include "application.hpp"
+#include "command_buffer.hpp"
+#include "device.hpp"
+#include "os_filesystem.hpp"
+#include "muglm/muglm_impl.hpp"
+#include "meshlet.hpp"
+#include "aabb.hpp"
+#include "event.hpp"
+#include "camera.hpp"
+#include "event_manager.hpp"
+#include <stdexcept>
+
+using namespace Granite;
+using namespace Vulkan;
+using namespace Vulkan::Meshlet;
+
+static uint32_t style_to_u32_streams(MeshStyle style)
+{
+	switch (style)
+	{
+	case MeshStyle::Wireframe:
+		return 3;
+	case MeshStyle::Untextured:
+		return 4;
+	case MeshStyle::Textured:
+		return 7;
+	case MeshStyle::Skinned:
+		return 9;
+	default:
+		return 0;
+	}
+}
+
+struct MeshletViewerApplication : Granite::Application, Granite::EventHandler
+{
+	MeshletViewerApplication(const char *path)
+	{
+		get_wsi().set_backbuffer_srgb(false);
+		mesh_id = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), path, Granite::AssetClass::Mesh);
+		EVENT_MANAGER_REGISTER_LATCH(MeshletViewerApplication, on_device_create, on_device_destroy, DeviceCreatedEvent);
+	}
+
+	AABB aabb;
+	FPSCamera camera;
+	Granite::AssetID mesh_id;
+
+	void on_device_create(const DeviceCreatedEvent &e)
+	{
+		e.get_device().get_shader_manager().add_include_directory("builtin://shaders/inc");
+	}
+
+	void on_device_destroy(const DeviceCreatedEvent &)
+	{
+	}
+
+	void render_frame(double, double) override
+	{
+		auto &wsi = get_wsi();
+		auto &device = wsi.get_device();
+		auto cmd = device.request_command_buffer();
+
+		cmd->begin_render_pass(device.get_swapchain_render_pass(SwapchainRenderPass::Depth));
+		camera.set_aspect(cmd->get_viewport().width / cmd->get_viewport().height);
+
+		cmd->set_opaque_state();
+
+		auto vp = camera.get_projection() * camera.get_view();
+		*cmd->allocate_typed_constant_data<mat4>(1, 0, 1) = vp;
+		auto draw = device.get_resource_manager().get_mesh_draw_range(mesh_id);
+
+		if (draw.count && device.get_resource_manager().get_mesh_encoding() == Vulkan::ResourceManager::MeshEncoding::Meshlet)
+		{
+			bool large_workgroup =
+					device.get_device_features().mesh_shader_properties.maxPreferredMeshWorkGroupInvocations > 32 &&
+					device.get_device_features().mesh_shader_properties.maxMeshWorkGroupInvocations >= 256;
+
+			cmd->set_program("", "assets://shaders/meshlet_debug.mesh",
+			                 "assets://shaders/meshlet_debug.mesh.frag",
+			                 {{"MESHLET_PAYLOAD_LARGE_WORKGROUP", int(large_workgroup)}});
+
+			cmd->set_storage_buffer(0, 0, *device.get_resource_manager().get_meshlet_header_buffer());
+			cmd->set_storage_buffer(0, 1, *device.get_resource_manager().get_meshlet_stream_header_buffer());
+			cmd->set_storage_buffer(0, 2, *device.get_resource_manager().get_meshlet_payload_buffer());
+
+			cmd->enable_subgroup_size_control(true, VK_SHADER_STAGE_MESH_BIT_EXT);
+			cmd->set_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT);
+			cmd->set_specialization_constant_mask(1);
+			cmd->set_specialization_constant(0, style_to_u32_streams(draw.style));
+
+			cmd->push_constants(&draw.offset, 0, sizeof(draw.offset));
+			cmd->draw_mesh_tasks(draw.count, 1, 1);
+		}
+		else if (draw.count)
+		{
+			auto *ibo = device.get_resource_manager().get_index_buffer();
+			auto *pos = device.get_resource_manager().get_position_buffer();
+			auto *attr = device.get_resource_manager().get_attribute_buffer();
+			auto *indirect = device.get_resource_manager().get_indirect_buffer();
+
+			cmd->set_program("assets://shaders/meshlet_debug.vert", "assets://shaders/meshlet_debug.frag");
+			cmd->set_index_buffer(*ibo, 0, VK_INDEX_TYPE_UINT8_EXT);
+			cmd->set_vertex_binding(0, *pos, 0, 12);
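+			// Binding 0 is tightly packed vec3 positions (12-byte stride); binding 1
+			// interleaves normal, tangent and UV into a 16-byte stride, matching the
+			// vertex attributes declared below.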
+			cmd->set_vertex_binding(1, *attr, 0, 16);
+			cmd->set_vertex_attrib(0, 0, VK_FORMAT_R32G32B32_SFLOAT, 0);
+			cmd->set_vertex_attrib(1, 1, VK_FORMAT_A2B10G10R10_SNORM_PACK32, 0);
+			cmd->set_vertex_attrib(2, 1, VK_FORMAT_A2B10G10R10_SNORM_PACK32, 4);
+			cmd->set_vertex_attrib(3, 1, VK_FORMAT_R32G32_SFLOAT, 8);
+			cmd->draw_indexed_indirect(*indirect,
+			                           draw.offset * sizeof(VkDrawIndexedIndirectCommand),
+			                           draw.count, sizeof(VkDrawIndexedIndirectCommand));
+		}
+
+		cmd->end_render_pass();
+		device.submit(cmd);
+	}
+};
+
+namespace Granite
+{
+Application *application_create(int argc, char **argv)
+{
+	GRANITE_APPLICATION_SETUP_FILESYSTEM();
+
+	if (argc != 2)
+	{
+		LOGE("Usage: meshlet-viewer path.msh1\n");
+		return nullptr;
+	}
+
+	try
+	{
+		auto *app = new MeshletViewerApplication(argv[1]);
+		return app;
+	}
+	catch (const std::exception &e)
+	{
+		LOGE("application_create() threw exception: %s\n", e.what());
+		return nullptr;
+	}
+}
+}
diff --git a/tests/meshopt_sandbox.cpp b/tests/meshopt_sandbox.cpp
new file mode 100644
index 000000000..84180e1e6
--- /dev/null
+++ b/tests/meshopt_sandbox.cpp
@@ -0,0 +1,285 @@
+#include "logging.hpp"
+#include <vector>
+#include "math.hpp"
+#include "device.hpp"
+#include "context.hpp"
+#include "muglm/muglm_impl.hpp"
+#include "gltf.hpp"
+#include "global_managers_init.hpp"
+#include "meshlet_export.hpp"
+#include "meshlet.hpp"
+#include <cassert>
+#include <cstring>
+using namespace Granite;
+using namespace Vulkan::Meshlet;
+
+static void decode_mesh_setup_buffers(
+		std::vector<uint32_t> &out_index_buffer, std::vector<uint32_t> &out_u32_stream,
+		const MeshView &mesh)
+{
+	assert(mesh.format_header->u32_stream_count > 1);
+
+	out_index_buffer.clear();
+	out_u32_stream.clear();
+	out_index_buffer.resize(mesh.total_primitives * 3);
+	out_u32_stream.resize(mesh.total_vertices * (mesh.format_header->u32_stream_count - 1));
+}
+
+static void decode_mesh(std::vector<uint32_t> &out_index_buffer, std::vector<uint32_t> &out_u32_stream,
+                        const MeshView &mesh)
+{
+	decode_mesh_setup_buffers(out_index_buffer, out_u32_stream, mesh);
+	out_index_buffer.clear();
+	const unsigned u32_stride = mesh.format_header->u32_stream_count - 1;
+
+	for (uint32_t meshlet_index = 0; meshlet_index < mesh.format_header->meshlet_count; meshlet_index++)
+	{
+		auto &meshlet = mesh.headers[meshlet_index];
+		for (unsigned stream_index = 0; stream_index < mesh.format_header->u32_stream_count; stream_index++)
+		{
+			auto &stream = mesh.streams[meshlet_index * mesh.format_header->u32_stream_count + stream_index];
+			const uint32_t *pdata = mesh.payload + stream.offset_from_base_u32;
+
+			u8vec4 deltas[MaxElements] = {};
+			const u16vec4 base_predictor = u16vec4(
+					stream.predictor[0], stream.predictor[1],
+					stream.predictor[2], stream.predictor[3]);
+			const u16vec4 linear_predictor = u16vec4(
+					stream.predictor[4], stream.predictor[5],
+					stream.predictor[6], stream.predictor[7]);
+			const u8vec4 initial_value =
+					u8vec4(u16vec2(stream.predictor[8], stream.predictor[9]).xxyy() >> u16vec4(0, 8, 0, 8));
+
+			for (unsigned chunk = 0; chunk < (MaxElements / 32); chunk++)
+			{
+				auto bits_per_u8 = (uvec4(stream.bitplane_meta[chunk]) >> uvec4(0, 4, 8, 12)) & 0xfu;
+				uvec4 bitplanes[8] = {};
+
+				for (unsigned comp = 0; comp < 4; comp++)
+				{
+					for (unsigned bit = 0; bit < bits_per_u8[comp]; bit++)
+						bitplanes[bit][comp] = *pdata++;
+
+					// Sign-extend.
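+					// The top encoded plane carries the sign; replicate it into the
+					// remaining planes so the u8 delta reads back as two's complement.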
+					unsigned bit_count = bits_per_u8[comp];
+					if (bit_count)
+						for (unsigned bit = bit_count; bit < 8; bit++)
+							bitplanes[bit][comp] = bitplanes[bit_count - 1][comp];
+				}
+
+				for (unsigned i = 0; i < 32; i++)
+				{
+					for (uint32_t bit = 0; bit < 8; bit++)
+						deltas[chunk * 32 + i] |= u8vec4(((bitplanes[bit] >> i) & 1u) << bit);
+				}
+			}
+
+			// Apply predictors.
+			deltas[0] += initial_value;
+			for (unsigned i = 0; i < MaxElements; i++)
+				deltas[i] += u8vec4((base_predictor + linear_predictor * u16vec4(i)) >> u16vec4(8));
+
+			// Resolve deltas.
+			for (unsigned i = 1; i < MaxElements; i++)
+				deltas[i] += deltas[i - 1];
+
+			if (stream_index == 0)
+			{
+				// Index decode.
+				unsigned num_primitives = meshlet.num_primitives_minus_1 + 1;
+				for (unsigned i = 0; i < num_primitives; i++)
+					for (unsigned j = 0; j < 3; j++)
+						out_index_buffer.push_back(deltas[i][j] + meshlet.base_vertex_offset);
+			}
+			else
+			{
+				// Attributes.
+				unsigned num_attributes = meshlet.num_attributes_minus_1 + 1;
+				auto *out_attr = out_u32_stream.data() + meshlet.base_vertex_offset * u32_stride + (stream_index - 1);
+				for (unsigned i = 0; i < num_attributes; i++, out_attr += u32_stride)
+					memcpy(out_attr, deltas[i].data, sizeof(*out_attr));
+			}
+		}
+	}
+}
+
+static void decode_mesh_gpu(
+		Vulkan::Device &dev,
+		std::vector<uint32_t> &out_index_buffer, std::vector<uint32_t> &out_u32_stream,
+		const MeshView &mesh)
+{
+	decode_mesh_setup_buffers(out_index_buffer, out_u32_stream, mesh);
+
+	Vulkan::BufferCreateInfo buf_info = {};
+	buf_info.domain = Vulkan::BufferDomain::LinkedDeviceHost;
+	buf_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+	buf_info.size = mesh.format_header->payload_size_words * sizeof(uint32_t);
+	auto payload_buffer = dev.create_buffer(buf_info, mesh.payload);
+
+	buf_info.size = out_index_buffer.size() * sizeof(uint32_t);
+	buf_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+	buf_info.domain = Vulkan::BufferDomain::CachedHost;
+	auto readback_decoded_index_buffer = dev.create_buffer(buf_info);
+
+	buf_info.size = out_u32_stream.size() * sizeof(uint32_t);
+	buf_info.domain = Vulkan::BufferDomain::CachedHost;
+	auto readback_decoded_u32_buffer = dev.create_buffer(buf_info);
+
+	bool has_renderdoc = Vulkan::Device::init_renderdoc_capture();
+	if (has_renderdoc)
+		dev.begin_renderdoc_capture();
+
+	auto cmd = dev.request_command_buffer();
+
+	DecodeInfo info = {};
+	info.ibo = readback_decoded_index_buffer.get();
+	info.streams[0] = readback_decoded_u32_buffer.get();
+	info.target_style = mesh.format_header->style;
+	info.payload = payload_buffer.get();
+	info.flags = DECODE_MODE_RAW_PAYLOAD;
+
+	decode_mesh(*cmd, info, mesh);
+	cmd->barrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+	             VK_PIPELINE_STAGE_HOST_BIT, VK_ACCESS_HOST_READ_BIT);
+	dev.submit(cmd);
+	dev.wait_idle();
+
+	if (has_renderdoc)
+		dev.end_renderdoc_capture();
+
+	memcpy(out_index_buffer.data(),
+	       dev.map_host_buffer(*readback_decoded_index_buffer, Vulkan::MEMORY_ACCESS_READ_BIT),
+	       out_index_buffer.size() * sizeof(uint32_t));
+
+	memcpy(out_u32_stream.data(),
+	       dev.map_host_buffer(*readback_decoded_u32_buffer, Vulkan::MEMORY_ACCESS_READ_BIT),
+	       out_u32_stream.size() * sizeof(uint32_t));
+}
+
+static bool validate_mesh_decode(const std::vector<uint32_t> &decoded_index_buffer,
+                                 const std::vector<uint32_t> &decoded_u32_stream,
+                                 const std::vector<uint32_t> &reference_index_buffer,
+                                 const std::vector<uint32_t> &reference_u32_stream, unsigned u32_stride)
+{
+	std::vector<uint32_t> decoded_output;
+	std::vector<uint32_t> reference_output;
+
+	if (decoded_index_buffer.size() != reference_index_buffer.size())
+		return false;
+
+	size_t count = decoded_index_buffer.size();
+
+	decoded_output.reserve(count * u32_stride);
+	reference_output.reserve(count * u32_stride);
+	for (size_t i = 0; i < count; i++)
+	{
+		uint32_t decoded_index = decoded_index_buffer[i];
+		decoded_output.insert(decoded_output.end(),
+		                      decoded_u32_stream.data() + decoded_index * u32_stride,
+		                      decoded_u32_stream.data() + (decoded_index + 1) * u32_stride);
+
+		uint32_t reference_index = reference_index_buffer[i];
+		reference_output.insert(reference_output.end(),
+		                        reference_u32_stream.data() + reference_index * u32_stride,
+		                        reference_u32_stream.data() + (reference_index + 1) * u32_stride);
+	}
+
+	for (size_t i = 0; i < count; i++)
+	{
+		for (unsigned j = 0; j < u32_stride; j++)
+		{
+			uint32_t decoded_value = decoded_output[i * u32_stride + j];
+			uint32_t reference_value = reference_output[i * u32_stride + j];
+			if (decoded_value != reference_value)
+			{
+				LOGI("Error in index %zu (prim %zu), word %u, expected %x, got %x.\n",
+				     i, i / 3, j, reference_value, decoded_value);
+				return false;
+			}
+		}
+	}
+
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 2)
+		return EXIT_FAILURE;
+
+	Global::init(Global::MANAGER_FEATURE_FILESYSTEM_BIT);
+	Filesystem::setup_default_filesystem(GRANITE_FILESYSTEM(), ASSET_DIRECTORY);
+
+	GLTF::Parser parser(argv[1]);
+
+	Vulkan::Context ctx;
+	Vulkan::Device dev;
+	if (!Vulkan::Context::init_loader(nullptr))
+		return EXIT_FAILURE;
+
+	Vulkan::Context::SystemHandles handles;
+	handles.filesystem = GRANITE_FILESYSTEM();
+	ctx.set_system_handles(handles);
+	if (!ctx.init_instance_and_device(nullptr, 0, nullptr, 0))
+		return EXIT_FAILURE;
+	dev.set_context(ctx);
+	dev.init_frame_contexts(4);
+
+	auto mesh = parser.get_meshes().front();
+
+	if (!Meshlet::export_mesh_to_meshlet("export.msh1",
+	                                     mesh, MeshStyle::Textured))
+	{
+		return EXIT_FAILURE;
+	}
+
+	auto file = GRANITE_FILESYSTEM()->open("export.msh1", FileMode::ReadOnly);
+	if (!file)
+		return EXIT_FAILURE;
+
+	auto mapped = file->map();
+	if (!mapped)
+		return EXIT_FAILURE;
+
+	auto view = create_mesh_view(*mapped);
+
+	std::vector<uint32_t> reference_index_buffer;
+	std::vector<uint32_t> reference_attributes;
+	std::vector<uint32_t> gpu_index_buffer;
+	std::vector<uint32_t> gpu_attributes;
+
+	decode_mesh(reference_index_buffer, reference_attributes, view);
+	decode_mesh_gpu(dev, gpu_index_buffer, gpu_attributes, view);
+
+	if (!validate_mesh_decode(gpu_index_buffer, gpu_attributes,
+	                          reference_index_buffer, reference_attributes,
+	                          view.format_header->u32_stream_count - 1))
+	{
+		return EXIT_FAILURE;
+	}
+
+	{
+		LOGI("Total primitives: %u\n", view.total_primitives);
+		LOGI("Total vertices: %u\n", view.total_vertices);
+		LOGI("Payload size: %llu bytes.\n", static_cast<unsigned long long>(view.format_header->payload_size_words * sizeof(uint32_t)));
+
+		unsigned long long uncompressed_mesh_size =
+				view.total_primitives * sizeof(uint32_t) * 3 +
+				view.total_vertices * (view.format_header->u32_stream_count - 1) * sizeof(uint32_t);
+		unsigned long long uncompressed_payload_size =
+				view.total_primitives * sizeof(uint32_t) +
+				view.total_vertices * (view.format_header->u32_stream_count - 1) * sizeof(uint32_t);
+		LOGI("Uncompressed mesh size: %llu bytes.\n", uncompressed_mesh_size);
+		LOGI("Uncompressed payload size: %llu bytes.\n", uncompressed_payload_size);
+	}
+
+	{
+		file = GRANITE_FILESYSTEM()->open("export.bin", FileMode::WriteOnly);
+		mapped = file->map_write((reference_index_buffer.size() + reference_attributes.size()) * sizeof(uint32_t));
+		auto *ptr = mapped->mutable_data<uint32_t>();
+		memcpy(ptr, reference_index_buffer.data(), reference_index_buffer.size() * sizeof(uint32_t));
+		memcpy(ptr + reference_index_buffer.size(), reference_attributes.data(), reference_attributes.size() * sizeof(uint32_t));
+	}
+
+	return 0;
+}
\ No newline at end of file
diff --git a/tests/ui_sandbox.cpp b/tests/ui_sandbox.cpp
index c2d7b4440..faddfb44a 100644
--- a/tests/ui_sandbox.cpp
+++ b/tests/ui_sandbox.cpp
@@ -50,8 +50,8 @@ UIApplication::UIApplication()
 	window->show_title_bar(false);
 	window->set_floating(false);
 	window->set_background_color(vec4(0.0f, 1.0f, 0.0f, 1.0f));
-	window->set_background_image(GRANITE_ASSET_MANAGER()->register_image_resource(
-			*GRANITE_FILESYSTEM(), "builtin://textures/checkerboard.png", ImageClass::Color));
+	window->set_background_image(GRANITE_ASSET_MANAGER()->register_asset(
+			*GRANITE_FILESYSTEM(), "builtin://textures/checkerboard.png", AssetClass::ImageColor));
 
 	auto button = make_handle();
 	window->add_child(button);
@@ -90,8 +90,8 @@ UIApplication::UIApplication()
 		slider->show_value(false);
 		slider->set_margin(5.0f);
 		slider->show_tooltip(true);
-		slider->set_background_image(GRANITE_ASSET_MANAGER()->register_image_resource(
-				*GRANITE_FILESYSTEM(), "builtin://textures/checkerboard.png", ImageClass::Color));
+		slider->set_background_image(GRANITE_ASSET_MANAGER()->register_asset(
+				*GRANITE_FILESYSTEM(), "builtin://textures/checkerboard.png", AssetClass::ImageColor));
 		slider->set_background_color(vec4(1.0f));
 	}
 
@@ -111,8 +111,8 @@ UIApplication::UIApplication()
 		sli.show_value(false);
 		sli.set_margin(5.0f);
 		sli.show_tooltip(true);
-		sli.set_background_image(GRANITE_ASSET_MANAGER()->register_image_resource(
-				*GRANITE_FILESYSTEM(), "builtin://textures/checkerboard.png", ImageClass::Color));
+		sli.set_background_image(GRANITE_ASSET_MANAGER()->register_asset(
+				*GRANITE_FILESYSTEM(), "builtin://textures/checkerboard.png", AssetClass::ImageColor));
 		sli.set_background_color(vec4(1.0f));
 	}
 
@@ -126,8 +126,8 @@ UIApplication::UIApplication()
 		btn.set_text("Mjuu");
 		btn.set_toggled_font_color(vec4(0.0f, 1.0f, 0.0f, 1.0f));
 		btn.set_untoggled_font_color(vec4(1.0f, 0.0f, 0.0f, 1.0f));
-		btn.set_background_image(GRANITE_ASSET_MANAGER()->register_image_resource(
-				*GRANITE_FILESYSTEM(), "builtin://textures/checkerboard.png", ImageClass::Color));
+		btn.set_background_image(GRANITE_ASSET_MANAGER()->register_asset(
+				*GRANITE_FILESYSTEM(), "builtin://textures/checkerboard.png", AssetClass::ImageColor));
 		btn.set_background_color(vec4(1.0f));
 	}
 }
diff --git a/third_party/meshoptimizer b/third_party/meshoptimizer
index 5baa38ef5..eb385d698 160000
--- a/third_party/meshoptimizer
+++ b/third_party/meshoptimizer
@@ -1 +1 @@
-Subproject commit 5baa38ef5cd288c6a4d1b3a69f8a168943d593cd
+Subproject commit eb385d6987d12f33a4e0284cf2ba6660c9272602
diff --git a/tools/aa_bench.cpp b/tools/aa_bench.cpp
index 21b73cc33..77df0433e 100644
--- a/tools/aa_bench.cpp
+++ b/tools/aa_bench.cpp
@@ -27,7 +27,7 @@ class AABenchApplication : public Application, public EventHandler
 	void on_swapchain_changed(const SwapchainParameterEvent &e);
 	void on_swapchain_destroyed(const SwapchainParameterEvent &e);
 
-	ImageAssetID images[2] = {};
+	AssetID images[2] = {};
 	RenderGraph graph;
 	TemporalJitter jitter;
 	RenderContext render_context;
@@ -39,8 +39,12 @@ AABenchApplication::AABenchApplication(const std::string &input0, const std::str
 	: input_path0(input0), input_path1(input1), scale(scale_)
 {
 	type = string_to_post_antialiasing_type(method);
-	images[0] = input_path0.empty() ?
ImageAssetID{} : GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), input_path0, ImageClass::Color); - images[1] = input_path1.empty() ? ImageAssetID{} : GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), input_path1, ImageClass::Color); + images[0] = input_path0.empty() ? AssetID{} : GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), + input_path0, + AssetClass::ImageColor); + images[1] = input_path1.empty() ? AssetID{} : GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), + input_path1, + AssetClass::ImageColor); EVENT_MANAGER_REGISTER_LATCH(AABenchApplication, on_swapchain_changed, on_swapchain_destroyed, SwapchainParameterEvent); EVENT_MANAGER_REGISTER_LATCH(AABenchApplication, on_device_created, on_device_destroyed, DeviceCreatedEvent); } diff --git a/tools/convert_cube_to_environment.cpp b/tools/convert_cube_to_environment.cpp index d1659cc8f..857ae3ac5 100644 --- a/tools/convert_cube_to_environment.cpp +++ b/tools/convert_cube_to_environment.cpp @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) device.set_context(context); device.init_external_swapchain({ ImageHandle(nullptr) }); - auto cube = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), args.cube, ImageClass::Color); + auto cube = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), args.cube, AssetClass::ImageColor); auto *view = device.get_resource_manager().get_image_view_blocking(cube); auto specular = convert_cube_to_ibl_specular(device, *view); auto diffuse = convert_cube_to_ibl_diffuse(device, *view); diff --git a/tools/convert_equirect_to_environment.cpp b/tools/convert_equirect_to_environment.cpp index 7b6252d37..488561597 100644 --- a/tools/convert_equirect_to_environment.cpp +++ b/tools/convert_equirect_to_environment.cpp @@ -86,7 +86,8 @@ int main(int argc, char *argv[]) device.init_external_swapchain({ ImageHandle(nullptr) }); auto &textures = device.get_resource_manager(); - auto equirect = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), args.equirect, ImageClass::Color); + auto equirect = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), args.equirect, + AssetClass::ImageColor); auto *view = textures.get_image_view_blocking(equirect); auto cube = convert_equirect_to_cube(device, *view, args.cube_scale); diff --git a/tools/texture_viewer.cpp b/tools/texture_viewer.cpp index 6cd40d83e..617049184 100644 --- a/tools/texture_viewer.cpp +++ b/tools/texture_viewer.cpp @@ -37,8 +37,8 @@ struct TextureViewerApplication : Granite::Application, Granite::EventHandler TextureViewerApplication(std::string path_) : path(std::move(path_)) { - texture = GRANITE_ASSET_MANAGER()->register_image_resource(*GRANITE_FILESYSTEM(), - path, ImageClass::Color); + texture = GRANITE_ASSET_MANAGER()->register_asset(*GRANITE_FILESYSTEM(), + path, AssetClass::ImageColor); EVENT_MANAGER_REGISTER(TextureViewerApplication, on_key_pressed, KeyboardEvent); } @@ -138,7 +138,7 @@ struct TextureViewerApplication : Granite::Application, Granite::EventHandler unsigned layer = 0; unsigned level = 0; - ImageAssetID texture; + AssetID texture; std::string path; VkComponentMapping swiz = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }; }; diff --git a/ui/image_widget.cpp b/ui/image_widget.cpp index 687bae071..67ca4a751 100644 --- a/ui/image_widget.cpp +++ b/ui/image_widget.cpp @@ -33,9 +33,9 @@ namespace UI { Image::Image(const std::string &path, vec2 target) { - texture = 
GRANITE_ASSET_MANAGER()->register_image_resource( + texture = GRANITE_ASSET_MANAGER()->register_asset( *GRANITE_FILESYSTEM(), path, - ImageClass::Color); + AssetClass::ImageColor); geometry.minimum = target; geometry.target = target; diff --git a/ui/image_widget.hpp b/ui/image_widget.hpp index 39575f316..70f2ecd95 100644 --- a/ui/image_widget.hpp +++ b/ui/image_widget.hpp @@ -45,7 +45,7 @@ class Image : public Widget private: float render(FlatRenderer &renderer, float layout, vec2 offset, vec2 size) override; void reconfigure_to_canvas(vec2 offset, vec2 size) override; - ImageAssetID texture; + AssetID texture; Vulkan::StockSampler sampler = Vulkan::StockSampler::LinearClamp; vec2 sprite_offset; diff --git a/ui/widget.hpp b/ui/widget.hpp index e13a7b3e9..5ede61344 100644 --- a/ui/widget.hpp +++ b/ui/widget.hpp @@ -129,7 +129,7 @@ class Widget : public Util::IntrusivePtrEnabled needs_redraw = true; } - void set_background_image(ImageAssetID texture) + void set_background_image(AssetID texture) { bg_image = texture; needs_redraw = true; @@ -181,7 +181,7 @@ class Widget : public Util::IntrusivePtrEnabled vec2 floating_position = vec2(0.0f); vec4 bg_color = vec4(1.0f, 1.0f, 1.0f, 0.0f); - ImageAssetID bg_image; + AssetID bg_image; bool needs_redraw = true; bool floating = false; diff --git a/util/arena_allocator.hpp b/util/arena_allocator.hpp index 7a923240d..68c2378e0 100644 --- a/util/arena_allocator.hpp +++ b/util/arena_allocator.hpp @@ -98,6 +98,13 @@ struct AllocationArena uint32_t heap_availability_mask = 0; }; +struct SuballocationResult +{ + uint32_t offset; + uint32_t size; + uint32_t mask; +}; + template class ArenaAllocator { @@ -131,11 +138,16 @@ class ArenaAllocator return sub_block_size * Util::LegionAllocator::NumSubBlocks; } - inline uint32_t get_block_alignment() const + inline uint32_t get_sub_block_size() const { return sub_block_size; } + inline uint32_t get_block_alignment() const + { + return get_sub_block_size(); + } + inline bool allocate(uint32_t size, BackingAllocation *alloc) { unsigned num_blocks = (size + sub_block_size - 1) >> sub_block_size_log2; @@ -149,7 +161,7 @@ class ArenaAllocator assert(index >= (num_blocks - 1)); auto &heap = *itr; - static_cast(this)->prepare_allocation(alloc, heap, suballocate(num_blocks, heap)); + static_cast(this)->prepare_allocation(alloc, itr, suballocate(num_blocks, heap)); unsigned new_index = heap.heap.get_longest_run() - 1; @@ -168,7 +180,6 @@ class ArenaAllocator heap_arena.heap_availability_mask &= ~(1u << index); } - alloc->heap = itr; return true; } @@ -186,9 +197,8 @@ class ArenaAllocator } // This cannot fail. 
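// A minimal sketch of the sub-block accounting used by allocate() above, under
// the assumption that Util::LegionAllocator::NumSubBlocks == 32 (consistent with
// the uint32_t occupancy masks in this file). An allocation of `size` units
// consumes ceil(size / sub_block_size) contiguous sub-blocks of one mini-heap:

#include <cassert>
#include <cstdint>

static uint32_t blocks_needed(uint32_t size, uint32_t sub_block_size_log2)
{
	uint32_t sub_block_size = 1u << sub_block_size_log2;
	return (size + sub_block_size - 1) >> sub_block_size_log2;
}

int main()
{
	// E.g. sub_block_size = 256 and a request for 1000 units -> 4 sub-blocks.
	assert(blocks_needed(1000, 8) == 4);
	return 0;
}

// A freshly grabbed heap has a free run covering all of its sub-blocks, which is
// why the suballocation below "cannot fail":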
- static_cast(this)->prepare_allocation(alloc, heap, suballocate(num_blocks, heap)); + static_cast(this)->prepare_allocation(alloc, node, suballocate(num_blocks, heap)); - alloc->heap = node; if (heap.heap.full()) { heap_arena.full_heaps.insert_front(node); @@ -254,13 +264,6 @@ class ArenaAllocator uint32_t sub_block_size = 1; uint32_t sub_block_size_log2 = 0; - struct SuballocationResult - { - uint32_t offset; - uint32_t size; - uint32_t mask; - }; - private: inline SuballocationResult suballocate(uint32_t num_blocks, MiniHeap &heap) { diff --git a/video/ffmpeg_hw_device.cpp b/video/ffmpeg_hw_device.cpp index d952cfb8f..86131f2de 100644 --- a/video/ffmpeg_hw_device.cpp +++ b/video/ffmpeg_hw_device.cpp @@ -21,6 +21,8 @@ */ #define __STDC_LIMIT_MACROS 1 +#define __STDC_CONSTANT_MACROS 1 + #include "ffmpeg_hw_device.hpp" #include "logging.hpp" #include "device.hpp" diff --git a/vulkan/CMakeLists.txt b/vulkan/CMakeLists.txt index d25606bdb..328221cac 100644 --- a/vulkan/CMakeLists.txt +++ b/vulkan/CMakeLists.txt @@ -55,6 +55,7 @@ if (GRANITE_VULKAN_SYSTEM_HANDLES) target_sources(granite-vulkan PRIVATE texture/memory_mapped_texture.cpp texture/memory_mapped_texture.hpp + mesh/meshlet.hpp mesh/meshlet.cpp texture/texture_files.cpp texture/texture_files.hpp texture/texture_decoder.cpp texture/texture_decoder.hpp) @@ -64,7 +65,8 @@ if (GRANITE_VULKAN_SYSTEM_HANDLES) target_include_directories(granite-vulkan PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/managers - ${CMAKE_CURRENT_SOURCE_DIR}/texture) + ${CMAKE_CURRENT_SOURCE_DIR}/texture + ${CMAKE_CURRENT_SOURCE_DIR}/mesh) if (GRANITE_VULKAN_SHADER_MANAGER_RUNTIME_COMPILER) target_compile_definitions(granite-vulkan PUBLIC GRANITE_VULKAN_SHADER_MANAGER_RUNTIME_COMPILER=1) diff --git a/vulkan/context.cpp b/vulkan/context.cpp index 272a8a38c..ccb0170c1 100644 --- a/vulkan/context.cpp +++ b/vulkan/context.cpp @@ -1304,6 +1304,7 @@ bool Context::create_device(VkPhysicalDevice gpu_, VkSurfaceKHR surface, ext.pageable_device_local_memory_features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT }; ext.mesh_shader_features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_EXT }; ext.shader_subgroup_extended_types_features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES }; + ext.index_type_uint8_features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT }; ext.compute_shader_derivative_features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV }; ext.device_generated_commands_features = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_FEATURES_NV }; @@ -1501,6 +1502,13 @@ bool Context::create_device(VkPhysicalDevice gpu_, VkSurfaceKHR surface, ppNext = &ext.shader_subgroup_extended_types_features.pNext; } + if (has_extension(VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME)) + { + enabled_extensions.push_back(VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME); + *ppNext = &ext.index_type_uint8_features; + ppNext = &ext.index_type_uint8_features.pNext; + } + if ((flags & CONTEXT_CREATION_ENABLE_ADVANCED_WSI_BIT) != 0 && requires_swapchain) { bool broken_present_wait = ext.driver_properties.driverID == VK_DRIVER_ID_NVIDIA_PROPRIETARY && @@ -1594,6 +1602,8 @@ bool Context::create_device(VkPhysicalDevice gpu_, VkSurfaceKHR surface, enabled_features.shaderStorageImageWriteWithoutFormat = VK_TRUE; if (pdf2.features.shaderStorageImageReadWithoutFormat) enabled_features.shaderStorageImageReadWithoutFormat = VK_TRUE; + if (pdf2.features.multiDrawIndirect) + 
enabled_features.multiDrawIndirect = VK_TRUE; if (pdf2.features.shaderSampledImageArrayDynamicIndexing) enabled_features.shaderSampledImageArrayDynamicIndexing = VK_TRUE; diff --git a/vulkan/context.hpp b/vulkan/context.hpp index df4e2b932..ab9db29ad 100644 --- a/vulkan/context.hpp +++ b/vulkan/context.hpp @@ -104,6 +104,7 @@ struct DeviceFeatures VkPhysicalDeviceBufferDeviceAddressFeaturesKHR buffer_device_address_features = {}; VkPhysicalDeviceIDProperties id_properties = {}; VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR shader_subgroup_extended_types_features = {}; + VkPhysicalDeviceIndexTypeUint8FeaturesEXT index_type_uint8_features = {}; // EXT VkPhysicalDeviceExternalMemoryHostPropertiesEXT host_memory_properties = {}; diff --git a/vulkan/managers/resource_manager.cpp b/vulkan/managers/resource_manager.cpp index 073d66ee8..febf6eb0e 100644 --- a/vulkan/managers/resource_manager.cpp +++ b/vulkan/managers/resource_manager.cpp @@ -28,12 +28,32 @@ #include "texture_decoder.hpp" #include "string_helpers.hpp" #include "thread_group.hpp" +#include "meshlet.hpp" namespace Vulkan { ResourceManager::ResourceManager(Device *device_) : device(device_) + , index_buffer_allocator(*device_, 256) + , attribute_buffer_allocator(*device_, 256) + , indirect_buffer_allocator(*device_, 1) + , mesh_header_allocator(*device_, 1) + , mesh_stream_allocator(*device_, 8) + , mesh_payload_allocator(*device_, 128) { + // Simplified style. + index_buffer_allocator.set_element_size(0, 3); // 8-bit indices. + attribute_buffer_allocator.set_soa_count(3); + attribute_buffer_allocator.set_element_size(0, sizeof(float) * 3); + attribute_buffer_allocator.set_element_size(1, sizeof(float) * 2 + sizeof(uint32_t) * 2); + attribute_buffer_allocator.set_element_size(2, sizeof(uint32_t) * 2); + indirect_buffer_allocator.set_element_size(0, sizeof(VkDrawIndexedIndirectCommand)); + + mesh_header_allocator.set_element_size(0, sizeof(Meshlet::RuntimeHeader)); + mesh_stream_allocator.set_element_size(0, sizeof(Meshlet::Stream)); + mesh_payload_allocator.set_element_size(0, sizeof(uint32_t)); + + assets.reserve(Granite::AssetID::MaxIDs); } ResourceManager::~ResourceManager() @@ -41,34 +61,61 @@ ResourceManager::~ResourceManager() // Also works as a teardown mechanism to make sure there are no async threads in flight. if (manager) manager->set_asset_instantiator_interface(nullptr); + + // Ensure resource releases go through. + latch_handles(); } void ResourceManager::set_id_bounds(uint32_t bound) { - textures.resize(bound); - views.resize(bound); + // We must avoid reallocation here to avoid a ton of extra silly locking. 
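// Why the reserve in the constructor matters: resizing within reserved capacity
// never reallocates, so `auto &asset = assets[id.id]` references taken on loader
// threads stay valid while set_id_bounds() grows the array. A minimal
// illustration of that std::vector guarantee (illustrative names, not part of
// the patch):

#include <cassert>
#include <vector>

int main()
{
	std::vector<int> assets;
	assets.reserve(1024); // analogous to assets.reserve(Granite::AssetID::MaxIDs)
	assets.resize(16);
	int *stable = &assets[7];
	assets.resize(1024);          // within reserved capacity: no reallocation...
	assert(stable == &assets[7]); // ...so outstanding references remain valid.
	return 0;
}

// The assertion below enforces the precondition that makes this safe: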
+ VK_ASSERT(bound <= Granite::AssetID::MaxIDs); + assets.resize(bound); } -void ResourceManager::set_image_class(Granite::ImageAssetID id, Granite::ImageClass image_class) +void ResourceManager::set_asset_class(Granite::AssetID id, Granite::AssetClass asset_class) { if (id) { - textures[id.id].image_class = image_class; - if (!views[id.id]) - views[id.id] = &get_fallback_image(image_class)->get_view(); + assets[id.id].asset_class = asset_class; + if (asset_class != Granite::AssetClass::Mesh) + { + std::unique_lock holder{lock}; + views.resize(assets.size()); + + if (!views[id.id]) + views[id.id] = &get_fallback_image(asset_class)->get_view(); + } } } -void ResourceManager::release_image_resource(Granite::ImageAssetID id) +void ResourceManager::release_asset(Granite::AssetID id) { if (id) - textures[id.id].image.reset(); + { + std::unique_lock holder{lock}; + VK_ASSERT(id.id < assets.size()); + auto &asset = assets[id.id]; + asset.latchable = false; + updates.push_back(id); + } } -uint64_t ResourceManager::estimate_cost_image_resource(Granite::ImageAssetID, Granite::File &file) +uint64_t ResourceManager::estimate_cost_asset(Granite::AssetID id, Granite::File &file) { - // TODO: When we get compressed BC/ASTC, this will have to change. - return file.get_size(); + if (assets[id.id].asset_class == Granite::AssetClass::Mesh) + { + // Compression factor of 2x is reasonable to assume. + if (mesh_encoding == MeshEncoding::VBOAndIBOMDI) + return file.get_size() * 2; + else + return file.get_size(); + } + else + { + // TODO: When we get compressed BC/ASTC, this will have to change. + return file.get_size(); + } } void ResourceManager::init() @@ -103,7 +150,7 @@ void ResourceManager::init() HeapBudget budget[VK_MAX_MEMORY_HEAPS] = {}; device->get_memory_budget(budget); - // Try to set aside 50% of budgetable VRAM for the texture manager. Seems reasonable. + // Try to set aside 50% of budgetable VRAM for the resource manager. Seems reasonable. VkDeviceSize size = 0; for (uint32_t i = 0; i < device->get_memory_properties().memoryHeapCount; i++) if ((device->get_memory_properties().memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) @@ -116,14 +163,22 @@ void ResourceManager::init() } LOGI("Using texture budget of %u MiB.\n", unsigned(size / (1024 * 1024))); - manager->set_image_budget(size); + manager->set_asset_budget(size); // This is somewhat arbitrary. 
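// Rough numbers for the policy above, assuming a single 8 GiB device-local heap:

#include <cstdint>
#include <cstdio>

int main()
{
	uint64_t heap_size = 8ull * 1024 * 1024 * 1024; // budgetable VRAM
	uint64_t budget = heap_size / 2;                // 50% set aside for assets
	printf("Asset budget: %llu MiB\n",
	       static_cast<unsigned long long>(budget / (1024 * 1024)));
	return 0;
}

// The per-iteration budget below is a separate knob: it bounds how many bytes of
// new assets are activated per update, so uploads trickle in rather than spike.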
- manager->set_image_budget_per_iteration(2 * 1000 * 1000); + manager->set_asset_budget_per_iteration(2 * 1000 * 1000); + } + + if (device->get_device_features().mesh_shader_features.taskShader && + device->get_device_features().mesh_shader_features.meshShader && + device->supports_subgroup_size_log2(true, 5, 5, VK_SHADER_STAGE_MESH_BIT_EXT)) + { + mesh_encoding = MeshEncoding::Meshlet; + LOGI("Opting in to meshlet path.\n"); } } -ImageHandle ResourceManager::create_gtx(const MemoryMappedTexture &mapped_file, Granite::ImageAssetID id) +ImageHandle ResourceManager::create_gtx(const MemoryMappedTexture &mapped_file, Granite::AssetID id) { if (mapped_file.empty()) return {}; @@ -133,7 +188,7 @@ ImageHandle ResourceManager::create_gtx(const MemoryMappedTexture &mapped_file, VkComponentMapping swizzle = {}; mapped_file.remap_swizzle(swizzle); - Vulkan::ImageHandle image; + ImageHandle image; if (!device->image_format_is_supported(layout.get_format(), VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT) && format_compression_type(layout.get_format()) != FormatCompressionType::Uncompressed) { @@ -189,13 +244,13 @@ ImageHandle ResourceManager::create_gtx(const MemoryMappedTexture &mapped_file, if (image) { - auto name = Util::join("ImageAssetID-", id.id); + auto name = Util::join("AssetID-", id.id); device->set_name(*image, name.c_str()); } return image; } -ImageHandle ResourceManager::create_gtx(Granite::FileMappingHandle mapping, Granite::ImageAssetID id) +ImageHandle ResourceManager::create_gtx(Granite::FileMappingHandle mapping, Granite::AssetID id) { MemoryMappedTexture mapped_file; if (!mapped_file.map_read(std::move(mapping))) @@ -207,27 +262,29 @@ ImageHandle ResourceManager::create_gtx(Granite::FileMappingHandle mapping, Gran return create_gtx(mapped_file, id); } -ImageHandle ResourceManager::create_other(const Granite::FileMapping &mapping, Granite::ImageClass image_class, - Granite::ImageAssetID id) +ImageHandle ResourceManager::create_other(const Granite::FileMapping &mapping, Granite::AssetClass asset_class, + Granite::AssetID id) { auto tex = load_texture_from_memory(mapping.data(), - mapping.get_size(), image_class == Granite::ImageClass::Color ? + mapping.get_size(), asset_class == Granite::AssetClass::ImageColor ? 
ColorSpace::sRGB : ColorSpace::Linear); return create_gtx(tex, id); } -const Vulkan::ImageView *ResourceManager::get_image_view_blocking(Granite::ImageAssetID id) +const ImageView *ResourceManager::get_image_view_blocking(Granite::AssetID id) { std::unique_lock holder{lock}; - if (id.id >= textures.size()) + if (id.id >= assets.size()) { LOGE("ID %u is out of bounds.\n", id.id); return nullptr; } - if (textures[id.id].image) - return &textures[id.id].image->get_view(); + auto &asset = assets[id.id]; + + if (asset.image) + return &asset.image->get_view(); if (!manager->iterate_blocking(*device->get_system_handles().thread_group, id)) { @@ -235,32 +292,233 @@ const Vulkan::ImageView *ResourceManager::get_image_view_blocking(Granite::Image return nullptr; } - cond.wait(holder, [this, id]() -> bool { - return bool(textures[id.id].image); + cond.wait(holder, [&asset]() -> bool { + return bool(asset.latchable); }); - return &textures[id.id].image->get_view(); + return &asset.image->get_view(); } -void ResourceManager::instantiate_image_resource(Granite::AssetManager &manager_, Granite::TaskGroup *task, - Granite::ImageAssetID id, Granite::File &file) +void ResourceManager::instantiate_asset(Granite::AssetManager &manager_, Granite::TaskGroup *task, + Granite::AssetID id, Granite::File &file) { if (task) { task->enqueue_task([this, &manager_, &file, id]() { - instantiate_image_resource(manager_, id, file); + instantiate_asset(manager_, id, file); }); } else { - instantiate_image_resource(manager_, id, file); + instantiate_asset(manager_, id, file); + } +} + +void ResourceManager::instantiate_asset(Granite::AssetManager &manager_, + Granite::AssetID id, + Granite::File &file) +{ + auto &asset = assets[id.id]; + if (asset.asset_class == Granite::AssetClass::Mesh) + instantiate_asset_mesh(manager_, id, file); + else + instantiate_asset_image(manager_, id, file); +} + +bool ResourceManager::allocate_asset_mesh(Granite::AssetID id, const Meshlet::MeshView &view) +{ + if (!view.format_header) + return false; + + std::lock_guard holder{mesh_allocator_lock}; + auto &asset = assets[id.id]; + + if (mesh_encoding == MeshEncoding::VBOAndIBOMDI) + { + if (!index_buffer_allocator.allocate(view.total_primitives, &asset.mesh.index_or_payload)) + return false; + + if (!attribute_buffer_allocator.allocate(view.total_vertices, &asset.mesh.attr_or_stream)) + { + index_buffer_allocator.free(asset.mesh.index_or_payload); + asset.mesh.index_or_payload = {}; + return false; + } + + if (!indirect_buffer_allocator.allocate(view.format_header->meshlet_count, &asset.mesh.indirect_or_header)) + { + index_buffer_allocator.free(asset.mesh.index_or_payload); + attribute_buffer_allocator.free(asset.mesh.attr_or_stream); + asset.mesh.index_or_payload = {}; + asset.mesh.attr_or_stream = {}; + return false; + } + } + else + { + if (!mesh_header_allocator.allocate(view.format_header->meshlet_count, &asset.mesh.indirect_or_header)) + return false; + + if (!mesh_stream_allocator.allocate(view.format_header->meshlet_count * view.format_header->u32_stream_count, + &asset.mesh.attr_or_stream)) + { + mesh_header_allocator.free(asset.mesh.indirect_or_header); + asset.mesh.indirect_or_header = {}; + return false; + } + + if (!mesh_payload_allocator.allocate(view.format_header->payload_size_words, &asset.mesh.index_or_payload)) + { + mesh_header_allocator.free(asset.mesh.indirect_or_header); + mesh_stream_allocator.free(asset.mesh.attr_or_stream); + asset.mesh.indirect_or_header = {}; + asset.mesh.attr_or_stream = {}; + return false; + } } + 
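// The failure paths above unwind partial allocations by hand. The same pattern
// expressed as a small scope guard, purely illustrative (no such helper exists
// in this patch):

#include <cstdio>
#include <functional>
#include <vector>

struct Unwind
{
	std::vector<std::function<void()>> undo;
	bool committed = false;
	~Unwind()
	{
		if (!committed)
			for (auto it = undo.rbegin(); it != undo.rend(); ++it)
				(*it)();
	}
};

int main()
{
	Unwind guard;
	guard.undo.push_back([] { puts("free index allocation"); });
	// If a later allocation fails before guard.committed is set, the destructor
	// frees earlier allocations in reverse order, as the code above does by hand.
	return 0;
}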
+	asset.mesh.draw = { asset.mesh.indirect_or_header.offset, view.format_header->meshlet_count };
+	return true;
+}

-void ResourceManager::instantiate_image_resource(Granite::AssetManager &manager_,
-                                                 Granite::ImageAssetID id,
-                                                 Granite::File &file)
+void ResourceManager::instantiate_asset_mesh(Granite::AssetManager &manager_,
+                                             Granite::AssetID id,
+                                             Granite::File &file)
 {
+	Granite::FileMappingHandle mapping;
+	if (file.get_size())
+		mapping = file.map();
+
+	Meshlet::MeshView view = {};
+	if (mapping)
+		view = Meshlet::create_mesh_view(*mapping);
+	bool ret = allocate_asset_mesh(id, view);
+
+	// Decode the meshlet. Later, we'll have to do a lot of device specific stuff here to select optimal
+	// processing:
+	// - Native meshlets
+	// - Encoded attributes
+	// - Decoded attributes
+	// - Optimize for multi-draw-indirect or not? (8-bit indices).
+
+	auto &asset = assets[id.id];
+
+	if (ret)
+	{
+		if (mesh_encoding == MeshEncoding::Meshlet)
+		{
+			auto cmd = device->request_command_buffer(CommandBuffer::Type::AsyncTransfer);
+
+			void *payload_data = cmd->update_buffer(*mesh_payload_allocator.get_buffer(0, 0),
+			                                        asset.mesh.index_or_payload.offset * sizeof(uint32_t),
+			                                        view.format_header->payload_size_words * sizeof(uint32_t));
+			memcpy(payload_data, view.payload, view.format_header->payload_size_words * sizeof(uint32_t));
+
+			auto *headers = static_cast<Meshlet::RuntimeHeader *>(
+					cmd->update_buffer(*mesh_header_allocator.get_buffer(0, 0),
+					                   asset.mesh.indirect_or_header.offset * sizeof(Meshlet::RuntimeHeader),
+					                   view.format_header->meshlet_count * sizeof(Meshlet::RuntimeHeader)));
+
+			for (uint32_t i = 0, n = view.format_header->meshlet_count; i < n; i++)
+			{
+				headers[i].stream_offset = asset.mesh.attr_or_stream.offset + i * view.format_header->u32_stream_count;
+				headers[i].num_attributes = view.headers[i].num_attributes_minus_1 + 1;
+				headers[i].num_primitives = view.headers[i].num_primitives_minus_1 + 1;
+			}
+
+			auto *streams = static_cast<Meshlet::Stream *>(
+					cmd->update_buffer(*mesh_stream_allocator.get_buffer(0, 0),
+					                   asset.mesh.attr_or_stream.offset * sizeof(Meshlet::Stream),
+					                   view.format_header->meshlet_count * view.format_header->u32_stream_count *
+					                   sizeof(Meshlet::Stream)));
+
+			for (uint32_t i = 0, n = view.format_header->meshlet_count * view.format_header->u32_stream_count; i < n; i++)
+			{
+				auto in_stream = view.streams[i];
+				in_stream.offset_from_base_u32 += asset.mesh.index_or_payload.offset;
+				streams[i] = in_stream;
+			}
+
+			Semaphore sem[2];
+			device->submit(cmd, nullptr, 2, sem);
+			device->add_wait_semaphore(CommandBuffer::Type::Generic, std::move(sem[0]),
+			                           VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT |
+			                           VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, false);
+			device->add_wait_semaphore(CommandBuffer::Type::AsyncGraphics, std::move(sem[1]),
+			                           VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT |
+			                           VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, false);
+		}
+		else
+		{
+			auto cmd = device->request_command_buffer(CommandBuffer::Type::AsyncCompute);
+
+			BufferCreateInfo buf = {};
+			buf.domain = BufferDomain::Host;
+			buf.size = view.format_header->payload_size_words * sizeof(uint32_t);
+			buf.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+			auto payload = device->create_buffer(buf, view.payload);
+
+			Meshlet::DecodeInfo info = {};
+			info.target_style = Meshlet::MeshStyle::Textured;
+			info.ibo = index_buffer_allocator.get_buffer(0, 0);
+
+			for (unsigned i = 0; i < 3; i++)
+				info.streams[i] = attribute_buffer_allocator.get_buffer(0, i);
+
+			info.payload = payload.get();
+			info.indirect = indirect_buffer_allocator.get_buffer(0, 0);
+
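// All meshes share the large suballocated buffers, so the decode dispatch cannot
// write at offset zero. The three push-constant offsets below rebase every write
// into this mesh's slices: meshlet_offset for the indirect draws / runtime
// headers, primitive_offset for the index buffer, and vertex_offset for the
// attribute streams, matching the Registers push-constant block declared in
// meshlet_decode.comp.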
info.push.meshlet_offset = asset.mesh.indirect_or_header.offset; + info.push.primitive_offset = asset.mesh.index_or_payload.offset; + info.push.vertex_offset = asset.mesh.attr_or_stream.offset; + + Meshlet::decode_mesh(*cmd, info, view); + + Semaphore sem[2]; + device->submit(cmd, nullptr, 2, sem); + device->add_wait_semaphore(CommandBuffer::Type::Generic, std::move(sem[0]), + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT | + VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT, false); + device->add_wait_semaphore(CommandBuffer::Type::AsyncGraphics, std::move(sem[1]), + VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT | + VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT, false); + } + } + + uint64_t cost = 0; + if (ret) + { + if (mesh_encoding == MeshEncoding::Meshlet) + { + cost += view.format_header->payload_size_words * mesh_payload_allocator.get_element_size(0); + cost += view.format_header->meshlet_count * mesh_header_allocator.get_element_size(0); + cost += view.format_header->meshlet_count * view.format_header->u32_stream_count * mesh_stream_allocator.get_element_size(0); + } + else + { + cost += view.total_primitives * index_buffer_allocator.get_element_size(0); + cost += view.total_vertices * attribute_buffer_allocator.get_element_size(0); + cost += view.total_vertices * attribute_buffer_allocator.get_element_size(1); + cost += view.total_vertices * attribute_buffer_allocator.get_element_size(2); + cost += view.format_header->meshlet_count * indirect_buffer_allocator.get_element_size(0); + } + + asset.mesh.draw.style = view.format_header->style; + } + + std::lock_guard holder{lock}; + updates.push_back(id); + manager_.update_cost(id, ret ? cost : 0); + asset.latchable = true; + cond.notify_all(); +} + +void ResourceManager::instantiate_asset_image(Granite::AssetManager &manager_, + Granite::AssetID id, + Granite::File &file) +{ + auto &asset = assets[id.id]; + ImageHandle image; if (file.get_size()) { @@ -270,36 +528,36 @@ void ResourceManager::instantiate_image_resource(Granite::AssetManager &manager_ if (MemoryMappedTexture::is_header(mapping->data(), mapping->get_size())) image = create_gtx(std::move(mapping), id); else - image = create_other(*mapping, textures[id.id].image_class, id); + image = create_other(*mapping, asset.asset_class, id); } else LOGE("Failed to map file.\n"); } - manager_.update_cost(id, image ? image->get_allocation().get_size() : 0); - // Have to signal something. if (!image) - image = get_fallback_image(textures[id.id].image_class); + image = get_fallback_image(asset.asset_class); std::lock_guard holder{lock}; updates.push_back(id); - textures[id.id].image = std::move(image); + asset.image = std::move(image); + asset.latchable = true; + manager_.update_cost(id, asset.image ? 
asset.image->get_allocation().get_size() : 0); cond.notify_all(); } -const ImageHandle &ResourceManager::get_fallback_image(Granite::ImageClass image_class) +const ImageHandle &ResourceManager::get_fallback_image(Granite::AssetClass asset_class) { - switch (image_class) + switch (asset_class) { default: - case Granite::ImageClass::Zeroable: + case Granite::AssetClass::ImageZeroable: return fallback_zero; - case Granite::ImageClass::Color: + case Granite::AssetClass::ImageColor: return fallback_color; - case Granite::ImageClass::Normal: + case Granite::AssetClass::ImageNormal: return fallback_normal; - case Granite::ImageClass::MetallicRoughness: + case Granite::AssetClass::ImageMetallicRoughness: return fallback_pbr; } } @@ -307,25 +565,265 @@ const ImageHandle &ResourceManager::get_fallback_image(Granite::ImageClass image void ResourceManager::latch_handles() { std::lock_guard holder{lock}; + + views.resize(assets.size()); + draws.resize(assets.size()); + for (auto &update : updates) { if (update.id >= views.size()) continue; + auto &asset = assets[update.id]; - const ImageView *view; - - if (textures[update.id].image) + if (asset.asset_class == Granite::AssetClass::Mesh) { - view = &textures[update.id].image->get_view(); + if (!asset.latchable) + { + { + std::lock_guard holder_alloc{mesh_allocator_lock}; + if (mesh_encoding == MeshEncoding::Meshlet) + { + mesh_payload_allocator.free(asset.mesh.index_or_payload); + mesh_stream_allocator.free(asset.mesh.attr_or_stream); + mesh_header_allocator.free(asset.mesh.indirect_or_header); + } + else + { + index_buffer_allocator.free(asset.mesh.index_or_payload); + attribute_buffer_allocator.free(asset.mesh.attr_or_stream); + indirect_buffer_allocator.free(asset.mesh.indirect_or_header); + } + } + asset.mesh = {}; + } + + draws[update.id] = asset.mesh.draw; } else { - auto &img = get_fallback_image(textures[update.id].image_class); - view = &img->get_view(); - } + const ImageView *view; + if (!asset.latchable) + asset.image.reset(); + + if (asset.image) + { + view = &asset.image->get_view(); + } + else + { + auto &img = get_fallback_image(asset.asset_class); + view = &img->get_view(); + } - views[update.id] = view; + views[update.id] = view; + } } updates.clear(); } + +const Buffer *ResourceManager::get_index_buffer() const +{ + return index_buffer_allocator.get_buffer(0, 0); +} + +const Buffer *ResourceManager::get_position_buffer() const +{ + return attribute_buffer_allocator.get_buffer(0, 0); +} + +const Buffer *ResourceManager::get_attribute_buffer() const +{ + return attribute_buffer_allocator.get_buffer(0, 1); +} + +const Buffer *ResourceManager::get_skinning_buffer() const +{ + return attribute_buffer_allocator.get_buffer(0, 2); +} + +const Buffer *ResourceManager::get_indirect_buffer() const +{ + return indirect_buffer_allocator.get_buffer(0, 0); +} + +const Buffer *ResourceManager::get_meshlet_payload_buffer() const +{ + return mesh_payload_allocator.get_buffer(0, 0); +} + +const Buffer *ResourceManager::get_meshlet_header_buffer() const +{ + return mesh_header_allocator.get_buffer(0, 0); +} + +const Buffer *ResourceManager::get_meshlet_stream_header_buffer() const +{ + return mesh_stream_allocator.get_buffer(0, 0); +} + +MeshBufferAllocator::MeshBufferAllocator(Device &device, uint32_t sub_block_size) + : global_allocator(device) +{ + for (int i = 0; i < SliceAllocatorCount - 1; i++) + allocators[i].parent = &allocators[i + 1]; + allocators[SliceAllocatorCount - 1].global_allocator = &global_allocator; + + // Basic unit of a meshlet is 256 
prims / attributes. + // Maximum element count = 32M prims. + allocators[0].set_sub_block_size(sub_block_size); + for (int i = 1; i < SliceAllocatorCount; i++) + allocators[i].set_sub_block_size(allocators[i - 1].get_sub_block_size() * (Util::LegionAllocator::NumSubBlocks / 2)); + + for (auto &alloc : allocators) + alloc.set_object_pool(&object_pool); +} + +void MeshBufferAllocator::set_soa_count(unsigned soa_count) +{ + VK_ASSERT(soa_count <= Internal::MeshGlobalAllocator::MaxSoACount); + global_allocator.soa_count = soa_count; +} + +void MeshBufferAllocator::set_element_size(unsigned soa_index, uint32_t element_size) +{ + VK_ASSERT(soa_index < global_allocator.soa_count); + global_allocator.element_size[soa_index] = element_size; +} + +uint32_t MeshBufferAllocator::get_element_size(unsigned soa_index) const +{ + VK_ASSERT(soa_index < global_allocator.soa_count); + return global_allocator.element_size[soa_index]; +} + +const Buffer *MeshBufferAllocator::get_buffer(unsigned index, unsigned soa_index) const +{ + VK_ASSERT(soa_index < global_allocator.soa_count); + index = index * global_allocator.soa_count + soa_index; + + if (index < global_allocator.global_buffers.size()) + return global_allocator.global_buffers[index].get(); + else + return nullptr; +} + +namespace Internal +{ +uint32_t MeshGlobalAllocator::allocate(uint32_t count) +{ + BufferCreateInfo info = {}; + + uint32_t target_index = UINT32_MAX; + uint32_t search_index = 0; + + for (uint32_t i = 0, n = global_buffers.size(); i < n; i += soa_count, search_index++) + { + if (!global_buffers[i]) + { + target_index = search_index; + break; + } + } + + if (target_index == UINT32_MAX) + { + if (!global_buffers.empty()) + return UINT32_MAX; + + target_index = search_index; + for (uint32_t i = 0; i < soa_count; i++) + global_buffers.emplace_back(); + } + + for (uint32_t soa_index = 0; soa_index < soa_count; soa_index++) + { + info.size = VkDeviceSize(count) * element_size[soa_index]; + info.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT; + info.domain = BufferDomain::Device; + global_buffers[target_index * soa_count + soa_index] = device.create_buffer(info); + } + + return target_index; +} + +void MeshGlobalAllocator::free(uint32_t index) +{ + index *= soa_count; + VK_ASSERT(index < global_buffers.size()); + for (uint32_t i = 0; i < soa_count; i++) + global_buffers[index + i].reset(); +} + +MeshGlobalAllocator::MeshGlobalAllocator(Device &device_) + : device(device_) +{} + +bool SliceAllocator::allocate_backing_heap(AllocatedSlice *allocation) +{ + uint32_t count = sub_block_size * Util::LegionAllocator::NumSubBlocks; + + if (parent) + { + return parent->allocate(count, allocation); + } + else if (global_allocator) + { + uint32_t index = global_allocator->allocate(count); + if (index == UINT32_MAX) + return false; + + *allocation = {}; + allocation->count = count; + allocation->buffer_index = index; + return true; + } + else + { + return false; + } +} + +void SliceAllocator::free_backing_heap(AllocatedSlice *allocation) const +{ + if (parent) + parent->free(allocation->heap, allocation->mask); + else if (global_allocator) + global_allocator->free(allocation->buffer_index); +} + +void SliceAllocator::prepare_allocation(AllocatedSlice *allocation, Util::IntrusiveList::Iterator heap, + const Util::SuballocationResult &suballoc) +{ + allocation->buffer_index = 
heap->allocation.buffer_index; + allocation->offset = heap->allocation.offset + suballoc.offset; + allocation->count = suballoc.size; + allocation->mask = suballoc.mask; + allocation->heap = heap; + allocation->alloc = this; +} +} + +bool MeshBufferAllocator::allocate(uint32_t count, Internal::AllocatedSlice *slice) +{ + for (auto &alloc : allocators) + { + uint32_t max_alloc_size = alloc.get_max_allocation_size(); + if (count <= max_alloc_size) + return alloc.allocate(count, slice); + } + + LOGE("Allocation of %u elements is too large for MeshBufferAllocator.\n", count); + return false; +} + +void MeshBufferAllocator::free(const Internal::AllocatedSlice &slice) +{ + if (slice.alloc) + slice.alloc->free(slice.heap, slice.mask); + else + global_allocator.free(slice.buffer_index); +} } diff --git a/vulkan/managers/resource_manager.hpp b/vulkan/managers/resource_manager.hpp index 5d1a7dbf7..a700cf3de 100644 --- a/vulkan/managers/resource_manager.hpp +++ b/vulkan/managers/resource_manager.hpp @@ -23,7 +23,11 @@ #pragma once #include "image.hpp" +#include "buffer.hpp" #include "asset_manager.hpp" +#include "meshlet.hpp" +#include "arena_allocator.hpp" +#include "small_vector.hpp" #include #include @@ -31,14 +35,80 @@ namespace Vulkan { class MemoryMappedTexture; -class ResourceManager : private Granite::AssetInstantiatorInterface +namespace Internal +{ +struct SliceAllocator; +struct AllocatedSlice +{ + uint32_t buffer_index = 0; + uint32_t offset = 0; + uint32_t count = 0; + uint32_t mask = 0; + + SliceAllocator *alloc = nullptr; + Util::IntrusiveList>::Iterator heap = {}; +}; + +struct MeshGlobalAllocator +{ + explicit MeshGlobalAllocator(Device &device); + uint32_t allocate(uint32_t count); + void free(uint32_t index); + + enum { MaxSoACount = 3 }; // Position, attribute, skinning. + + Device &device; + uint32_t element_size[MaxSoACount] = {}; + uint32_t soa_count = 1; + Util::SmallVector global_buffers; +}; + +struct SliceAllocator : Util::ArenaAllocator +{ + SliceAllocator *parent = nullptr; + MeshGlobalAllocator *global_allocator = nullptr; + + // Implements curious recurring template pattern calls. 
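// "Curiously recurring template pattern": Util::ArenaAllocator is parameterized
// on its derived type and invokes these hooks through a static_cast, so the
// slice/global allocator split costs no virtual dispatch. The shape, reduced to
// a sketch (illustrative names, not the real class):

#include <cstdio>

template <typename Backing>
struct ArenaBase
{
	bool allocate()
	{
		// Statically dispatches into the derived class; no vtable involved.
		return static_cast<Backing *>(this)->allocate_backing_heap();
	}
};

struct Slice : ArenaBase<Slice>
{
	bool allocate_backing_heap()
	{
		puts("grabbed a backing heap");
		return true;
	}
};

int main()
{
	Slice slice;
	return slice.allocate() ? 0 : 1;
}

// The three declarations below are exactly the hooks the base class expects: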
+ bool allocate_backing_heap(AllocatedSlice *allocation); + void free_backing_heap(AllocatedSlice *allocation) const; + void prepare_allocation(AllocatedSlice *allocation, Util::IntrusiveList::Iterator heap, + const Util::SuballocationResult &suballoc); +}; +} + +class MeshBufferAllocator +{ +public: + MeshBufferAllocator(Device &device, uint32_t sub_block_size); + bool allocate(uint32_t count, Internal::AllocatedSlice *slice); + void free(const Internal::AllocatedSlice &slice); + void set_soa_count(unsigned soa_count); + void set_element_size(unsigned soa_index, uint32_t element_size); + uint32_t get_element_size(unsigned soa_index) const; + + const Buffer *get_buffer(unsigned index, unsigned soa_index) const; + +private: + Util::ObjectPool> object_pool; + Internal::MeshGlobalAllocator global_allocator; + enum { SliceAllocatorCount = 4 }; + Internal::SliceAllocator allocators[SliceAllocatorCount]; +}; + +class ResourceManager final : private Granite::AssetInstantiatorInterface { public: explicit ResourceManager(Device *device); - ~ResourceManager(); + ~ResourceManager() override; void init(); - inline const Vulkan::ImageView *get_image_view(Granite::ImageAssetID id) const + enum class MeshEncoding + { + Meshlet, + VBOAndIBOMDI, + }; + + inline const Vulkan::ImageView *get_image_view(Granite::AssetID id) const { if (id.id < views.size()) return views[id.id]; @@ -46,43 +116,93 @@ class ResourceManager : private Granite::AssetInstantiatorInterface return nullptr; } - const Vulkan::ImageView *get_image_view_blocking(Granite::ImageAssetID id); + const Vulkan::ImageView *get_image_view_blocking(Granite::AssetID id); + + struct DrawRange + { + uint32_t offset = 0; + uint32_t count = 0; + Meshlet::MeshStyle style = Meshlet::MeshStyle::Wireframe; + }; + + inline DrawRange get_mesh_draw_range(Granite::AssetID id) const + { + if (id.id < draws.size()) + return draws[id.id]; + else + return {}; + } + + inline MeshEncoding get_mesh_encoding() const + { + return mesh_encoding; + } + + const Buffer *get_index_buffer() const; + const Buffer *get_position_buffer() const; + const Buffer *get_attribute_buffer() const; + const Buffer *get_skinning_buffer() const; + const Buffer *get_indirect_buffer() const; + + const Buffer *get_meshlet_payload_buffer() const; + const Buffer *get_meshlet_header_buffer() const; + const Buffer *get_meshlet_stream_header_buffer() const; private: Device *device; Granite::AssetManager *manager = nullptr; void latch_handles() override; - uint64_t estimate_cost_image_resource(Granite::ImageAssetID id, Granite::File &file) override; - void instantiate_image_resource(Granite::AssetManager &manager, Granite::TaskGroup *task, - Granite::ImageAssetID id, Granite::File &file) override; - void release_image_resource(Granite::ImageAssetID id) override; + uint64_t estimate_cost_asset(Granite::AssetID id, Granite::File &file) override; + void instantiate_asset(Granite::AssetManager &manager, Granite::TaskGroup *task, + Granite::AssetID id, Granite::File &file) override; + void release_asset(Granite::AssetID id) override; void set_id_bounds(uint32_t bound) override; - void set_image_class(Granite::ImageAssetID id, Granite::ImageClass image_class) override; + void set_asset_class(Granite::AssetID id, Granite::AssetClass asset_class) override; - struct Texture + struct Asset { ImageHandle image; - Granite::ImageClass image_class = Granite::ImageClass::Zeroable; + struct + { + Internal::AllocatedSlice index_or_payload, attr_or_stream, indirect_or_header; + DrawRange draw; + } mesh; + 
Granite::AssetClass asset_class = Granite::AssetClass::ImageZeroable; + bool latchable = false; }; std::mutex lock; std::condition_variable cond; - std::vector textures; + std::vector assets; std::vector views; - std::vector updates; + std::vector draws; + std::vector updates; ImageHandle fallback_color; ImageHandle fallback_normal; ImageHandle fallback_zero; ImageHandle fallback_pbr; - ImageHandle create_gtx(Granite::FileMappingHandle mapping, Granite::ImageAssetID id); - ImageHandle create_gtx(const MemoryMappedTexture &mapping, Granite::ImageAssetID id); - ImageHandle create_other(const Granite::FileMapping &mapping, Granite::ImageClass image_class, Granite::ImageAssetID id); - const ImageHandle &get_fallback_image(Granite::ImageClass image_class); + ImageHandle create_gtx(Granite::FileMappingHandle mapping, Granite::AssetID id); + ImageHandle create_gtx(const MemoryMappedTexture &mapping, Granite::AssetID id); + ImageHandle create_other(const Granite::FileMapping &mapping, Granite::AssetClass asset_class, Granite::AssetID id); + const ImageHandle &get_fallback_image(Granite::AssetClass asset_class); + + void instantiate_asset(Granite::AssetManager &manager, Granite::AssetID id, Granite::File &file); + void instantiate_asset_image(Granite::AssetManager &manager, Granite::AssetID id, Granite::File &file); + void instantiate_asset_mesh(Granite::AssetManager &manager, Granite::AssetID id, Granite::File &file); + + std::mutex mesh_allocator_lock; + MeshBufferAllocator index_buffer_allocator; + MeshBufferAllocator attribute_buffer_allocator; + MeshBufferAllocator indirect_buffer_allocator; + MeshBufferAllocator mesh_header_allocator; + MeshBufferAllocator mesh_stream_allocator; + MeshBufferAllocator mesh_payload_allocator; + MeshEncoding mesh_encoding = MeshEncoding::VBOAndIBOMDI; - void instantiate_image_resource(Granite::AssetManager &manager, Granite::ImageAssetID id, Granite::File &file); + bool allocate_asset_mesh(Granite::AssetID id, const Meshlet::MeshView &view); }; } diff --git a/vulkan/managers/shader_manager.cpp b/vulkan/managers/shader_manager.cpp index 395686f24..809569da3 100644 --- a/vulkan/managers/shader_manager.cpp +++ b/vulkan/managers/shader_manager.cpp @@ -385,19 +385,21 @@ Vulkan::Program *ShaderProgramVariant::get_program_graphics() auto *frag = stages[Util::ecast(Vulkan::ShaderStage::Fragment)]; #ifdef GRANITE_SHIPPING - if (mesh) + if (mesh && frag) { ret = device->request_program(task ? task->resolve(*device) : nullptr, mesh->resolve(*device), frag->resolve(*device), sampler_bank.get()); } - else + else if (vert && frag) { ret = device->request_program(vert->resolve(*device), frag->resolve(*device), sampler_bank.get()); } + else + return nullptr; #else auto &vert_instance = shader_instance[Util::ecast(Vulkan::ShaderStage::Vertex)]; auto &frag_instance = shader_instance[Util::ecast(Vulkan::ShaderStage::Fragment)]; @@ -413,7 +415,7 @@ Vulkan::Program *ShaderProgramVariant::get_program_graphics() // we can safely read program directly. // comp->instance will only ever be incremented in the main thread on an inotify, so this is fine. // If comp->instance changes in the interim, we are at least guaranteed to read a sensible value for program. 
- if (mesh) + if (mesh && frag) { if ((!task || (loaded_task_instance == task->instance)) && loaded_mesh_instance == mesh->instance && @@ -422,11 +424,13 @@ Vulkan::Program *ShaderProgramVariant::get_program_graphics() return program.load(std::memory_order_relaxed); } } - else + else if (vert && frag) { if (loaded_vert_instance == vert->instance && loaded_frag_instance == frag->instance) return program.load(std::memory_order_relaxed); } + else + return nullptr; instance_lock.lock_write(); diff --git a/vulkan/memory_allocator.cpp b/vulkan/memory_allocator.cpp index f938a4bf5..bc1568639 100644 --- a/vulkan/memory_allocator.cpp +++ b/vulkan/memory_allocator.cpp @@ -123,8 +123,11 @@ void DeviceAllocation::free_global(DeviceAllocator &allocator, uint32_t size_, u } } -void ClassAllocator::prepare_allocation(DeviceAllocation *alloc, MiniHeap &heap, const SuballocationResult &suballoc) +void ClassAllocator::prepare_allocation(DeviceAllocation *alloc, Util::IntrusiveList::Iterator heap_itr, + const Util::SuballocationResult &suballoc) { + auto &heap = *heap_itr; + alloc->heap = heap_itr; alloc->base = heap.allocation.base; alloc->offset = suballoc.offset + heap.allocation.offset; alloc->mask = suballoc.mask; diff --git a/vulkan/memory_allocator.hpp b/vulkan/memory_allocator.hpp index b5525617f..38b1f094a 100644 --- a/vulkan/memory_allocator.hpp +++ b/vulkan/memory_allocator.hpp @@ -196,7 +196,8 @@ class ClassAllocator : public Util::ArenaAllocator::Iterator heap_itr, + const Util::SuballocationResult &suballoc); }; class Allocator diff --git a/vulkan/mesh/meshlet.cpp b/vulkan/mesh/meshlet.cpp new file mode 100644 index 000000000..381635c8b --- /dev/null +++ b/vulkan/mesh/meshlet.cpp @@ -0,0 +1,243 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+#include "meshlet.hpp"
+#include "command_buffer.hpp"
+#include "buffer.hpp"
+#include "device.hpp"
+#include "filesystem.hpp"
+
+namespace Vulkan
+{
+namespace Meshlet
+{
+MeshView create_mesh_view(const Granite::FileMapping &mapping)
+{
+	MeshView view = {};
+
+	if (mapping.get_size() < sizeof(magic) + sizeof(FormatHeader))
+	{
+		LOGE("MESHLET1 file too small.\n");
+		return view;
+	}
+
+	auto *ptr = mapping.data();
+	auto *end_ptr = ptr + mapping.get_size();
+
+	if (memcmp(ptr, magic, sizeof(magic)) != 0)
+	{
+		LOGE("Invalid MESHLET1 magic.\n");
+		return {};
+	}
+
+	ptr += sizeof(magic);
+
+	view.format_header = reinterpret_cast<const FormatHeader *>(ptr);
+	ptr += sizeof(*view.format_header);
+
+	if (end_ptr - ptr < ptrdiff_t(view.format_header->meshlet_count * sizeof(Header)))
+		return {};
+	view.headers = reinterpret_cast<const Header *>(ptr);
+	ptr += view.format_header->meshlet_count * sizeof(Header);
+
+	if (end_ptr - ptr < ptrdiff_t(view.format_header->meshlet_count * sizeof(Bound)))
+		return {};
+	view.bounds = reinterpret_cast<const Bound *>(ptr);
+	ptr += view.format_header->meshlet_count * sizeof(Bound);
+
+	if (end_ptr - ptr < ptrdiff_t(view.format_header->meshlet_count * view.format_header->u32_stream_count * sizeof(Stream)))
+		return {};
+	view.streams = reinterpret_cast<const Stream *>(ptr);
+	ptr += view.format_header->meshlet_count * view.format_header->u32_stream_count * sizeof(Stream);
+
+	if (!view.format_header->payload_size_words)
+		return {};
+
+	if (end_ptr - ptr < ptrdiff_t(view.format_header->payload_size_words * sizeof(uint32_t)))
+		return {};
+	view.payload = reinterpret_cast<const uint32_t *>(ptr);
+
+	for (uint32_t i = 0, n = view.format_header->meshlet_count; i < n; i++)
+	{
+		view.total_primitives += view.headers[i].num_primitives_minus_1 + 1;
+		view.total_vertices += view.headers[i].num_attributes_minus_1 + 1;
+	}
+
+	return view;
+}
+
+bool decode_mesh(CommandBuffer &cmd, const DecodeInfo &info, const MeshView &view)
+{
+	// TODO: Implement LDS fallback.
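// The decode shader is specialized for a 32-wide subgroup: each meshlet chunk is
// decoded cooperatively by one 32-lane subgroup (local_size_x = 32 in
// meshlet_decode.comp), so the Wave32 requirement below is structural. The LDS
// fallback in the TODO above would presumably stage the bitplane data through
// shared memory instead of subgroup operations, allowing other subgroup sizes.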
+	if (!cmd.get_device().supports_subgroup_size_log2(true, 5, 5))
+	{
+		LOGE("Device does not support Wave32.\n");
+		return false;
+	}
+
+	if (!info.streams[0])
+	{
+		LOGE("Decode stream 0 must be set.\n");
+		return false;
+	}
+
+	if (!info.ibo)
+	{
+		LOGE("Output IBO must be set.\n");
+		return false;
+	}
+
+	cmd.push_constants(&info.push, 0, sizeof(info.push));
+
+	BufferCreateInfo buf_info = {};
+	buf_info.domain = BufferDomain::LinkedDeviceHost;
+	buf_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+
+	buf_info.size = view.format_header->meshlet_count * sizeof(*view.headers);
+	auto meshlet_meta_buffer = cmd.get_device().create_buffer(buf_info, view.headers);
+
+	buf_info.size = view.format_header->meshlet_count * view.format_header->u32_stream_count * sizeof(*view.streams);
+	auto meshlet_stream_buffer = cmd.get_device().create_buffer(buf_info, view.streams);
+
+	// For Raw mode -> offset/stride
+	// For typed mode -> index offset / vertex offset
+	struct DecodeOffset { uint32_t arg0, arg1; };
+	std::vector<DecodeOffset> decode_offsets;
+
+	cmd.set_program("builtin://shaders/decode/meshlet_decode.comp");
+	cmd.enable_subgroup_size_control(true);
+	cmd.set_subgroup_size_log2(true, 5, 5);
+
+	cmd.set_storage_buffer(0, 0, *meshlet_meta_buffer);
+	cmd.set_storage_buffer(0, 1, *meshlet_stream_buffer);
+	cmd.set_storage_buffer(0, 2, *info.payload);
+	cmd.set_storage_buffer(0, 3, *info.ibo);
+
+	cmd.set_specialization_constant_mask(0x7);
+	cmd.set_specialization_constant(0, view.format_header->u32_stream_count);
+	cmd.set_specialization_constant(2, (info.flags & DECODE_MODE_RAW_PAYLOAD) != 0);
+
+	if ((info.flags & DECODE_MODE_RAW_PAYLOAD) != 0)
+	{
+		uint32_t output_u32_streams;
+		switch (info.target_style)
+		{
+		case MeshStyle::Wireframe:
+			output_u32_streams = 2;
+			break;
+
+		case MeshStyle::Untextured:
+			output_u32_streams = 3;
+			break;
+
+		case MeshStyle::Textured:
+			output_u32_streams = 6;
+			break;
+
+		case MeshStyle::Skinned:
+			output_u32_streams = 8;
+			break;
+
+		default:
+			return false;
+		}
+
+		if (output_u32_streams + 1 > view.format_header->u32_stream_count)
+		{
+			LOGE("Trying to decode more streams than exist in payload.\n");
+			return false;
+		}
+
+		for (unsigned i = 0; i < 3; i++)
+			cmd.set_storage_buffer(0, 4 + i, *info.streams[0]);
+
+		decode_offsets.reserve(view.format_header->meshlet_count * (output_u32_streams + 1));
+		uint32_t index_count = 0;
+
+		for (uint32_t i = 0; i < view.format_header->meshlet_count; i++)
+		{
+			decode_offsets.push_back({ index_count, 0 });
+			index_count += view.headers[i].num_primitives_minus_1 + 1;
+			for (uint32_t j = 0; j < output_u32_streams; j++)
+				decode_offsets.push_back({ view.headers[i].base_vertex_offset * output_u32_streams + j, output_u32_streams });
+		}
+
+		cmd.set_specialization_constant(1, output_u32_streams + 1);
+
+		// Dummy bind for indirect_buffer.
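// The placeholder bind below exists because binding 8 (the indirect command
// buffer) is statically declared by the shader regardless of the specialization
// constants, and Vulkan expects a valid descriptor for every statically declared
// storage buffer binding; the raw-payload path never actually writes indirect
// commands, so any live buffer satisfies validation.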
+ cmd.set_storage_buffer(0, 8, *info.streams[0]); + } + else + { + for (unsigned i = 0; i < 3; i++) + cmd.set_storage_buffer(0, 4 + i, *info.streams[0]); + + switch (info.target_style) + { + case MeshStyle::Skinned: + cmd.set_storage_buffer(0, 6, *info.streams[2]); + // Fallthrough + case MeshStyle::Untextured: + case MeshStyle::Textured: + cmd.set_storage_buffer(0, 5, *info.streams[1]); + // Fallthrough + case MeshStyle::Wireframe: + cmd.set_storage_buffer(0, 4, *info.streams[0]); + break; + + default: + return false; + } + + decode_offsets.reserve(view.format_header->meshlet_count); + uint32_t index_count = 0; + for (uint32_t i = 0; i < view.format_header->meshlet_count; i++) + { + decode_offsets.push_back({ index_count, view.headers[i].base_vertex_offset }); + index_count += view.headers[i].num_primitives_minus_1 + 1; + } + cmd.set_specialization_constant(1, uint32_t(info.target_style)); + + cmd.set_storage_buffer(0, 8, *info.indirect); + } + + buf_info.domain = BufferDomain::LinkedDeviceHost; + buf_info.size = decode_offsets.size() * sizeof(DecodeOffset); + auto output_offset_strides_buffer = cmd.get_device().create_buffer(buf_info, decode_offsets.data()); + + cmd.set_storage_buffer(0, 7, *output_offset_strides_buffer); + + // TODO: Split dispatches for big chungus meshes. + // (Starts to become a problem around 8-16 million primitives per dispatch). + if (view.format_header->meshlet_count > cmd.get_device().get_gpu_properties().limits.maxComputeWorkGroupCount[0]) + { + LOGW("Exceeding workgroup limit (%u > %u).\n", view.format_header->meshlet_count, + cmd.get_device().get_gpu_properties().limits.maxComputeWorkGroupCount[0]); + } + + cmd.dispatch(view.format_header->meshlet_count, 1, 1); + cmd.set_specialization_constant_mask(0); + cmd.enable_subgroup_size_control(false); + return true; +} +} +} diff --git a/vulkan/mesh/meshlet.hpp b/vulkan/mesh/meshlet.hpp new file mode 100644 index 000000000..8a9e7c133 --- /dev/null +++ b/vulkan/mesh/meshlet.hpp @@ -0,0 +1,142 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include + +namespace Granite +{ +class FileMapping; +} + +namespace Vulkan +{ +class CommandBuffer; +class Buffer; +} + +namespace Vulkan +{ +// MESHLET1 format. 
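// On-disk layout parsed by create_mesh_view() in meshlet.cpp above, in order:
//
//   char     magic[8]   = "MESHLET1"
//   FormatHeader
//   Header   headers[meshlet_count]
//   Bound    bounds[meshlet_count]
//   Stream   streams[meshlet_count * u32_stream_count]
//   uint32_t payload[payload_size_words]
//
// Every section size derives from FormatHeader, so the whole file validates with
// pure pointer arithmetic against the mapping size.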
+namespace Meshlet +{ +static constexpr unsigned MaxU32Streams = 16; +static constexpr unsigned MaxElements = 256; +static constexpr unsigned MaxPrimitives = MaxElements; +static constexpr unsigned MaxVertices = MaxElements; + +struct Stream +{ + uint16_t predictor[4 * 2 + 2]; + uint32_t offset_from_base_u32; + uint16_t bitplane_meta[MaxElements / 32]; +}; + +struct Header +{ + uint32_t base_vertex_offset; + uint8_t num_primitives_minus_1; + uint8_t num_attributes_minus_1; + uint16_t reserved; +}; + +// For GPU use +struct RuntimeHeader +{ + uint32_t stream_offset; + uint16_t num_primitives; + uint16_t num_attributes; +}; + +struct Bound +{ + float center[3]; + float radius; + int8_t cone_axis_cutoff[4]; +}; + +enum class StreamType +{ + Primitive = 0, // R8G8B8X8_UINT + PositionE16, // RGB16_SSCALED * 2^(A16_SINT) + NormalOct8, // Octahedron encoding in RG8. + TangentOct8, // Octahedron encoding in RG8, sign bit in B8 (if not zero, +1, otherwise -1). + UV, // R16G16_SNORM * B16_SSCALED + BoneIndices, // RGBA8_UINT + BoneWeights, // RGB8_UNORM (sums to 1, A is implied). +}; + +enum class MeshStyle : uint32_t +{ + Wireframe = 0, // Primitive + Position + Untextured, // Wireframe + NormalOct8 + Textured, // Untextured + TangentOct8 + UV + Skinned // Textured + Bone* +}; + +struct FormatHeader +{ + MeshStyle style; + uint32_t u32_stream_count; + uint32_t meshlet_count; + uint32_t payload_size_words; +}; + +struct MeshView +{ + const FormatHeader *format_header; + const Header *headers; + const Bound *bounds; + const Stream *streams; + const uint32_t *payload; + uint32_t total_primitives; + uint32_t total_vertices; +}; + +static const char magic[8] = { 'M', 'E', 'S', 'H', 'L', 'E', 'T', '1' }; + +MeshView create_mesh_view(const Granite::FileMapping &mapping); + +enum DecodeModeFlagBits : uint32_t +{ + DECODE_MODE_RAW_PAYLOAD = 1 << 0, +}; +using DecodeModeFlags = uint32_t; + +struct DecodeInfo +{ + const Vulkan::Buffer *ibo, *streams[3], *indirect, *payload; + DecodeModeFlags flags; + MeshStyle target_style; + + struct + { + uint32_t primitive_offset; + uint32_t vertex_offset; + uint32_t meshlet_offset; + } push; +}; + +bool decode_mesh(Vulkan::CommandBuffer &cmd, const DecodeInfo &decode_info, const MeshView &view); +} +}
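// Putting the new pieces together: a minimal usage sketch based on the test and
// ResourceManager::instantiate_asset_mesh() in this patch. The caller is assumed
// to prefill the output buffers in DecodeInfo (ibo, streams[], indirect); only
// the payload upload and the dispatch are shown, and Wave32 support is assumed.

#include "meshlet.hpp"
#include "device.hpp"
#include "filesystem.hpp"

bool decode_msh1(Vulkan::Device &device, Granite::FileMapping &mapped,
                 Vulkan::Meshlet::DecodeInfo info)
{
	using namespace Vulkan;

	// Validates magic and section sizes; format_header stays null on failure.
	auto view = Meshlet::create_mesh_view(mapped);
	if (!view.format_header)
		return false;

	// Upload the compressed payload into a host-visible SSBO, as the
	// resource manager's non-meshlet path does.
	BufferCreateInfo buf = {};
	buf.domain = BufferDomain::Host;
	buf.size = view.format_header->payload_size_words * sizeof(uint32_t);
	buf.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
	auto payload = device.create_buffer(buf, view.payload);

	info.payload = payload.get();
	info.target_style = Meshlet::MeshStyle::Textured;

	auto cmd = device.request_command_buffer(CommandBuffer::Type::AsyncCompute);
	bool ok = Meshlet::decode_mesh(*cmd, info, view);
	device.submit(cmd);
	return ok;
}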