GPU: Add optimized NEON path for analyzing index buffers (#613)

* Implement ARM NEON index buffer analysis * NEON: Fix initial index buffer minima/maxima * NEON: Fix vertex count comparison for index buffer analysis * GPU: Add SSE4.1 path for index buffer analysis * Fix oopsie * Fix oopsie, again
wheremyfoodat · Oct 20, 2024 · 5d28f11 · 5d28f11
1 parent af1fe13
commit 5d28f11
Show file tree

Hide file tree

Showing 3 changed files with 268 additions and 18 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -54,6 +54,7 @@ option(ENABLE_GIT_VERSIONING "Enables querying git for the emulator version" ON)
 option(BUILD_HYDRA_CORE "Build a Hydra core" OFF)
 option(BUILD_LIBRETRO_CORE "Build a Libretro core" OFF)
 option(ENABLE_RENDERDOC_API "Build with support for Renderdoc's capture API for graphics debugging" ON)
+option(DISABLE_SSE4 "Build with SSE4 instructions disabled, may reduce performance" OFF)
 
 set(OPENGL_PROFILE ${DEFAULT_OPENGL_PROFILE} CACHE STRING "OpenGL profile to use if OpenGL is enabled. Valid values are 'OpenGL' and 'OpenGLES'.")
 set_property(CACHE OPENGL_PROFILE PROPERTY STRINGS OpenGL OpenGLES)
@@ -211,6 +212,13 @@ else()
     set(HOST_ARM64 FALSE)
 endif()
 
+# Enable SSE4.1 if it's not explicitly disabled
+# Annoyingly, we can't easily do this if we're using MSVC cause there's no SSE4.1 flag, only SSE4.1
+if(NOT MSVC OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT DISABLE_SSE4 AND HOST_X64)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1")
+endif()
+
 if(ENABLE_RENDERDOC_API)
     find_package(RenderDoc 1.6.0 MODULE REQUIRED)
     add_compile_definitions(PANDA3DS_ENABLE_RENDERDOC)
@@ -318,7 +326,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
                  include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
                  include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
-                 include/align.hpp include/audio/aac_decoder.hpp
+                 include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp
 )
 
 cmrc_add_resource_library(

diff --git a/include/PICA/pica_simd.hpp b/include/PICA/pica_simd.hpp
@@ -0,0 +1,253 @@
+#pragma once
+#include <algorithm>
+#include <limits>
+#include <utility>
+
+#include "helpers.hpp"
+
+#if defined(_M_AMD64) || defined(__x86_64__)
+#define PICA_SIMD_X64
+#include <immintrin.h>
+#elif defined(_M_ARM64) || defined(__aarch64__)
+#define PICA_SIMD_ARM64
+#include <arm_neon.h>
+#endif
+
+// Optimized functions for analyzing PICA index buffers (Finding minimum and maximum index values inside them)
+namespace PICA::IndexBuffer {
+	// Non-SIMD, portable algorithm
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzePortable(u8* indexBuffer, u32 vertexCount) {
+		u16 minimumIndex = std::numeric_limits<u16>::max();
+		u16 maximumIndex = 0;
+
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		if constexpr (useShortIndices) {
+			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);
+
+			for (u32 i = 0; i < vertexCount; i++) {
+				u16 index = indexBuffer16[i];
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		} else {
+			for (u32 i = 0; i < vertexCount; i++) {
+				u16 index = u16(indexBuffer[i]);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+
+#ifdef PICA_SIMD_ARM64
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
+		// We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices
+		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
+
+		if (vertexCount < vertsPerLoop) {
+			return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+		}
+
+		u16 minimumIndex, maximumIndex;
+
+		if constexpr (useShortIndices) {
+			// 16-bit indices
+			uint16x8_t minima = vdupq_n_u16(0xffff);
+			uint16x8_t maxima = vdupq_n_u16(0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const uint16x8_t data = vld1q_u16(reinterpret_cast<u16*>(indexBuffer));
+				minima = vminq_u16(data, minima);
+				maxima = vmaxq_u16(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			// Do horizontal min/max operations to get the actual minimum and maximum from all the vertices we processed with SIMD
+			// We want to gather the actual minimum and maximum in the line bottom lane of the minima/maxima vectors
+			// uint16x4_t foldedMinima1 = vmin_u16(vget_high_u16(minima), vget_low_u16(minima));
+			// uint16x4_t foldedMaxima1 = vmax_u16(vget_high_u16(maxima), vget_low_u16(maxima));
+
+			uint16x8_t foldedMinima1 = vpminq_u16(minima, minima);
+			uint16x8_t foldedMinima2 = vpminq_u16(foldedMinima1, foldedMinima1);
+			uint16x8_t foldedMinima3 = vpminq_u16(foldedMinima2, foldedMinima2);
+
+			uint16x8_t foldedMaxima1 = vpmaxq_u16(maxima, maxima);
+			uint16x8_t foldedMaxima2 = vpmaxq_u16(foldedMaxima1, foldedMaxima1);
+			uint16x8_t foldedMaxima3 = vpmaxq_u16(foldedMaxima2, foldedMaxima2);
+
+			minimumIndex = vgetq_lane_u16(foldedMinima3, 0);
+			maximumIndex = vgetq_lane_u16(foldedMaxima3, 0);
+		} else {
+			// 8-bit indices
+			uint8x16_t minima = vdupq_n_u8(0xff);
+			uint8x16_t maxima = vdupq_n_u8(0);
+
+			while (vertexCount >= vertsPerLoop) {
+				uint8x16_t data = vld1q_u8(indexBuffer);
+				minima = vminq_u8(data, minima);
+				maxima = vmaxq_u8(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			// Do a similar horizontal min/max as in the u16 case, except now we're working uint8x16 instead of uint16x4 so we need 4 folds
+			uint8x16_t foldedMinima1 = vpminq_u8(minima, minima);
+			uint8x16_t foldedMinima2 = vpminq_u8(foldedMinima1, foldedMinima1);
+			uint8x16_t foldedMinima3 = vpminq_u8(foldedMinima2, foldedMinima2);
+			uint8x16_t foldedMinima4 = vpminq_u8(foldedMinima3, foldedMinima3);
+
+			uint8x16_t foldedMaxima1 = vpmaxq_u8(maxima, maxima);
+			uint8x16_t foldedMaxima2 = vpmaxq_u8(foldedMaxima1, foldedMaxima1);
+			uint8x16_t foldedMaxima3 = vpmaxq_u8(foldedMaxima2, foldedMaxima2);
+			uint8x16_t foldedMaxima4 = vpmaxq_u8(foldedMaxima3, foldedMaxima3);
+
+			minimumIndex = u16(vgetq_lane_u8(foldedMinima4, 0));
+			maximumIndex = u16(vgetq_lane_u8(foldedMaxima4, 0));
+		}
+
+		// If any indices could not be processed cause the buffer size is not 16-byte aligned, process them the naive way
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		while (vertexCount > 0) {
+			if constexpr (useShortIndices) {
+				u16 index = *reinterpret_cast<u16*>(indexBuffer);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+				indexBuffer += 2;
+			} else {
+				u16 index = u16(*indexBuffer++);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+
+			vertexCount -= 1;
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+#endif
+
+#if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
+		// We process 16 bytes per iteration, which is 8 vertices if we're using u16
+		// indices or 16 vertices if we're using u8 indices
+		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
+
+		if (vertexCount < vertsPerLoop) {
+			return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+		}
+
+		u16 minimumIndex, maximumIndex;
+
+		if constexpr (useShortIndices) {
+			// Calculate the horizontal minimum/maximum value across an SSE vector of 16-bit unsigned integers.
+			// Based on https://stackoverflow.com/a/22259607
+			auto horizontalMin16 = [](__m128i vector) -> u16 { return u16(_mm_cvtsi128_si32(_mm_minpos_epu16(vector))); };
+
+			auto horizontalMax16 = [](__m128i vector) -> u16 {
+				// We have an instruction to compute horizontal minimum but not maximum, so we use it.
+				// To use it, we have to subtract each value from 0xFFFF (which we do with an xor), then execute a horizontal minimum
+				__m128i flipped = _mm_xor_si128(vector, _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu));
+				u16 min = u16(_mm_cvtsi128_si32(_mm_minpos_epu16(flipped)));
+				return u16(min ^ 0xffff);
+			};
+
+			// 16-bit indices
+			// Initialize the minima vector to all FFs (So 0xFFFF for each 16-bit lane)
+			// And the maxima vector to all 0s (0 for each 16-bit lane)
+			__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
+			__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
+				minima = _mm_min_epu16(data, minima);
+				maxima = _mm_max_epu16(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			minimumIndex = u16(horizontalMin16(minima));
+			maximumIndex = u16(horizontalMax16(maxima));
+		} else {
+			// Calculate the horizontal minimum/maximum value across an SSE vector of 8-bit unsigned integers.
+			// Based on https://stackoverflow.com/a/22259607
+			auto horizontalMin8 = [](__m128i vector) -> u8 {
+				vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
+				vector = _mm_min_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_min_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_min_epu8(vector, _mm_srli_epi16(vector, 8));
+				return u8(_mm_cvtsi128_si32(vector));
+			};
+
+			auto horizontalMax8 = [](__m128i vector) -> u8 {
+				vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(3, 2, 3, 2)));
+				vector = _mm_max_epu8(vector, _mm_shuffle_epi32(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_max_epu8(vector, _mm_shufflelo_epi16(vector, _MM_SHUFFLE(1, 1, 1, 1)));
+				vector = _mm_max_epu8(vector, _mm_srli_epi16(vector, 8));
+				return u8(_mm_cvtsi128_si32(vector));
+			};
+
+			// 8-bit indices
+			// Initialize the minima vector to all FFs (So 0xFF for each 8-bit lane)
+			// And the maxima vector to all 0s (0 for each 8-bit lane)
+			__m128i minima = _mm_set_epi32(0xffffffffu, 0xffffffffu, 0xffffffffu, 0xffffffffu);
+			__m128i maxima = _mm_set_epi32(0, 0, 0, 0);
+
+			while (vertexCount >= vertsPerLoop) {
+				const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(indexBuffer));
+				minima = _mm_min_epu8(data, minima);
+				maxima = _mm_max_epu8(data, maxima);
+
+				indexBuffer += 16;
+				vertexCount -= vertsPerLoop;
+			}
+
+			minimumIndex = u16(horizontalMin8(minima));
+			maximumIndex = u16(horizontalMax8(maxima));
+		}
+
+		// If any indices could not be processed cause the buffer size
+		// is not 16-byte aligned, process them the naive way
+		// Calculate the minimum and maximum indices used in the index
+		// buffer, so we'll only upload them
+		while (vertexCount > 0) {
+			if constexpr (useShortIndices) {
+				u16 index = *reinterpret_cast<u16*>(indexBuffer);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+				indexBuffer += 2;
+			} else {
+				u16 index = u16(*indexBuffer++);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+
+			vertexCount -= 1;
+		}
+
+		return {minimumIndex, maximumIndex};
+	}
+#endif
+
+	// Analyzes a PICA index buffer to get the minimum and maximum indices in the
+	// buffer, and returns them in a pair in the form [min, max]. Takes a template
+	// parameter to decide whether the indices in the buffer are u8 or u16
+	template <bool useShortIndices>
+	std::pair<u16, u16> analyze(u8* indexBuffer, u32 vertexCount) {
+#if defined(PICA_SIMD_ARM64)
+		return analyzeNEON<useShortIndices>(indexBuffer, vertexCount);
+#elif defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+		// Annoyingly, MSVC refuses to define __SSE4_1__ even when we're building with AVX
+		return analyzeSSE4_1<useShortIndices>(indexBuffer, vertexCount);
+#else
+		return analyzePortable<useShortIndices>(indexBuffer, vertexCount);
+#endif
+	}
+}  // namespace PICA::IndexBuffer
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
@@ -1,16 +1,17 @@
 #include "PICA/draw_acceleration.hpp"
 
 #include <bit>
-#include <limits>
+#include <tuple>
 
 #include "PICA/gpu.hpp"
+#include "PICA/pica_simd.hpp"
 #include "PICA/regs.hpp"
 
 void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	accel.indexed = indexed;
 	accel.totalAttribCount = totalAttribCount;
 	accel.enabledAttributeMask = 0;
-	
+
 	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
 	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
 
@@ -27,24 +28,12 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 
 		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
 		if (accel.useShortIndices) {
-			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);
-
-			for (int i = 0; i < vertexCount; i++) {
-				u16 index = indexBuffer16[i];
-				minimumIndex = std::min(minimumIndex, index);
-				maximumIndex = std::max(maximumIndex, index);
-			}
+			std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze<true>(indexBuffer, vertexCount);
 		} else {
-			for (int i = 0; i < vertexCount; i++) {
-				u16 index = u16(indexBuffer[i]);
-				minimumIndex = std::min(minimumIndex, index);
-				maximumIndex = std::max(maximumIndex, index);
-			}
+			std::tie(accel.minimumIndex, accel.maximumIndex) = PICA::IndexBuffer::analyze<false>(indexBuffer, vertexCount);
 		}
 
 		accel.indexBuffer = indexBuffer;
-		accel.minimumIndex = minimumIndex;
-		accel.maximumIndex = maximumIndex;
 	} else {
 		accel.indexBuffer = nullptr;
 		accel.minimumIndex = regs[PICA::InternalRegs::VertexOffsetReg];
@@ -76,7 +65,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 
 		// Add it to the total vertex data size, aligned to 4 bytes.
 		accel.vertexDataSize += (bytes + 3) & ~3;
-		
+
 		// Get a pointer to the data where this loader's data is stored
 		const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size);
 		loader.data = getPointerPhys<u8>(loaderAddress);