diff --git a/benchmark/accuracy/quantization/mpfma.cpp b/benchmark/accuracy/quantization/mpfma.cpp index 195c40bee..d666f6001 100644 --- a/benchmark/accuracy/quantization/mpfma.cpp +++ b/benchmark/accuracy/quantization/mpfma.cpp @@ -21,6 +21,7 @@ void QuantizationExperiment(unsigned nrSamples, unsigned vectorSize, double mean gaussian_random(y_data, mean, stddev); blas::vector sorted(L); blas::vector quantized_data(L), quantized_sorted(L), quantized_y(L); + blas::vector upSampledToDouble(L); quantized_y = y_data; blas::vector upSampled(L); blas::vector y(L); @@ -48,7 +49,8 @@ void QuantizationExperiment(unsigned nrSamples, unsigned vectorSize, double mean experimentalMean += sorted_avg; quantized_data = reference_data; - auto quantized_avg = double(blas::sum(quantized_data)) / L; + upSampledToDouble = quantized_data; + auto quantized_avg = double(blas::sum(upSampledToDouble)) / L; quantizedMean += quantized_avg; // dot products in AccumulationType @@ -69,11 +71,30 @@ void QuantizationExperiment(unsigned nrSamples, unsigned vectorSize, double mean << std::setw(FIELD_WIDTH) << quantized_sorted[L - 1] << "]\n"; } } - std::cout << "experimental mean : " << (experimentalMean / N) << '\n'; - std::cout << "quantized mean : " << (quantizedMean / N) << '\n'; + std::cout << "experimental mean : " << (experimentalMean / N) << '\n'; + std::cout << "quantized mean : " << (quantizedMean / N) << '\n'; - AccumulationType avg = sum(dotProduct) / N; - std::cout << "dot product mean : " << avg << '\n'; + double dot_avg = double(sum(dotProduct)) / N; + std::cout << "dot product mean : " << dot_avg << '\n'; + double dot_stddev = 0; /* accumulates squared deviations from dot_avg */ + for (auto e : dotProduct) { + dot_stddev += (double(e) - dot_avg) * (double(e) - dot_avg); + } + dot_stddev = std::sqrt(dot_stddev / double(N - 1)); /* sample standard deviation, N-1 normalization */ + std::cout << "dot product stddev : " << dot_stddev << '\n'; +} + +template +void StatisticalSampling(double mean, double stddev) { + using namespace sw::universal; + std::cout << "representation type : " << symmetry_range() << '\n'; + std::cout << 
"accumulation type : " << symmetry_range() << '\n'; + unsigned nrSamples{ 10000 }; + QuantizationExperiment(nrSamples, 50, mean, stddev); + QuantizationExperiment(nrSamples, 500, mean, stddev); + QuantizationExperiment(nrSamples, 1000, mean, stddev); + QuantizationExperiment(nrSamples, 2000, mean, stddev); + QuantizationExperiment(nrSamples, 4000, mean, stddev); } int main(int argc, char** argv) @@ -84,22 +105,12 @@ try { std::cout << std::setprecision(3); // generate a set of N vectors of length L in double as reference - using fp8 = fp8e2m5; using fp12 = cfloat<12, 5, uint16_t, true, true, false>; // accumulation type double mean{ 0.0 }, stddev{ 1.0 }; - std::cout << "representation type : " << symmetry_range() << '\n'; - std::cout << "accumulation type : " << symmetry_range() << '\n'; - unsigned nrSamples{ 10000 }; - QuantizationExperiment(nrSamples, 50, mean, stddev); - QuantizationExperiment(nrSamples, 100, mean, stddev); - QuantizationExperiment(nrSamples, 200, mean, stddev); - QuantizationExperiment(nrSamples, 400, mean, stddev); - QuantizationExperiment(nrSamples, 600, mean, stddev); - QuantizationExperiment(nrSamples, 800, mean, stddev); - QuantizationExperiment(nrSamples, 1000, mean, stddev); - QuantizationExperiment(nrSamples, 2000, mean, stddev); - QuantizationExperiment(nrSamples, 4000, mean, stddev); + StatisticalSampling(mean, stddev); + StatisticalSampling(mean, stddev); + StatisticalSampling(mean, stddev); std::cout << std::setprecision(prec); diff --git a/include/universal/number/cfloat/cfloat.hpp b/include/universal/number/cfloat/cfloat.hpp index 2e4dc36a6..1a1c5b85d 100644 --- a/include/universal/number/cfloat/cfloat.hpp +++ b/include/universal/number/cfloat/cfloat.hpp @@ -103,7 +103,7 @@ using amd24 = cfloat<24, 8, std::uint32_t, false, false, false>; // By default we enable both subnormals and supernormals // as the number of encodings is severely limited (128 vs 256 samples) using fp8e2m5 = cfloat<8, 2, std::uint8_t, true, true, false>; 
-using fp8e3m2 = cfloat<8, 3, std::uint8_t, true, true, false>; +using fp8e3m4 = cfloat<8, 3, std::uint8_t, true, true, false>; using fp8e4m3 = cfloat<8, 4, std::uint8_t, true, true, false>; using fp8e5m2 = cfloat<8, 5, std::uint8_t, true, true, false>;