diff --git a/applications/engineering/water.cpp b/applications/engineering/water.cpp index db46d25be..e04e0be95 100644 --- a/applications/engineering/water.cpp +++ b/applications/engineering/water.cpp @@ -16,9 +16,7 @@ #include // universal BLAS #include -#include -#include -#include +#include using std::string; using std::vector; diff --git a/applications/ode/convergence.cpp b/applications/ode/convergence.cpp index 70f4f660b..79707a981 100644 --- a/applications/ode/convergence.cpp +++ b/applications/ode/convergence.cpp @@ -10,8 +10,7 @@ #include #include #include -#include -#include +#include template Scalar my_ode_func(const Scalar& t, const Scalar& u) { diff --git a/include/universal/blas/blas.hpp b/include/universal/blas/blas.hpp index 46eb71b69..666d6dc28 100644 --- a/include/universal/blas/blas.hpp +++ b/include/universal/blas/blas.hpp @@ -83,4 +83,7 @@ constexpr uint64_t SIZE_512G = 512 * SIZE_1G; #include #include +// type specific overloads +#include + #endif // _UNIVERSAL_BLAS_LIBRARY diff --git a/include/universal/blas/matrix.hpp b/include/universal/blas/matrix.hpp index d482e67e4..c199b1c80 100644 --- a/include/universal/blas/matrix.hpp +++ b/include/universal/blas/matrix.hpp @@ -9,8 +9,6 @@ #include #include #include -#include -#include #if defined(__clang__) /* Clang/LLVM. ---------------------------------------------- */ @@ -200,7 +198,8 @@ class matrix { // selectors unsigned rows() const { return _m; } unsigned cols() const { return _n; } - std::pair size() const { return std::make_pair(_m, _n); } +// std::pair size() const { return std::make_pair(_m, _n); } + unsigned size() const { return data.size(); } // in-place transpose matrix& transpose() { @@ -275,7 +274,7 @@ inline unsigned num_rows(const matrix& A) { return A.rows(); } template inline unsigned num_cols(const matrix& A) { return A.cols(); } template -inline std::pair size(const matrix& A) { return A.size(); } +inline std::pair size(const matrix& A) { return std::make_pair(A.rows(), A.cols()); } // ostream operator: no need to declare as friend as it only uses public interfaces template @@ -357,20 +356,6 @@ vector operator*(const matrix& A, const vector& x) { return b; } -// overload for posits to use fused dot products -template -vector< posit > operator*(const matrix< posit >& A, const vector< posit >& x) { - constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements - vector< posit > b(A.rows()); - for (unsigned i = 0; i < A.rows(); ++i) { - quire q; - for (unsigned j = 0; j < A.cols(); ++j) { - q += quire_mul(A(i, j), x[j]); - } - convert(q.to_value(), b[i]); // one and only rounding step of the fused-dot product - } - return b; -} template matrix operator*(const matrix& A, const matrix& B) { @@ -410,31 +395,6 @@ matrix operator%(const matrix& A, const matrix& B) { return C; } - - -// overload for posits uses fused dot products -template -matrix< posit > operator*(const matrix< posit >& A, const matrix< posit >& B) { - constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements - if (A.cols() != B.rows()) throw matmul_incompatible_matrices(incompatible_matrices(A.rows(), A.cols(), B.rows(), B.cols(), "*").what()); - unsigned rows = A.rows(); - unsigned cols = B.cols(); - unsigned dots = A.cols(); - matrix< posit > C(rows, cols); - for (unsigned i = 0; i < rows; ++i) { - for (unsigned j = 0; j < cols; ++j) { - quire q; - for (unsigned k = 0; k < dots; ++k) { - q += quire_mul(A(i, k), B(k, j)); - } - convert(q.to_value(), C(i, j)); // one and only rounding step of the fused-dot product - } - } - return C; -} - - - // matrix equivalence tests template bool operator==(const matrix& A, const matrix& B) { @@ -453,8 +413,6 @@ bool operator==(const matrix& A, const matrix& B) { return equal; } - - template bool operator!=(const matrix& A, const matrix& B) { return !(A == B); diff --git a/include/universal/blas/modifiers/posit_fdp.hpp b/include/universal/blas/modifiers/posit_fdp.hpp new file mode 100644 index 000000000..cbae511f2 --- /dev/null +++ b/include/universal/blas/modifiers/posit_fdp.hpp @@ -0,0 +1,41 @@ +#pragma once + +namespace sw { namespace universal { namespace blas { + +// overload for posits to use fused dot products +template +vector< posit > operator*(const matrix< posit >& A, const vector< posit >& x) { + constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements + vector< posit > b(A.rows()); + for (unsigned i = 0; i < A.rows(); ++i) { + quire q; + for (unsigned j = 0; j < A.cols(); ++j) { + q += quire_mul(A(i, j), x[j]); + } + convert(q.to_value(), b[i]); // one and only rounding step of the fused-dot product + } + return b; +} + +// overload for posits uses fused dot products +template +matrix< posit > operator*(const matrix< posit >& A, const matrix< posit >& B) { + constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements + if (A.cols() != B.rows()) throw matmul_incompatible_matrices(incompatible_matrices(A.rows(), A.cols(), B.rows(), B.cols(), "*").what()); + unsigned rows = A.rows(); + unsigned cols = B.cols(); + unsigned dots = A.cols(); + matrix< posit > C(rows, cols); + for (unsigned i = 0; i < rows; ++i) { + for (unsigned j = 0; j < cols; ++j) { + quire q; + for (unsigned k = 0; k < dots; ++k) { + q += quire_mul(A(i, k), B(k, j)); + } + convert(q.to_value(), C(i, j)); // one and only rounding step of the fused-dot product + } + } + return C; +} + +}}} // namespace sw::universal::blas diff --git a/include/universal/blas/serialization/datafile.hpp b/include/universal/blas/serialization/datafile.hpp index a52bfe3e8..e93ae2a7e 100644 --- a/include/universal/blas/serialization/datafile.hpp +++ b/include/universal/blas/serialization/datafile.hpp @@ -305,16 +305,20 @@ namespace sw { namespace universal { namespace blas { using Scalar = typename CollectionType::value_type; saveTypeId(ostr); saveAggregationInfo(ostr); + int i{ 0 }; if (hex) { for (const auto& item : collection) { - ostr << to_hex(item, false, false) << ' '; + ostr << to_hex(item, false, false); + ++i; + if ((i % 10) == 0) ostr << '\n'; else ostr << ' '; } std::cout << std::endl; } else { for (const auto& item : collection) { - ostr << item << ' '; - //ostr << to_binary(item) << " : " << to_hex(item) << " : " << item << '\n'; + ostr << item; + ++i; + if ((i % 10) == 0) ostr << '\n'; else ostr << ' '; } std::cout << std::endl; } @@ -396,7 +400,7 @@ namespace sw { namespace universal { namespace blas { } template - void restoreCollection(std::istream& istr, uint32_t aggregationType, uint32_t nrElements) { + void restoreData(std::istream& istr, uint32_t aggregationType, uint32_t nrElements) { switch (aggregationType) { case UNIVERSAL_AGGREGATE_SCALAR: std::cout << "Creating a scalar\n"; @@ -416,6 +420,53 @@ namespace sw { namespace universal { namespace blas { std::cout << "unknown aggregate\n"; } } + void restoreCollection(std::istream& istr, uint32_t typeId, uint32_t nrParameters, uint32_t* parameter, uint32_t aggregationType, uint32_t nrElements) { + constexpr bool verbose = false; // turn debug tracking on/off + if constexpr (verbose) { + std::cout << "typeId : " << typeId << std::endl; + std::cout << "nr parameters : " << nrParameters << std::endl; + for (uint32_t i = 0; i < nrParameters; ++i) { + std::cout << "parameter[" << i << "] : " << parameter[i] << std::endl; + } + std::cout << "aggregationType : " << aggregationType << std::endl; + std::cout << "nr of elements : " << nrElements << std::endl; + } + switch (typeId) { + case UNIVERSAL_NATIVE_INT32_TYPE: + restoreData(istr, aggregationType, nrElements); + break; + case UNIVERSAL_NATIVE_FP32_TYPE: + restoreData(istr, aggregationType, nrElements); + break; + case UNIVERSAL_NATIVE_FP64_TYPE: + restoreData(istr, aggregationType, nrElements); + break; + case UNIVERSAL_CFLOAT_TYPE: + // TBD: this is ugly: we will need to enumerate all possible + // configurations in explicit template invocations, which will + // be hundreds of templates, and that will be a major compilation + // task. For the datafile restore() this would not have any positive ROI + // but it maybe an interesting interface for a CLI, so I am + // keeping it in here as a discovery note. + + // one idea is to serialize just a limited set of cfloats + // like just the small fp8 and fp16 configurations + using onecfloat = cfloat<16, 5, uint16_t, true, false, false>; + restoreData(istr, aggregationType, nrElements); + break; + case UNIVERSAL_LNS_TYPE: + using onelns = lns<8, 2, uint8_t>; + restoreData(istr, aggregationType, nrElements); + break; + case UNIVERSAL_DBNS_TYPE: + using onedbns = dbns<8, 3, uint8_t>; + restoreData(istr, aggregationType, nrElements); + break; + default: + std::cout << "unknown typeId : " << typeId << '\n'; + } + } + bool restore(std::istream& istr) { uint32_t magic_number; istr >> magic_number; @@ -434,11 +485,6 @@ namespace sw { namespace universal { namespace blas { for (uint32_t i = 0; i < nrParameters; ++i) { istr >> parameter[i]; } - std::cout << "typeId : " << typeId << std::endl; - std::cout << "nr parameters : " << nrParameters << std::endl; - for (uint32_t i = 0; i < nrParameters; ++i) { - std::cout << "parameter[" << i << "] : " << parameter[i] << std::endl; - } // read the mandatory comment line std::string aggregationTypeComment; std::string token; @@ -447,28 +493,10 @@ namespace sw { namespace universal { namespace blas { std::cout << "comment line : " << aggregationTypeComment << std::endl; uint32_t aggregationType, nrElements; istr >> aggregationType >> nrElements; -// std::cout << "aggregationType : " << aggregationType << std::endl; -// std::cout << "nr of elements : " << nrElements << std::endl; - switch (typeId) { - case UNIVERSAL_NATIVE_INT8_TYPE: - create(aggregationType); - break; - case UNIVERSAL_NATIVE_FP32_TYPE: - restoreCollection(istr, aggregationType, nrElements); - break; - case UNIVERSAL_NATIVE_FP64_TYPE: - restoreCollection(istr, aggregationType, nrElements); - break; - case UNIVERSAL_CFLOAT_TYPE: - // todo: is there a good way to synthesize a type from dynamic data? - using onecfloat = cfloat<16, 5, uint16_t, true, false, false>; - restoreCollection(istr, aggregationType, nrElements); - break; - default: - std::cout << "unknown typeId : " << typeId << '\n'; - } - // read the next record + restoreCollection(istr, typeId, nrParameters, parameter, aggregationType, nrElements); + + // read the typeId of the next record, or the termination token istr >> typeId; } return true; diff --git a/include/universal/internal/f2s/f2s.hpp b/include/universal/internal/f2s/f2s.hpp index 9e7a81ad3..079805ab9 100644 --- a/include/universal/internal/f2s/f2s.hpp +++ b/include/universal/internal/f2s/f2s.hpp @@ -388,7 +388,7 @@ namespace sw { std::stringstream s; s << '(' << (v.s() ? "-, " : "+, "); s << to_hex(v.f(), 0, true) << ", "; - s << v.e() << '(' << to_hex(v.e(), 16, true) << ')' << ')'; + s << v.e() << '(' << to_hex(v.e(), true, true) << ')' << ')'; return s.str(); } diff --git a/linalg/blas/minij.cpp b/linalg/blas/minij.cpp index 9cbbd5cfd..f1e38a36b 100644 --- a/linalg/blas/minij.cpp +++ b/linalg/blas/minij.cpp @@ -27,11 +27,11 @@ void MinIJMatrixTest(size_t N = 5) { auto M = minij(N); // normalize the column vectors - auto total = sum(M); + auto total = sumOfElements(M); std::cout << "Total : " << total << '\n'; - auto rowSums = sum(M, 1); + auto rowSums = sumOfElements(M, 1); std::cout << "Row sums : " << rowSums << '\n'; - auto colSums = sum(M, 2); + auto colSums = sumOfElements(M, 2); std::cout << "Col sums : " << colSums << '\n'; } diff --git a/linalg/blas/rand_spectral.cpp b/linalg/blas/rand_spectral.cpp index 98a74d80a..61ca32d49 100644 --- a/linalg/blas/rand_spectral.cpp +++ b/linalg/blas/rand_spectral.cpp @@ -40,11 +40,11 @@ try { std::cout << Qbase << '\n'; // normalize the column vectors - auto total = sum(Qbase); + auto total = sumOfElements(Qbase); // default is dim = 0 std::cout << "Total : " << total << '\n'; - auto rowSums = sum(Qbase, 1); + auto rowSums = sumOfElements(Qbase, 1); std::cout << "Row sums : " << rowSums << '\n'; - auto colSums = sum(Qbase, 2); + auto colSums = sumOfElements(Qbase, 2); std::cout << "Col sums : " << colSums << '\n'; normalize(Qbase, 2); // normalize columns so they are unit length diff --git a/linalg/data/serialization.cpp b/linalg/data/serialization.cpp index 5a83fe043..66e292c56 100644 --- a/linalg/data/serialization.cpp +++ b/linalg/data/serialization.cpp @@ -134,13 +134,28 @@ void TestVectorSerialization() { df.save(std::cout, false); } +template +void TestMatrixSerialization() { + using namespace sw::universal; + sw::universal::blas::matrix m(5,5); + gaussian_random(m, 0.0, 0.1); + blas::datafile df; + df.add(m); + df.save(std::cout, false); // decimal format + std::stringstream s; + df.save(s, false); // decimal format + df.clear(); + df.restore(s); + df.save(std::cout, false); +} + void TestSerialization() { using namespace sw::universal; // Create instances of different specialized collections sw::universal::blas::vector xfp32(5), yfp32(5); sw::universal::blas::matrix Afp32(5, 5); - sw::universal::blas::tensor Tfp32(5, 5); // TBD +// sw::universal::blas::tensor Tfp32(5, 5); // TBD sw::universal::blas::matrix dpfp32(1, 1); gaussian_random(xfp32, 0.0, 0.1); gaussian_random(yfp32, 0.0, 0.1); @@ -157,7 +172,7 @@ void TestSerialization() { // Use the base class reference to aggregate the collections blas::datafile df; - df.add(Tfp32); +// df.add(Tfp32); df.add(xfp32); df.add(yfp32); df.add(Afp32); @@ -216,12 +231,12 @@ try { // ReportNumberSystemFormats(); //TestVectorSerialization(); - TestVectorSerialization(); - return 0; + TestMatrixSerialization(); +// TestVectorSerialization(); - TestSaveTypeId(); +// TestSaveTypeId(); - TestSerialization(); + // TestSerialization(); return 0; unsigned N = 32;