rearchitecting the FDP overloads in BLAS

stillwater-sc · Aug 17, 2023 · 567744d · 567744d
1 parent e68bf50
commit 567744d
Show file tree

Hide file tree

Showing 10 changed files with 135 additions and 93 deletions.
diff --git a/applications/engineering/water.cpp b/applications/engineering/water.cpp
@@ -16,9 +16,7 @@
 #include <cmath>
 // universal BLAS
 #include <universal/number/posit/posit.hpp>
-#include <universal/blas/vector.hpp>
-#include <universal/blas/matrix.hpp>
-#include <universal/blas/solvers/lu.hpp>
+#include <universal/blas/blas.hpp>
 
 using std::string;
 using std::vector;

diff --git a/applications/ode/convergence.cpp b/applications/ode/convergence.cpp
@@ -10,8 +10,7 @@
 #include <vector>
 #include <algorithm>
 #include <universal/number/posit/posit.hpp>
-#include <universal/blas/vector.hpp>
-#include <universal/blas/linspace.hpp>
+#include <universal/blas/blas.hpp>
 
 template<typename Scalar>
 Scalar my_ode_func(const Scalar& t, const Scalar& u) {

diff --git a/include/universal/blas/blas.hpp b/include/universal/blas/blas.hpp
@@ -83,4 +83,7 @@ constexpr uint64_t SIZE_512G = 512 * SIZE_1G;
 #include <universal/blas/vmath/power.hpp>
 #include <universal/blas/vmath/trigonometry.hpp>
 
+// type specific overloads
+#include <universal/blas/modifiers/posit_fdp.hpp>
+
 #endif // _UNIVERSAL_BLAS_LIBRARY
diff --git a/include/universal/blas/matrix.hpp b/include/universal/blas/matrix.hpp
@@ -9,8 +9,6 @@
 #include <initializer_list>
 #include <map>
 #include <universal/blas/exceptions.hpp>
-#include <universal/number/posit/posit_fwd.hpp>
-#include <universal/number/cfloat/cfloat_fwd.hpp>
 
 #if defined(__clang__)
 /* Clang/LLVM. ---------------------------------------------- */
@@ -200,7 +198,8 @@ class matrix {
 	// selectors
 	unsigned rows() const { return _m; }
 	unsigned cols() const { return _n; }
-	std::pair<unsigned, unsigned> size() const { return std::make_pair(_m, _n); }
+//	std::pair<unsigned, unsigned> size() const { return std::make_pair(_m, _n); }
+	unsigned size() const { return data.size(); }
 
 	// in-place transpose
 	matrix& transpose() {
@@ -275,7 +274,7 @@ inline unsigned num_rows(const matrix<Scalar>& A) { return A.rows(); }
 template<typename Scalar>
 inline unsigned num_cols(const matrix<Scalar>& A) { return A.cols(); }
 template<typename Scalar>
-inline std::pair<unsigned, unsigned> size(const matrix<Scalar>& A) { return A.size(); }
+inline std::pair<unsigned, unsigned> size(const matrix<Scalar>& A) { return std::make_pair(A.rows(), A.cols()); }
 
 // ostream operator: no need to declare as friend as it only uses public interfaces
 template<typename Scalar>
@@ -357,20 +356,6 @@ vector<Scalar> operator*(const matrix<Scalar>& A, const vector<Scalar>& x) {
 	return b;
 }
 
-// overload for posits to use fused dot products
-template<unsigned nbits, unsigned es>
-vector< posit<nbits, es> > operator*(const matrix< posit<nbits, es> >& A, const vector< posit<nbits, es> >& x) {
-	constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements
-	vector< posit<nbits, es> > b(A.rows());
-	for (unsigned i = 0; i < A.rows(); ++i) {
-		quire<nbits, es, capacity> q;
-		for (unsigned j = 0; j < A.cols(); ++j) {
-			q += quire_mul(A(i, j), x[j]);
-		}
-		convert(q.to_value(), b[i]); // one and only rounding step of the fused-dot product
-	}
-	return b;
-}
 
 template<typename Scalar>
 matrix<Scalar> operator*(const matrix<Scalar>& A, const matrix<Scalar>& B) {
@@ -410,31 +395,6 @@ matrix<Scalar> operator%(const matrix<Scalar>& A, const matrix<Scalar>& B) {
 	return C;
 }
 
-
-
-// overload for posits uses fused dot products
-template<unsigned nbits, unsigned es>
-matrix< posit<nbits, es> > operator*(const matrix< posit<nbits, es> >& A, const matrix< posit<nbits, es> >& B) {
-	constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements
-	if (A.cols() != B.rows()) throw matmul_incompatible_matrices(incompatible_matrices(A.rows(), A.cols(), B.rows(), B.cols(), "*").what());
-	unsigned rows = A.rows();
-	unsigned cols = B.cols();
-	unsigned dots = A.cols();
-	matrix< posit<nbits, es> > C(rows, cols);
-	for (unsigned i = 0; i < rows; ++i) {
-		for (unsigned j = 0; j < cols; ++j) {
-			quire<nbits, es, capacity> q;
-			for (unsigned k = 0; k < dots; ++k) {
-				q += quire_mul(A(i, k), B(k, j));
-			}
-			convert(q.to_value(), C(i, j)); // one and only rounding step of the fused-dot product
-		}
-	}
-	return C;
-}
-
-
-
 // matrix equivalence tests
 template<typename Scalar>
 bool operator==(const matrix<Scalar>& A, const matrix<Scalar>& B) {
@@ -453,8 +413,6 @@ bool operator==(const matrix<Scalar>& A, const matrix<Scalar>& B) {
 	return equal;
 }
 
-
-
 template<typename Scalar>
 bool operator!=(const matrix<Scalar>& A, const matrix<Scalar>& B) {
 	return !(A == B);

diff --git a/include/universal/blas/modifiers/posit_fdp.hpp b/include/universal/blas/modifiers/posit_fdp.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+namespace sw { namespace universal { namespace blas {
+
+// overload for posits to use fused dot products
+template<unsigned nbits, unsigned es>
+vector< posit<nbits, es> > operator*(const matrix< posit<nbits, es> >& A, const vector< posit<nbits, es> >& x) {
+	constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements
+	vector< posit<nbits, es> > b(A.rows());
+	for (unsigned i = 0; i < A.rows(); ++i) {
+		quire<nbits, es, capacity> q;
+		for (unsigned j = 0; j < A.cols(); ++j) {
+			q += quire_mul(A(i, j), x[j]);
+		}
+		convert(q.to_value(), b[i]); // one and only rounding step of the fused-dot product
+	}
+	return b;
+}
+
+// overload for posits uses fused dot products
+template<unsigned nbits, unsigned es>
+matrix< posit<nbits, es> > operator*(const matrix< posit<nbits, es> >& A, const matrix< posit<nbits, es> >& B) {
+	constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements
+	if (A.cols() != B.rows()) throw matmul_incompatible_matrices(incompatible_matrices(A.rows(), A.cols(), B.rows(), B.cols(), "*").what());
+	unsigned rows = A.rows();
+	unsigned cols = B.cols();
+	unsigned dots = A.cols();
+	matrix< posit<nbits, es> > C(rows, cols);
+	for (unsigned i = 0; i < rows; ++i) {
+		for (unsigned j = 0; j < cols; ++j) {
+			quire<nbits, es, capacity> q;
+			for (unsigned k = 0; k < dots; ++k) {
+				q += quire_mul(A(i, k), B(k, j));
+			}
+			convert(q.to_value(), C(i, j)); // one and only rounding step of the fused-dot product
+		}
+	}
+	return C;
+}
+
+}}} // namespace sw::universal::blas
diff --git a/include/universal/blas/serialization/datafile.hpp b/include/universal/blas/serialization/datafile.hpp
@@ -305,16 +305,20 @@ namespace sw { namespace universal { namespace blas {
             using Scalar = typename CollectionType::value_type;
             saveTypeId<Scalar>(ostr);
             saveAggregationInfo(ostr);
+            int i{ 0 };
             if (hex) {
                 for (const auto& item : collection) {
-                    ostr << to_hex(item, false, false) << ' ';
+                    ostr << to_hex(item, false, false);
+                    ++i;
+                    if ((i % 10) == 0) ostr << '\n'; else ostr << ' ';
                 }
                 std::cout << std::endl;
             }
             else {
                 for (const auto& item : collection) {
-                    ostr << item << ' ';
-                    //ostr << to_binary(item) << " : " << to_hex(item) << " : " << item << '\n';
+                    ostr << item;
+                    ++i;
+                    if ((i % 10) == 0) ostr << '\n'; else ostr << ' ';
                 }
                 std::cout << std::endl;
             }
@@ -396,7 +400,7 @@ namespace sw { namespace universal { namespace blas {
         }
 
         template<typename Scalar>
-        void restoreCollection(std::istream& istr, uint32_t aggregationType, uint32_t nrElements) {
+        void restoreData(std::istream& istr, uint32_t aggregationType, uint32_t nrElements) {
             switch (aggregationType) {
             case UNIVERSAL_AGGREGATE_SCALAR:
                 std::cout << "Creating a scalar\n";
@@ -416,6 +420,53 @@ namespace sw { namespace universal { namespace blas {
                 std::cout << "unknown aggregate\n";
             }
         }
+        void restoreCollection(std::istream& istr, uint32_t typeId, uint32_t nrParameters, uint32_t* parameter, uint32_t aggregationType, uint32_t nrElements) {
+            constexpr bool verbose = false; // turn debug tracking on/off
+            if constexpr (verbose) {
+                std::cout << "typeId        : " << typeId << std::endl;
+                std::cout << "nr parameters : " << nrParameters << std::endl;
+                for (uint32_t i = 0; i < nrParameters; ++i) {
+                    std::cout << "parameter[" << i << "] : " << parameter[i] << std::endl;
+                }
+                std::cout << "aggregationType : " << aggregationType << std::endl;
+                std::cout << "nr of elements  : " << nrElements << std::endl;
+            }
+            switch (typeId) {
+            case UNIVERSAL_NATIVE_INT32_TYPE:
+                restoreData<int32_t>(istr, aggregationType, nrElements);
+                break;
+            case UNIVERSAL_NATIVE_FP32_TYPE:
+                restoreData<float>(istr, aggregationType, nrElements);
+                break;
+            case UNIVERSAL_NATIVE_FP64_TYPE:
+                restoreData<double>(istr, aggregationType, nrElements);
+                break;
+            case UNIVERSAL_CFLOAT_TYPE:
+                // TBD: this is ugly: we will need to enumerate all possible
+                // configurations in explicit template invocations, which will
+                // be hundreds of templates, and that will be a major compilation
+                // task. For the datafile restore() this would not have any positive ROI
+                // but it maybe an interesting interface for a CLI, so I am 
+                // keeping it in here as a discovery note.
+
+                // one idea is to serialize just a limited set of cfloats
+                // like just the small fp8 and fp16 configurations
+                using onecfloat = cfloat<16, 5, uint16_t, true, false, false>;
+                restoreData<onecfloat>(istr, aggregationType, nrElements);
+                break;
+            case UNIVERSAL_LNS_TYPE:
+                using onelns = lns<8, 2, uint8_t>;
+                restoreData<onelns>(istr, aggregationType, nrElements);
+                break;
+            case UNIVERSAL_DBNS_TYPE:
+                using onedbns = dbns<8, 3, uint8_t>;
+                restoreData<onedbns>(istr, aggregationType, nrElements);
+                break;
+            default:
+                std::cout << "unknown typeId : " << typeId << '\n';
+            }
+        }
+
 		bool restore(std::istream& istr) {
             uint32_t magic_number;
             istr >> magic_number;
@@ -434,11 +485,6 @@ namespace sw { namespace universal { namespace blas {
                 for (uint32_t i = 0; i < nrParameters; ++i) {
                     istr >> parameter[i];
                 }
-                std::cout << "typeId        : " << typeId << std::endl;
-                std::cout << "nr parameters : " << nrParameters << std::endl;
-                for (uint32_t i = 0; i < nrParameters; ++i) {
-                    std::cout << "parameter[" << i << "] : " << parameter[i] << std::endl;
-                }
                 // read the mandatory comment line
                 std::string aggregationTypeComment;
                 std::string token;
@@ -447,28 +493,10 @@ namespace sw { namespace universal { namespace blas {
                 std::cout << "comment line : " << aggregationTypeComment << std::endl;
                 uint32_t aggregationType, nrElements;
                 istr >> aggregationType >> nrElements;
-//                std::cout << "aggregationType : " << aggregationType << std::endl;
-//                std::cout << "nr of elements  : " << nrElements << std::endl;
-                switch (typeId) {
-                case UNIVERSAL_NATIVE_INT8_TYPE:
-                    create<int8_t>(aggregationType);
-                    break;
-                case UNIVERSAL_NATIVE_FP32_TYPE:
-                    restoreCollection<float>(istr, aggregationType, nrElements);
-                    break;
-                case UNIVERSAL_NATIVE_FP64_TYPE:
-                    restoreCollection<double>(istr, aggregationType, nrElements);
-                    break;
-                case UNIVERSAL_CFLOAT_TYPE:
-                    // todo: is there a good way to synthesize a type from dynamic data?
-                    using onecfloat = cfloat<16, 5, uint16_t, true, false, false>;
-                    restoreCollection<onecfloat>(istr, aggregationType, nrElements);
-                    break;
-                default:
-                    std::cout << "unknown typeId : " << typeId << '\n';
-                }
 
-                // read the next record
+                restoreCollection(istr, typeId, nrParameters, parameter, aggregationType, nrElements);
+
+                // read the typeId of the next record, or the termination token
                 istr >> typeId;
             }
             return true;

diff --git a/include/universal/internal/f2s/f2s.hpp b/include/universal/internal/f2s/f2s.hpp
@@ -388,7 +388,7 @@ namespace sw {
 			std::stringstream s;
 			s << '(' << (v.s() ? "-, " : "+, ");
 			s << to_hex(v.f(), 0, true) << ", ";
-			s << v.e() << '(' << to_hex(v.e(), 16, true) << ')' << ')';
+			s << v.e() << '(' << to_hex(v.e(), true, true) << ')' << ')';
 			return s.str();
 		}
 

diff --git a/linalg/blas/minij.cpp b/linalg/blas/minij.cpp
@@ -27,11 +27,11 @@ void MinIJMatrixTest(size_t N = 5) {
 	auto M = minij<Scalar>(N);
 
 	// normalize the column vectors
-	auto total = sum(M);
+	auto total = sumOfElements(M);
 	std::cout << "Total    : " << total << '\n';
-	auto rowSums = sum(M, 1);
+	auto rowSums = sumOfElements(M, 1);
 	std::cout << "Row sums : " << rowSums << '\n';
-	auto colSums = sum(M, 2);
+	auto colSums = sumOfElements(M, 2);
 	std::cout << "Col sums : " << colSums << '\n';
 }
 

diff --git a/linalg/blas/rand_spectral.cpp b/linalg/blas/rand_spectral.cpp
@@ -40,11 +40,11 @@ try {
 	std::cout << Qbase << '\n';
 
 	// normalize the column vectors
-	auto total = sum(Qbase);
+	auto total = sumOfElements(Qbase); // default is dim = 0
 	std::cout << "Total    : " << total << '\n';
-	auto rowSums = sum(Qbase, 1);
+	auto rowSums = sumOfElements(Qbase, 1);
 	std::cout << "Row sums : " << rowSums << '\n';
-	auto colSums = sum(Qbase, 2);
+	auto colSums = sumOfElements(Qbase, 2);
 	std::cout << "Col sums : " << colSums << '\n';
 
 	normalize(Qbase, 2);  // normalize columns so they are unit length

diff --git a/linalg/data/serialization.cpp b/linalg/data/serialization.cpp
@@ -134,13 +134,28 @@ void TestVectorSerialization() {
 	df.save(std::cout, false);
 }
 
+template<typename Scalar>
+void TestMatrixSerialization() {
+	using namespace sw::universal;
+	sw::universal::blas::matrix<Scalar> m(5,5);
+	gaussian_random(m, 0.0, 0.1);
+	blas::datafile<blas::TextFormat> df;
+	df.add(m);
+	df.save(std::cout, false);  // decimal format
+	std::stringstream s;
+	df.save(s, false);  // decimal format
+	df.clear();
+	df.restore(s);
+	df.save(std::cout, false);
+}
+
 void TestSerialization() {
 	using namespace sw::universal;
 
 	// Create instances of different specialized collections
 	sw::universal::blas::vector<float> xfp32(5), yfp32(5);
 	sw::universal::blas::matrix<float> Afp32(5, 5);
-	sw::universal::blas::tensor<float> Tfp32(5, 5); // TBD
+//	sw::universal::blas::tensor<float> Tfp32(5, 5); // TBD
 	sw::universal::blas::matrix<float> dpfp32(1, 1);
 	gaussian_random(xfp32, 0.0, 0.1);
 	gaussian_random(yfp32, 0.0, 0.1);
@@ -157,7 +172,7 @@ void TestSerialization() {
 
 	// Use the base class reference to aggregate the collections
 	blas::datafile<blas::TextFormat> df;
-	df.add(Tfp32);
+//	df.add(Tfp32);
 	df.add(xfp32);
 	df.add(yfp32);
 	df.add(Afp32);
@@ -216,12 +231,12 @@ try {
 	// ReportNumberSystemFormats();
 
 	//TestVectorSerialization<float>();
-	TestVectorSerialization<half>();
-	return 0;
+	TestMatrixSerialization<float>();
+//	TestVectorSerialization<half>();
 
-	TestSaveTypeId();
+//	TestSaveTypeId();
 
-	TestSerialization();
+	// TestSerialization();
 	return 0;
 
 	unsigned N = 32;