Skip to content

Commit

Permalink
rearchitecting the FDP overloads in BLAS
Browse files Browse the repository at this point in the history
  • Loading branch information
Ravenwater committed Aug 17, 2023
1 parent e68bf50 commit 567744d
Show file tree
Hide file tree
Showing 10 changed files with 135 additions and 93 deletions.
4 changes: 1 addition & 3 deletions applications/engineering/water.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
#include <cmath>
// universal BLAS
#include <universal/number/posit/posit.hpp>
#include <universal/blas/vector.hpp>
#include <universal/blas/matrix.hpp>
#include <universal/blas/solvers/lu.hpp>
#include <universal/blas/blas.hpp>

using std::string;
using std::vector;
Expand Down
3 changes: 1 addition & 2 deletions applications/ode/convergence.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
#include <vector>
#include <algorithm>
#include <universal/number/posit/posit.hpp>
#include <universal/blas/vector.hpp>
#include <universal/blas/linspace.hpp>
#include <universal/blas/blas.hpp>

template<typename Scalar>
Scalar my_ode_func(const Scalar& t, const Scalar& u) {
Expand Down
3 changes: 3 additions & 0 deletions include/universal/blas/blas.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,7 @@ constexpr uint64_t SIZE_512G = 512 * SIZE_1G;
#include <universal/blas/vmath/power.hpp>
#include <universal/blas/vmath/trigonometry.hpp>

// type specific overloads
#include <universal/blas/modifiers/posit_fdp.hpp>

#endif // _UNIVERSAL_BLAS_LIBRARY
48 changes: 3 additions & 45 deletions include/universal/blas/matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
#include <initializer_list>
#include <map>
#include <universal/blas/exceptions.hpp>
#include <universal/number/posit/posit_fwd.hpp>
#include <universal/number/cfloat/cfloat_fwd.hpp>

#if defined(__clang__)
/* Clang/LLVM. ---------------------------------------------- */
Expand Down Expand Up @@ -200,7 +198,8 @@ class matrix {
// selectors
unsigned rows() const { return _m; }
unsigned cols() const { return _n; }
std::pair<unsigned, unsigned> size() const { return std::make_pair(_m, _n); }
// std::pair<unsigned, unsigned> size() const { return std::make_pair(_m, _n); }
unsigned size() const { return data.size(); }

// in-place transpose
matrix& transpose() {
Expand Down Expand Up @@ -275,7 +274,7 @@ inline unsigned num_rows(const matrix<Scalar>& A) { return A.rows(); }
template<typename Scalar>
inline unsigned num_cols(const matrix<Scalar>& A) { return A.cols(); }
template<typename Scalar>
inline std::pair<unsigned, unsigned> size(const matrix<Scalar>& A) { return A.size(); }
inline std::pair<unsigned, unsigned> size(const matrix<Scalar>& A) { return std::make_pair(A.rows(), A.cols()); }

// ostream operator: no need to declare as friend as it only uses public interfaces
template<typename Scalar>
Expand Down Expand Up @@ -357,20 +356,6 @@ vector<Scalar> operator*(const matrix<Scalar>& A, const vector<Scalar>& x) {
return b;
}

// overload for posits to use fused dot products
template<unsigned nbits, unsigned es>
vector< posit<nbits, es> > operator*(const matrix< posit<nbits, es> >& A, const vector< posit<nbits, es> >& x) {
constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements
vector< posit<nbits, es> > b(A.rows());
for (unsigned i = 0; i < A.rows(); ++i) {
quire<nbits, es, capacity> q;
for (unsigned j = 0; j < A.cols(); ++j) {
q += quire_mul(A(i, j), x[j]);
}
convert(q.to_value(), b[i]); // one and only rounding step of the fused-dot product
}
return b;
}

template<typename Scalar>
matrix<Scalar> operator*(const matrix<Scalar>& A, const matrix<Scalar>& B) {
Expand Down Expand Up @@ -410,31 +395,6 @@ matrix<Scalar> operator%(const matrix<Scalar>& A, const matrix<Scalar>& B) {
return C;
}



// overload for posits uses fused dot products
template<unsigned nbits, unsigned es>
matrix< posit<nbits, es> > operator*(const matrix< posit<nbits, es> >& A, const matrix< posit<nbits, es> >& B) {
constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements
if (A.cols() != B.rows()) throw matmul_incompatible_matrices(incompatible_matrices(A.rows(), A.cols(), B.rows(), B.cols(), "*").what());
unsigned rows = A.rows();
unsigned cols = B.cols();
unsigned dots = A.cols();
matrix< posit<nbits, es> > C(rows, cols);
for (unsigned i = 0; i < rows; ++i) {
for (unsigned j = 0; j < cols; ++j) {
quire<nbits, es, capacity> q;
for (unsigned k = 0; k < dots; ++k) {
q += quire_mul(A(i, k), B(k, j));
}
convert(q.to_value(), C(i, j)); // one and only rounding step of the fused-dot product
}
}
return C;
}



// matrix equivalence tests
template<typename Scalar>
bool operator==(const matrix<Scalar>& A, const matrix<Scalar>& B) {
Expand All @@ -453,8 +413,6 @@ bool operator==(const matrix<Scalar>& A, const matrix<Scalar>& B) {
return equal;
}



template<typename Scalar>
bool operator!=(const matrix<Scalar>& A, const matrix<Scalar>& B) {
return !(A == B);
Expand Down
41 changes: 41 additions & 0 deletions include/universal/blas/modifiers/posit_fdp.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#pragma once

namespace sw { namespace universal { namespace blas {

// overload for posits to use fused dot products
template<unsigned nbits, unsigned es>
vector< posit<nbits, es> > operator*(const matrix< posit<nbits, es> >& A, const vector< posit<nbits, es> >& x) {
constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements
vector< posit<nbits, es> > b(A.rows());
for (unsigned i = 0; i < A.rows(); ++i) {
quire<nbits, es, capacity> q;
for (unsigned j = 0; j < A.cols(); ++j) {
q += quire_mul(A(i, j), x[j]);
}
convert(q.to_value(), b[i]); // one and only rounding step of the fused-dot product
}
return b;
}

// overload for posits uses fused dot products
template<unsigned nbits, unsigned es>
matrix< posit<nbits, es> > operator*(const matrix< posit<nbits, es> >& A, const matrix< posit<nbits, es> >& B) {
constexpr unsigned capacity = 20; // FDP for vectors < 1,048,576 elements
if (A.cols() != B.rows()) throw matmul_incompatible_matrices(incompatible_matrices(A.rows(), A.cols(), B.rows(), B.cols(), "*").what());
unsigned rows = A.rows();
unsigned cols = B.cols();
unsigned dots = A.cols();
matrix< posit<nbits, es> > C(rows, cols);
for (unsigned i = 0; i < rows; ++i) {
for (unsigned j = 0; j < cols; ++j) {
quire<nbits, es, capacity> q;
for (unsigned k = 0; k < dots; ++k) {
q += quire_mul(A(i, k), B(k, j));
}
convert(q.to_value(), C(i, j)); // one and only rounding step of the fused-dot product
}
}
return C;
}

}}} // namespace sw::universal::blas
88 changes: 58 additions & 30 deletions include/universal/blas/serialization/datafile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,16 +305,20 @@ namespace sw { namespace universal { namespace blas {
using Scalar = typename CollectionType::value_type;
saveTypeId<Scalar>(ostr);
saveAggregationInfo(ostr);
int i{ 0 };
if (hex) {
for (const auto& item : collection) {
ostr << to_hex(item, false, false) << ' ';
ostr << to_hex(item, false, false);
++i;
if ((i % 10) == 0) ostr << '\n'; else ostr << ' ';
}
std::cout << std::endl;
}
else {
for (const auto& item : collection) {
ostr << item << ' ';
//ostr << to_binary(item) << " : " << to_hex(item) << " : " << item << '\n';
ostr << item;
++i;
if ((i % 10) == 0) ostr << '\n'; else ostr << ' ';
}
std::cout << std::endl;
}
Expand Down Expand Up @@ -396,7 +400,7 @@ namespace sw { namespace universal { namespace blas {
}

template<typename Scalar>
void restoreCollection(std::istream& istr, uint32_t aggregationType, uint32_t nrElements) {
void restoreData(std::istream& istr, uint32_t aggregationType, uint32_t nrElements) {
switch (aggregationType) {
case UNIVERSAL_AGGREGATE_SCALAR:
std::cout << "Creating a scalar\n";
Expand All @@ -416,6 +420,53 @@ namespace sw { namespace universal { namespace blas {
std::cout << "unknown aggregate\n";
}
}
void restoreCollection(std::istream& istr, uint32_t typeId, uint32_t nrParameters, uint32_t* parameter, uint32_t aggregationType, uint32_t nrElements) {
constexpr bool verbose = false; // turn debug tracking on/off
if constexpr (verbose) {
std::cout << "typeId : " << typeId << std::endl;
std::cout << "nr parameters : " << nrParameters << std::endl;
for (uint32_t i = 0; i < nrParameters; ++i) {
std::cout << "parameter[" << i << "] : " << parameter[i] << std::endl;
}
std::cout << "aggregationType : " << aggregationType << std::endl;
std::cout << "nr of elements : " << nrElements << std::endl;
}
switch (typeId) {
case UNIVERSAL_NATIVE_INT32_TYPE:
restoreData<int32_t>(istr, aggregationType, nrElements);
break;
case UNIVERSAL_NATIVE_FP32_TYPE:
restoreData<float>(istr, aggregationType, nrElements);
break;
case UNIVERSAL_NATIVE_FP64_TYPE:
restoreData<double>(istr, aggregationType, nrElements);
break;
case UNIVERSAL_CFLOAT_TYPE:
// TBD: this is ugly: we will need to enumerate all possible
// configurations in explicit template invocations, which will
// be hundreds of templates, and that will be a major compilation
// task. For the datafile restore() this would not have any positive ROI
// but it maybe an interesting interface for a CLI, so I am
// keeping it in here as a discovery note.

// one idea is to serialize just a limited set of cfloats
// like just the small fp8 and fp16 configurations
using onecfloat = cfloat<16, 5, uint16_t, true, false, false>;
restoreData<onecfloat>(istr, aggregationType, nrElements);
break;
case UNIVERSAL_LNS_TYPE:
using onelns = lns<8, 2, uint8_t>;
restoreData<onelns>(istr, aggregationType, nrElements);
break;
case UNIVERSAL_DBNS_TYPE:
using onedbns = dbns<8, 3, uint8_t>;
restoreData<onedbns>(istr, aggregationType, nrElements);
break;
default:
std::cout << "unknown typeId : " << typeId << '\n';
}
}

bool restore(std::istream& istr) {
uint32_t magic_number;
istr >> magic_number;
Expand All @@ -434,11 +485,6 @@ namespace sw { namespace universal { namespace blas {
for (uint32_t i = 0; i < nrParameters; ++i) {
istr >> parameter[i];
}
std::cout << "typeId : " << typeId << std::endl;
std::cout << "nr parameters : " << nrParameters << std::endl;
for (uint32_t i = 0; i < nrParameters; ++i) {
std::cout << "parameter[" << i << "] : " << parameter[i] << std::endl;
}
// read the mandatory comment line
std::string aggregationTypeComment;
std::string token;
Expand All @@ -447,28 +493,10 @@ namespace sw { namespace universal { namespace blas {
std::cout << "comment line : " << aggregationTypeComment << std::endl;
uint32_t aggregationType, nrElements;
istr >> aggregationType >> nrElements;
// std::cout << "aggregationType : " << aggregationType << std::endl;
// std::cout << "nr of elements : " << nrElements << std::endl;
switch (typeId) {
case UNIVERSAL_NATIVE_INT8_TYPE:
create<int8_t>(aggregationType);
break;
case UNIVERSAL_NATIVE_FP32_TYPE:
restoreCollection<float>(istr, aggregationType, nrElements);
break;
case UNIVERSAL_NATIVE_FP64_TYPE:
restoreCollection<double>(istr, aggregationType, nrElements);
break;
case UNIVERSAL_CFLOAT_TYPE:
// todo: is there a good way to synthesize a type from dynamic data?
using onecfloat = cfloat<16, 5, uint16_t, true, false, false>;
restoreCollection<onecfloat>(istr, aggregationType, nrElements);
break;
default:
std::cout << "unknown typeId : " << typeId << '\n';
}

// read the next record
restoreCollection(istr, typeId, nrParameters, parameter, aggregationType, nrElements);

// read the typeId of the next record, or the termination token
istr >> typeId;
}
return true;
Expand Down
2 changes: 1 addition & 1 deletion include/universal/internal/f2s/f2s.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ namespace sw {
std::stringstream s;
s << '(' << (v.s() ? "-, " : "+, ");
s << to_hex(v.f(), 0, true) << ", ";
s << v.e() << '(' << to_hex(v.e(), 16, true) << ')' << ')';
s << v.e() << '(' << to_hex(v.e(), true, true) << ')' << ')';
return s.str();
}

Expand Down
6 changes: 3 additions & 3 deletions linalg/blas/minij.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ void MinIJMatrixTest(size_t N = 5) {
auto M = minij<Scalar>(N);

// normalize the column vectors
auto total = sum(M);
auto total = sumOfElements(M);
std::cout << "Total : " << total << '\n';
auto rowSums = sum(M, 1);
auto rowSums = sumOfElements(M, 1);
std::cout << "Row sums : " << rowSums << '\n';
auto colSums = sum(M, 2);
auto colSums = sumOfElements(M, 2);
std::cout << "Col sums : " << colSums << '\n';
}

Expand Down
6 changes: 3 additions & 3 deletions linalg/blas/rand_spectral.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ try {
std::cout << Qbase << '\n';

// normalize the column vectors
auto total = sum(Qbase);
auto total = sumOfElements(Qbase); // default is dim = 0
std::cout << "Total : " << total << '\n';
auto rowSums = sum(Qbase, 1);
auto rowSums = sumOfElements(Qbase, 1);
std::cout << "Row sums : " << rowSums << '\n';
auto colSums = sum(Qbase, 2);
auto colSums = sumOfElements(Qbase, 2);
std::cout << "Col sums : " << colSums << '\n';

normalize(Qbase, 2); // normalize columns so they are unit length
Expand Down
27 changes: 21 additions & 6 deletions linalg/data/serialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,13 +134,28 @@ void TestVectorSerialization() {
df.save(std::cout, false);
}

template<typename Scalar>
void TestMatrixSerialization() {
using namespace sw::universal;
sw::universal::blas::matrix<Scalar> m(5,5);
gaussian_random(m, 0.0, 0.1);
blas::datafile<blas::TextFormat> df;
df.add(m);
df.save(std::cout, false); // decimal format
std::stringstream s;
df.save(s, false); // decimal format
df.clear();
df.restore(s);
df.save(std::cout, false);
}

void TestSerialization() {
using namespace sw::universal;

// Create instances of different specialized collections
sw::universal::blas::vector<float> xfp32(5), yfp32(5);
sw::universal::blas::matrix<float> Afp32(5, 5);
sw::universal::blas::tensor<float> Tfp32(5, 5); // TBD
// sw::universal::blas::tensor<float> Tfp32(5, 5); // TBD
sw::universal::blas::matrix<float> dpfp32(1, 1);
gaussian_random(xfp32, 0.0, 0.1);
gaussian_random(yfp32, 0.0, 0.1);
Expand All @@ -157,7 +172,7 @@ void TestSerialization() {

// Use the base class reference to aggregate the collections
blas::datafile<blas::TextFormat> df;
df.add(Tfp32);
// df.add(Tfp32);
df.add(xfp32);
df.add(yfp32);
df.add(Afp32);
Expand Down Expand Up @@ -216,12 +231,12 @@ try {
// ReportNumberSystemFormats();

//TestVectorSerialization<float>();
TestVectorSerialization<half>();
return 0;
TestMatrixSerialization<float>();
// TestVectorSerialization<half>();

TestSaveTypeId();
// TestSaveTypeId();

TestSerialization();
// TestSerialization();
return 0;

unsigned N = 32;
Expand Down

0 comments on commit 567744d

Please sign in to comment.