-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Idx variants #160
Idx variants #160
Changes from 8 commits
59d7500
a388329
7eac3cc
8400abc
9342283
e725f8c
ae6817d
978386c
cb34642
0cfe9e6
2ca3adf
a52b015
ef57f2f
8ee6d25
dcfd320
27c0574
6d6be9f
8f9aca6
8b07ef4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -685,6 +685,169 @@ namespace distopia { | |
} | ||
} | ||
|
||
template <typename V> | ||
HWY_INLINE void LoadInterleavedIdx(const unsigned int *idx, const float *src, | ||
V &x_dst, V &y_dst, V &z_dst) { | ||
hn::ScalableTag<float> d; | ||
hn::ScalableTag<int> d_int; | ||
auto vidx = hn::LoadU(d_int, (int*)idx); | ||
auto Vthree = hn::Set(d_int, 3); | ||
auto Vone = hn::Set(d_int, 1); | ||
// convert indices to 3D coordinate indices, i.e. index 10 at pointer 30 etc | ||
vidx = hn::Mul(vidx, Vthree); | ||
x_dst = hn::GatherIndex(d, src, vidx); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This isn't actually that good, I imagine for performance you instead would want to do loads of xyz vectors (with junk) then a transpose, but you'd have to write this for all vector widths, at which point maybe you'd be better off doing a contribution upstream ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah this is still clever, prepopulating the indices for a gather. |
||
vidx = hn::Add(vidx, Vone); | ||
y_dst = hn::GatherIndex(d, src, vidx); | ||
vidx = hn::Add(vidx, Vone); | ||
z_dst = hn::GatherIndex(d, src, vidx); | ||
} | ||
|
||
template <typename V> | ||
HWY_INLINE void LoadInterleavedIdx(const unsigned int *idx, const double *src, | ||
V &x_dst, V &y_dst, V &z_dst) { | ||
hn::ScalableTag<double> d; | ||
hn::ScalableTag<int64_t> d_long; | ||
// "int64_t" number of lanes but "int" type | ||
hn::Rebind<int, hn::ScalableTag<int64_t>> d_int; | ||
|
||
// can't call Gather to doubles with int indices so: load int32s and cast them up to int64s so widths match | ||
// first load NLONG values from idx | ||
// !! important to not segfault here by loading LONG*2 (i.e. a full vector of 32s) !! | ||
auto vidx_narrow = hn::LoadU(d_int, (int*)idx); | ||
// then cast int32 values to int64, taking only lower half of vector | ||
auto vidx = hn::PromoteTo(d_long, vidx_narrow); | ||
auto Vthree = hn::Set(d_long, 3); | ||
auto Vone = hn::Set(d_long, 1); | ||
|
||
vidx = hn::Mul(vidx, Vthree); | ||
x_dst = hn::GatherIndex(d, src, vidx); | ||
vidx = hn::Add(vidx, Vone); | ||
y_dst = hn::GatherIndex(d, src, vidx); | ||
vidx = hn::Add(vidx, Vone); | ||
z_dst = hn::GatherIndex(d, src, vidx); | ||
} | ||
|
||
template <typename T, typename B> | ||
void CalcBondsIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, int n, | ||
T *out, const B &box) { | ||
const hn::ScalableTag<T> d; | ||
int nlanes = hn::Lanes(d); | ||
|
||
unsigned int a_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)]; | ||
unsigned int b_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)]; | ||
T out_sub[HWY_MAX_LANES_D(hn::ScalableTag<T>)]; | ||
T *dst; | ||
|
||
if (HWY_UNLIKELY(n < nlanes)) { | ||
// zero out idx arrays, so we can safely load 0th values when we spill | ||
memset(a_sub, 0, sizeof(int) * nlanes); | ||
memset(b_sub, 0, sizeof(int) * nlanes); | ||
memcpy(a_sub, a_idx, sizeof(int) * n); | ||
memcpy(b_sub, b_idx, sizeof(int) * n); | ||
// switch to use a_sub as index array now we've copied contents | ||
a_idx = a_sub; | ||
b_idx = b_sub; | ||
dst = out_sub; | ||
} else { | ||
dst = out; | ||
} | ||
|
||
auto a_x = hn::Undefined(d); | ||
auto a_y = hn::Undefined(d); | ||
auto a_z = hn::Undefined(d); | ||
auto b_x = hn::Undefined(d); | ||
auto b_y = hn::Undefined(d); | ||
auto b_z = hn::Undefined(d); | ||
|
||
for (size_t i=0; i <= n - nlanes; i += nlanes) { | ||
// load N indices of each source | ||
// interleaved gather these indices | ||
LoadInterleavedIdx(a_idx + i, coords, a_x, a_y, a_z); | ||
LoadInterleavedIdx(b_idx + i, coords, b_x, b_y, b_z); | ||
|
||
auto result = box.Distance(a_x, a_y, a_z, b_x, b_y, b_z); | ||
|
||
hn::StoreU(result, d, dst + i); | ||
} | ||
size_t rem = n % nlanes; | ||
if (rem) { // if we had a non-multiple of nlanes, do final nlanes values again | ||
LoadInterleavedIdx(a_idx + n - nlanes, coords, a_x, a_y, a_z); | ||
LoadInterleavedIdx(b_idx + n - nlanes, coords, b_x, b_y, b_z); | ||
|
||
auto result = box.Distance(a_x, a_y, a_z, b_x, b_y, b_z); | ||
|
||
hn::StoreU(result, d, dst + n - nlanes); | ||
} | ||
|
||
if (HWY_UNLIKELY(n < nlanes)) { | ||
// copy results back from temp array into actual output array | ||
memcpy(out, out_sub, n * sizeof(T)); | ||
} | ||
} | ||
|
||
template <typename T, typename B> | ||
void CalcAnglesIdx(const T *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, int n, | ||
T *out, const B &box) { | ||
// Logic here closely follows BondsIdx, except with an additional coordinate | ||
const hn::ScalableTag<T> d; | ||
int nlanes = hn::Lanes(d); | ||
|
||
unsigned int a_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)]; | ||
unsigned int b_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)]; | ||
unsigned int c_sub[3 * HWY_MAX_LANES_D(hn::ScalableTag<T>)]; | ||
T out_sub[HWY_MAX_LANES_D(hn::ScalableTag<T>)]; | ||
T *dst; | ||
|
||
if (HWY_UNLIKELY(n < nlanes)) { | ||
memset(a_sub, 0, sizeof(int) * nlanes); | ||
memset(b_sub, 0, sizeof(int) * nlanes); | ||
memset(c_sub, 0, sizeof(int) * nlanes); | ||
memcpy(a_sub, a_idx, sizeof(int) * n); | ||
memcpy(b_sub, b_idx, sizeof(int) * n); | ||
memcpy(c_sub, c_idx, sizeof(int) * n); | ||
a_idx = a_sub; | ||
b_idx = b_sub; | ||
c_idx = c_sub; | ||
dst = out_sub; | ||
} else { | ||
dst = out; | ||
} | ||
|
||
auto a_x = hn::Undefined(d); | ||
auto a_y = hn::Undefined(d); | ||
auto a_z = hn::Undefined(d); | ||
auto b_x = hn::Undefined(d); | ||
auto b_y = hn::Undefined(d); | ||
auto b_z = hn::Undefined(d); | ||
auto c_x = hn::Undefined(d); | ||
auto c_y = hn::Undefined(d); | ||
auto c_z = hn::Undefined(d); | ||
|
||
for (size_t i=0; i <= n - nlanes; i += nlanes) { | ||
LoadInterleavedIdx(a_idx + i, coords, a_x, a_y, a_z); | ||
LoadInterleavedIdx(b_idx + i, coords, b_x, b_y, b_z); | ||
LoadInterleavedIdx(c_idx + i, coords, c_x, c_y, c_z); | ||
|
||
auto result = Angle(a_x, a_y, a_z, b_x, b_y, b_z, c_x, c_y, c_z, box); | ||
|
||
hn::StoreU(result, d, dst + i); | ||
} | ||
size_t rem = n % nlanes; | ||
if (rem) { | ||
LoadInterleavedIdx(a_idx + n - nlanes, coords, a_x, a_y, a_z); | ||
LoadInterleavedIdx(b_idx + n - nlanes, coords, b_x, b_y, b_z); | ||
LoadInterleavedIdx(c_idx + n - nlanes, coords, c_x, c_y, c_z); | ||
|
||
auto result = Angle(a_x, a_y, a_z, b_x, b_y, b_z, c_x, c_y, c_z, box); | ||
|
||
hn::StoreU(result, d, dst + n - nlanes); | ||
} | ||
|
||
if (HWY_UNLIKELY(n < nlanes)) { | ||
memcpy(out, out_sub, n * sizeof(T)); | ||
} | ||
} | ||
|
||
void CalcBondsNoBoxDouble(const double *a, const double *b, int n, double *out) { | ||
hn::ScalableTag<double> d; | ||
const NoBox vbox(d); | ||
|
@@ -852,6 +1015,88 @@ namespace distopia { | |
const TriclinicBox vbox(d, box); | ||
CalcSelfDistanceArray(a, n, out, vbox); | ||
} | ||
void CalcBondsNoBoxIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, float *out) { | ||
hn::ScalableTag<float> d; | ||
const NoBox box(d); | ||
CalcBondsIdx(coords, a_idx, b_idx, n, out, box); | ||
} | ||
void CalcBondsNoBoxIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, double *out) { | ||
hn::ScalableTag<double> d; | ||
const NoBox box(d); | ||
CalcBondsIdx(coords, a_idx, b_idx, n, out, box); | ||
} | ||
void CalcBondsOrthoIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, const float *box, float *out) { | ||
hn::ScalableTag<float> d; | ||
const OrthogonalBox vbox(d, box); | ||
|
||
CalcBondsIdx(coords, a_idx, b_idx, n, out, vbox); | ||
} | ||
void CalcBondsOrthoIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, const double *box, double *out) { | ||
hn::ScalableTag<double> d; | ||
const OrthogonalBox vbox(d, box); | ||
|
||
CalcBondsIdx(coords, a_idx, b_idx, n, out, vbox); | ||
} | ||
void CalcBondsTriclinicIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, const float *box, float *out) { | ||
hn::ScalableTag<float> d; | ||
const TriclinicBox vbox(d, box); | ||
|
||
CalcBondsIdx(coords, a_idx, b_idx, n, out, vbox); | ||
} | ||
void CalcBondsTriclinicIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, const double *box, double *out) { | ||
hn::ScalableTag<double> d; | ||
const TriclinicBox vbox(d, box); | ||
|
||
CalcBondsIdx(coords, a_idx, b_idx, n, out, vbox); | ||
} | ||
void CalcAnglesNoBoxIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, float *out) { | ||
hn::ScalableTag<float> d; | ||
const NoBox vbox(d); | ||
|
||
CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox); | ||
} | ||
void CalcAnglesNoBoxIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, double *out) { | ||
hn::ScalableTag<double> d; | ||
const NoBox vbox(d); | ||
|
||
CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox); | ||
} | ||
void CalcAnglesOrthoIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, const float *box, float *out) { | ||
hn::ScalableTag<float> d; | ||
const OrthogonalBox vbox(d, box); | ||
|
||
CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox); | ||
} | ||
void CalcAnglesOrthoIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, const double *box, double *out) { | ||
hn::ScalableTag<double> d; | ||
const OrthogonalBox vbox(d, box); | ||
|
||
CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox); | ||
} | ||
void CalcAnglesTriclinicIdxSingle(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, const float *box, float *out) { | ||
hn::ScalableTag<float> d; | ||
const TriclinicBox vbox(d, box); | ||
|
||
CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox); | ||
} | ||
void CalcAnglesTriclinicIdxDouble(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, const double *box, double *out) { | ||
hn::ScalableTag<double> d; | ||
const TriclinicBox vbox(d, box); | ||
|
||
CalcAnglesIdx(coords, a_idx, b_idx, c_idx, n, out, vbox); | ||
} | ||
|
||
int GetNFloatLanes() { | ||
hn::ScalableTag<float> d; | ||
|
@@ -901,6 +1146,18 @@ namespace distopia { | |
HWY_EXPORT(CalcSelfDistanceArrayOrthoSingle); | ||
HWY_EXPORT(CalcSelfDistanceArrayTriclinicDouble); | ||
HWY_EXPORT(CalcSelfDistanceArrayTriclinicSingle); | ||
HWY_EXPORT(CalcBondsNoBoxIdxSingle); | ||
HWY_EXPORT(CalcBondsNoBoxIdxDouble); | ||
HWY_EXPORT(CalcBondsOrthoIdxSingle); | ||
HWY_EXPORT(CalcBondsOrthoIdxDouble); | ||
HWY_EXPORT(CalcBondsTriclinicIdxSingle); | ||
HWY_EXPORT(CalcBondsTriclinicIdxDouble); | ||
HWY_EXPORT(CalcAnglesNoBoxIdxSingle); | ||
HWY_EXPORT(CalcAnglesNoBoxIdxDouble); | ||
HWY_EXPORT(CalcAnglesOrthoIdxSingle); | ||
HWY_EXPORT(CalcAnglesOrthoIdxDouble); | ||
HWY_EXPORT(CalcAnglesTriclinicIdxSingle); | ||
HWY_EXPORT(CalcAnglesTriclinicIdxDouble); | ||
HWY_EXPORT(GetNFloatLanes); | ||
HWY_EXPORT(GetNDoubleLanes); | ||
|
||
|
@@ -1038,8 +1295,56 @@ namespace distopia { | |
} | ||
return HWY_DYNAMIC_DISPATCH(CalcSelfDistanceArrayTriclinicDouble)(a, n, box, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcBondsNoBoxIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, float *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcBondsNoBoxIdxSingle)(coords, a_idx, b_idx, n, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcBondsNoBoxIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, double *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcBondsNoBoxIdxDouble)(coords, a_idx, b_idx, n, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcBondsOrthoIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, const float *box, float *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcBondsOrthoIdxSingle)(coords, a_idx, b_idx, n, box, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcBondsOrthoIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, const double *box, double *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcBondsOrthoIdxDouble)(coords, a_idx, b_idx, n, box, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcBondsTriclinicIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, const float *box, float *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcBondsTriclinicIdxSingle)(coords, a_idx, b_idx, n, box, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcBondsTriclinicIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, | ||
int n, const double *box, double *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcBondsTriclinicIdxDouble)(coords, a_idx, b_idx, n, box, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcAnglesNoBoxIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, float *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcAnglesNoBoxIdxSingle)(coords, a_idx, b_idx, c_idx, n, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcAnglesNoBoxIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, double *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcAnglesNoBoxIdxDouble)(coords, a_idx, b_idx, c_idx, n, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcAnglesOrthoIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, const float *box, float *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcAnglesOrthoIdxSingle)(coords, a_idx, b_idx, c_idx, n, box, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcAnglesOrthoIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, const double *box, double *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcAnglesOrthoIdxDouble)(coords, a_idx, b_idx, c_idx, n, box, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcAnglesTriclinicIdx(const float *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, const float *box, float *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcAnglesTriclinicIdxSingle)(coords, a_idx, b_idx, c_idx, n, box, out); | ||
} | ||
HWY_DLLEXPORT template <> void CalcAnglesTriclinicIdx(const double *coords, const unsigned int *a_idx, const unsigned int *b_idx, const unsigned int *c_idx, | ||
int n, const double *box, double *out) { | ||
return HWY_DYNAMIC_DISPATCH(CalcAnglesTriclinicIdxDouble)(coords, a_idx, b_idx, c_idx, n, box, out); | ||
} | ||
|
||
std::vector<std::string> DistopiaSupportedAndGeneratedTargets() { | ||
std::vector<std::string> DistopiaSupportedAndGeneratedTargets() { | ||
std::vector<int64_t> targets = hwy::SupportedAndGeneratedTargets(); | ||
// for each print the name | ||
std::vector<std::string> names; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@hmacdope this is the interesting function that does the IDX gather here.
In general, the width of the integer part of the
GatherIndex
needs to match the width of the floating point type, so for the current signature ofint
indices, the float case is simple and the double case requires an upcase (see below).If we wanted to have int64 indices in the signature for
Idx
functions, we'd probably need two more specialisations to this function, which isn't too much of a drama. The int64 and float case is a little odd as arguably the ints could overflow, so maybe you couldn't vectorise this as hard... (using current highway bindings at least).